Skip to content
Snippets Groups Projects
Commit 30b89988 authored by Nischol Antao's avatar Nischol Antao
Browse files

Commented out lines of code that were not used when determining performance...

Commented out lines of code that were not used when determining performance metrics for questions 3b and 4
parent 84ced50e
No related branches found
No related tags found
No related merge requests found
# Uncomment the line below when running in Zeppelin notebooks
#%spark2.pyspark
# Import the libraries we will need
import time
...@@ -19,12 +20,12 @@ masterData = sqlContext.read.format('com.databricks.spark.csv').options(header=' ...@@ -19,12 +20,12 @@ masterData = sqlContext.read.format('com.databricks.spark.csv').options(header='
# Load the Pitching.csv dataset from S3 via the spark-csv reader; the file has
# a header row and the column types are inferred from the data.
pitchingData = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('s3n://cs498ccafinalproject/Pitching.csv')
# Schema dumps disabled so they do not skew the benchmark timings.
#masterData.printSchema()
#pitchingData.printSchema()
# Start the benchmark timer (wall-clock seconds since the epoch).
startTime = time.time()
#print('Starting benchmark at: {}'.format(startTime))
# Left-join pitching stats onto the player master data on playerID, keeping
# every pitching row even when no master record matches.
questionData = pitchingData.join(masterData, masterData.playerID == pitchingData.playerID, 'left')
...@@ -42,13 +43,28 @@ sqlDF = spark.sql('select yearID, throws, avg(ERA) as ERA from questionData grou ...@@ -42,13 +43,28 @@ sqlDF = spark.sql('select yearID, throws, avg(ERA) as ERA from questionData grou
# Drop rows whose 'throws' value is null (pitchers with unknown handedness)
# before the results are consumed downstream.
sqlDF = sqlDF.na.drop(subset=["throws"])
# Result display disabled so it does not skew the benchmark timings.
#sqlDF.show()
# Query: average opponents' batting average (BAOpp) per year, grouped by
# pitcher throwing hand, ordered chronologically.
sqlDF2 = spark.sql('select yearID, throws, avg(BAOpp) as BAOpp from questionData group by yearID, throws order by yearID asc')
# Drop rows whose 'throws' value is null (pitchers with unknown handedness).
sqlDF2 = sqlDF2.na.drop(subset=["throws"])
# Drop any row containing a null in ANY column (this also subsumes the
# subset drop above). NOTE(review): display of results is disabled below
# so it does not skew the benchmark timings.
sqlDF2 = sqlDF2.na.drop()
#sqlDF2.show()
# Collect both Spark DataFrames to the driver as pandas DataFrames so they
# can be exported with pandas to_csv later.
pandas_sqlDF = sqlDF.toPandas()
pandas_sqlDF2 = sqlDF2.toPandas()
# Export to CSV — disabled while benchmarking; uncomment to write results.
# NOTE(review): the second export presumably should write pandas_sqlDF2 (the
# BAOpp frame) and carry a .csv extension — confirm before re-enabling.
#pandas_sqlDF.to_csv('spark_question4_ERA_right_vs_lefty_pitchers.csv')
#pandas_sqlDF2.to_csv('spark_question4_BAOpp_right_vs_lefty_pitchers.csv')
# End the benchmark timer and report total processing time.
endTime = time.time()
#print('Ending benchmark at: {}'.format(endTime))
totalTime = endTime - startTime
# Parenthesized print yields identical output in Python 2 and 3 for a
# single argument.
print('Total processing time: {}'.format(totalTime))
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment