Skip to content
Snippets Groups Projects
Commit 30b89988 authored by Nischol Antao's avatar Nischol Antao
Browse files

Commented out lines of code that were not used when determining performance...

Commented out lines of code that were not used when determining performance metrics for questions 3b and 4
parent 84ced50e
No related branches found
No related tags found
No related merge requests found
# Uncomment the line below when running in Zeppelin notebooks
#%spark2.pyspark
# Import the libraries we will need
import time
...@@ -19,12 +20,12 @@ masterData = sqlContext.read.format('com.databricks.spark.csv').options(header=' ...@@ -19,12 +20,12 @@ masterData = sqlContext.read.format('com.databricks.spark.csv').options(header='
# Load the Pitching.csv dataset from S3 via the spark-csv reader; the file has
# a header row and the column types are inferred from the data.
pitchingData = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('s3n://cs498ccafinalproject/Pitching.csv')
# Schema dumps disabled so they do not skew the benchmark timings.
#masterData.printSchema()
#pitchingData.printSchema()
# Start the benchmark timer (wall-clock seconds since the epoch).
startTime = time.time()
#print('Starting benchmark at: {}'.format(startTime))
# Left-join pitching stats onto the player master data on playerID, keeping
# every pitching row even when no master record matches.
questionData = pitchingData.join(masterData, masterData.playerID == pitchingData.playerID, 'left')
...@@ -42,13 +43,28 @@ sqlDF = spark.sql('select yearID, throws, avg(ERA) as ERA from questionData grou ...@@ -42,13 +43,28 @@ sqlDF = spark.sql('select yearID, throws, avg(ERA) as ERA from questionData grou
# Drop rows whose 'throws' value is null (pitchers with unknown handedness)
# before the results are consumed downstream.
sqlDF = sqlDF.na.drop(subset=["throws"])
# Result display disabled so it does not skew the benchmark timings.
#sqlDF.show()
# Query: average opponents' batting average (BAOpp) per year, grouped by
# pitcher throwing hand, ordered chronologically.
sqlDF2 = spark.sql('select yearID, throws, avg(BAOpp) as BAOpp from questionData group by yearID, throws order by yearID asc')
# Drop rows whose 'throws' value is null (pitchers with unknown handedness).
sqlDF2 = sqlDF2.na.drop(subset=["throws"])
# Drop any row containing a null in ANY column (this also subsumes the
# subset drop above). NOTE(review): display of results is disabled below
# so it does not skew the benchmark timings.
sqlDF2 = sqlDF2.na.drop()
#sqlDF2.show()
# Collect both Spark DataFrames to the driver as pandas DataFrames so they
# can be exported with pandas to_csv later.
pandas_sqlDF = sqlDF.toPandas()
pandas_sqlDF2 = sqlDF2.toPandas()
# Export to CSV — disabled while benchmarking; uncomment to write results.
# NOTE(review): the second export presumably should write pandas_sqlDF2 (the
# BAOpp frame) and carry a .csv extension — confirm before re-enabling.
#pandas_sqlDF.to_csv('spark_question4_ERA_right_vs_lefty_pitchers.csv')
#pandas_sqlDF2.to_csv('spark_question4_BAOpp_right_vs_lefty_pitchers.csv')
# End the benchmark timer and report total processing time.
endTime = time.time()
#print('Ending benchmark at: {}'.format(endTime))
totalTime = endTime - startTime
# Parenthesized print yields identical output in Python 2 and 3 for a
# single argument.
print('Total processing time: {}'.format(totalTime))
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment