-
Nischol Antao authored
Modified local programs to output csv data. Measured the amount of time each query took on a local pc. Started a document for Challenges_Encountered
Nischol Antao authoredModified local programs to output csv data. Measured the amount of time each query took on a local pc. Started a document for Challenges_Encountered
Pandas_Quest_1_Player_Country_Origins.py 2.63 KiB
#-------------------------------------------------------------------------------
# Name: Q1_Player_Country_Origins
# Purpose: Find Which Countries produce the most major league baseball players
#
# Author: antaonn
#
# Created: 16/04/2018
# Copyright: (c) antaonn 2018
# Licence: <your licence>
#-------------------------------------------------------------------------------
import time
starttime = time.time()
import os.path
import pandas as pd
import csv
# This is only needed for the Local Folder Structure in the project
my_path = os.path.abspath(os.path.dirname(__file__))
masterpath = os.path.join(my_path, "../data/Master.csv")
fieldingpath = os.path.join(my_path, "../data/Fielding.csv")
# Create Data Frames for Salary Table and Team Table
df_master = pd.read_csv(masterpath)
df_field = pd.read_csv(fieldingpath)
# From the Fielding Database extract a single entry for a player for a given year
df_field_filt = df_field.filter(items=['playerID', 'yearID']).drop_duplicates()
#print df_field_filt
# For Fun find the most frequent and least frequent birthdays of all MLB Players
# From the Master Player Database extract the country of birth for every player
##df_master_filt = df_master.filter(items=['playerID', 'birthCountry', 'birthMonth', 'birthDay'])
##df_master_filt.groupby(['birthMonth', 'birthDay']).describe().to_csv('Birthday.csv')
# From the Master Player Database extract the country of birth for every player
df_master_filt = df_master.filter(items=['playerID', 'birthCountry'])
#print df_master_filt
# Merge Databases so we get country of origin
merge_master_field = df_field_filt.merge(df_master_filt).sort_values('yearID')
#print merge
merge_master_field_filt = merge_master_field.filter(items=['yearID', 'birthCountry'])
yearlist = merge_master_field_filt.yearID.unique()
with open('Baseball_Global_Representation_By_Year.csv', 'wb') as csvfile:
writer = csv.writer(csvfile, delimiter=',',quotechar='|', quoting=csv.QUOTE_MINIMAL)
writer.writerow(["Year", "Country", "Player Count"])
for year in yearlist:
merge_master_field_filt_year = merge_master_field_filt[(merge_master_field_filt.yearID == year)]
yearlycountrylist = merge_master_field_filt_year.birthCountry.unique()
for country in yearlycountrylist:
if isinstance(country, basestring):
#print (year, country, merge_master_field_filt_year[(merge_master_field_filt_year.birthCountry == country)]['birthCountry'].count())
writer.writerow([year, country, merge_master_field_filt_year[(merge_master_field_filt_year.birthCountry == country)]['birthCountry'].count()])
print (time.time()-starttime)