Skip to content
Snippets Groups Projects
Pandas_Quest_1_Player_Country_Origins.py 2.63 KiB
#-------------------------------------------------------------------------------
# Name:        Q1_Player_Country_Origins
# Purpose: Find Which Countries produce the most major league baseball players
#
# Author:      antaonn
#
# Created:     16/04/2018
# Copyright:   (c) antaonn 2018
# Licence:     <your licence>
#-------------------------------------------------------------------------------
import time
starttime = time.time()

import os.path
import pandas as pd

import csv



# This is only needed for the Local Folder Structure in the project
my_path = os.path.abspath(os.path.dirname(__file__))

masterpath = os.path.join(my_path, "../data/Master.csv")
fieldingpath = os.path.join(my_path, "../data/Fielding.csv")

# Create Data Frames for Salary Table and Team Table
df_master = pd.read_csv(masterpath)
df_field = pd.read_csv(fieldingpath)

# From the Fielding Database extract a single entry for a player for a given year

df_field_filt = df_field.filter(items=['playerID', 'yearID']).drop_duplicates()
#print df_field_filt

# For Fun find the most frequent and least frequent birthdays of all MLB Players
# From the Master Player Database extract the country of birth for every player

##df_master_filt = df_master.filter(items=['playerID', 'birthCountry', 'birthMonth', 'birthDay'])
##df_master_filt.groupby(['birthMonth', 'birthDay']).describe().to_csv('Birthday.csv')

# From the Master Player Database extract the country of birth for every player
df_master_filt = df_master.filter(items=['playerID', 'birthCountry'])
#print df_master_filt

# Merge Databases so we get country of origin

merge_master_field = df_field_filt.merge(df_master_filt).sort_values('yearID')
#print merge

merge_master_field_filt = merge_master_field.filter(items=['yearID', 'birthCountry'])

yearlist = merge_master_field_filt.yearID.unique()



with open('Baseball_Global_Representation_By_Year.csv', 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter=',',quotechar='|', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["Year", "Country", "Player Count"])


    for year in yearlist:
        merge_master_field_filt_year = merge_master_field_filt[(merge_master_field_filt.yearID == year)]
        yearlycountrylist = merge_master_field_filt_year.birthCountry.unique()
        for country in yearlycountrylist:
            if isinstance(country, basestring):
            #print (year, country, merge_master_field_filt_year[(merge_master_field_filt_year.birthCountry == country)]['birthCountry'].count())
                writer.writerow([year, country, merge_master_field_filt_year[(merge_master_field_filt_year.birthCountry == country)]['birthCountry'].count()])

print (time.time()-starttime)