From c1a186936463141c77f964017da24ed1b92fc1ea Mon Sep 17 00:00:00 2001 From: Nischol Antao <antao_nischol@cat.com> Date: Sat, 21 Apr 2018 11:32:01 -0500 Subject: [PATCH] Final Source code for Question 1, and associated iPython Notebooks. Will include a separate Notebook for Visualizing question 1, because matplotlib is having issues in the version of Anaconda on the EC2 instance. --- notebooks/Question1.html | 170 +++++++++++++++++------ notebooks/Question1.ipynb | 160 +++++++++++++++------ notebooks/Question1.md | 106 ++++++++++---- src/question_1_pyspark.py | 284 ++++++++++++++++++++++++++++++++++---- 4 files changed, 581 insertions(+), 139 deletions(-) diff --git a/notebooks/Question1.html b/notebooks/Question1.html index d09a640..eb2d1c7 100644 --- a/notebooks/Question1.html +++ b/notebooks/Question1.html @@ -1,7 +1,7 @@ <!DOCTYPE html> <html> <head><meta charset="utf-8" /> -<title>SparkTest</title> +<title>Question_1</title> <script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.1.10/require.min.js"></script> <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.3/jquery.min.js"></script> @@ -11958,7 +11958,7 @@ div#notebook { </div> <div class="cell border-box-sizing code_cell rendered"> <div class="input"> -<div class="prompt input_prompt">In [585]:</div> +<div class="prompt input_prompt">In [1]:</div> <div class="inner_cell"> <div class="input_area"> <div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># Import SparkContext. This is the main entry point for Spark functionality</span> @@ -11990,7 +11990,7 @@ div#notebook { </div> <div class="cell border-box-sizing code_cell rendered"> <div class="input"> -<div class="prompt input_prompt">In [586]:</div> +<div class="prompt input_prompt">In [2]:</div> <div class="inner_cell"> <div class="input_area"> <div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># The Master will need to change when running on a cluster. 
</span> @@ -12007,7 +12007,7 @@ div#notebook { </div> <div class="cell border-box-sizing code_cell rendered"> <div class="input"> -<div class="prompt input_prompt">In [587]:</div> +<div class="prompt input_prompt">In [3]:</div> <div class="inner_cell"> <div class="input_area"> <div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># We instantiate a SparkContext object with the SparkConfig</span> @@ -12033,7 +12033,7 @@ div#notebook { </div> <div class="cell border-box-sizing code_cell rendered"> <div class="input"> -<div class="prompt input_prompt">In [588]:</div> +<div class="prompt input_prompt">In [4]:</div> <div class="inner_cell"> <div class="input_area"> <div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># We create a sql context object, so that we can read in csv files easily, and create a data frame</span> @@ -12067,7 +12067,7 @@ div#notebook { </div> <div class="cell border-box-sizing code_cell rendered"> <div class="input"> -<div class="prompt input_prompt">In [589]:</div> +<div class="prompt input_prompt">In [5]:</div> <div class="inner_cell"> <div class="input_area"> <div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># Join the two tables, and filter the colums we need. 
</span> @@ -12149,7 +12149,7 @@ only showing top 20 rows </div> <div class="cell border-box-sizing code_cell rendered"> <div class="input"> -<div class="prompt input_prompt">In [590]:</div> +<div class="prompt input_prompt">In [6]:</div> <div class="inner_cell"> <div class="input_area"> <div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># Additional Examples showing how to get additional statistics</span> @@ -12167,7 +12167,7 @@ only showing top 20 rows <div class="output"> -<div class="output_area"><div class="prompt output_prompt">Out[590]:</div> +<div class="output_area"><div class="prompt output_prompt">Out[6]:</div> <div class="output_text output_subarea output_execute_result"> @@ -12182,7 +12182,7 @@ only showing top 20 rows </div> <div class="cell border-box-sizing code_cell rendered"> <div class="input"> -<div class="prompt input_prompt">In [591]:</div> +<div class="prompt input_prompt">In [7]:</div> <div class="inner_cell"> <div class="input_area"> <div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># Additional Examples showing how to get additional statistics</span> @@ -12200,7 +12200,7 @@ only showing top 20 rows <div class="output"> -<div class="output_area"><div class="prompt output_prompt">Out[591]:</div> +<div class="output_area"><div class="prompt output_prompt">Out[7]:</div> <div class="output_text output_subarea output_execute_result"> @@ -12215,7 +12215,7 @@ only showing top 20 rows </div> <div class="cell border-box-sizing code_cell rendered"> <div class="input"> -<div class="prompt input_prompt">In [592]:</div> +<div class="prompt input_prompt">In [8]:</div> <div class="inner_cell"> <div class="input_area"> <div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># Additional Examples showing how to get additional statistics</span> @@ -12233,7 +12233,7 @@ only showing top 20 rows <div class="output"> -<div class="output_area"><div class="prompt output_prompt">Out[592]:</div> +<div 
class="output_area"><div class="prompt output_prompt">Out[8]:</div> <div class="output_text output_subarea output_execute_result"> @@ -12248,7 +12248,7 @@ only showing top 20 rows </div> <div class="cell border-box-sizing code_cell rendered"> <div class="input"> -<div class="prompt input_prompt">In [593]:</div> +<div class="prompt input_prompt">In [9]:</div> <div class="inner_cell"> <div class="input_area"> <div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># Additional Examples showing how to get additional statistics</span> @@ -12265,7 +12265,7 @@ only showing top 20 rows <div class="output"> -<div class="output_area"><div class="prompt output_prompt">Out[593]:</div> +<div class="output_area"><div class="prompt output_prompt">Out[9]:</div> <div class="output_text output_subarea output_execute_result"> @@ -12292,7 +12292,7 @@ only showing top 20 rows </div> <div class="cell border-box-sizing code_cell rendered"> <div class="input"> -<div class="prompt input_prompt">In [594]:</div> +<div class="prompt input_prompt">In [10]:</div> <div class="inner_cell"> <div class="input_area"> <div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># Additional Examples showing how to get additional statistics</span> @@ -12303,14 +12303,21 @@ only showing top 20 rows <span class="c1"># Percentage Increase - Germany (300%) from 1 to 4. [Not Statistically significant] </span> <span class="c1"># Percentage Decrease - Aruba (-67%) from 3 to 1. 
[Not Statistically significant]</span> -<span class="n">df_2001</span> <span class="o">=</span> <span class="n">df_final</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">df_final</span><span class="o">.</span><span class="n">yearID</span><span class="o">==</span><span class="mi">2001</span><span class="p">)</span><span class="o">.</span><span class="n">withColumnRenamed</span><span class="p">(</span><span class="s1">'count(1)'</span><span class="p">,</span> <span class="s1">'countNum2001'</span><span class="p">)</span> -<span class="n">df_2016</span> <span class="o">=</span> <span class="n">df_final</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">df_final</span><span class="o">.</span><span class="n">yearID</span><span class="o">==</span><span class="mi">2016</span><span class="p">)</span><span class="o">.</span><span class="n">withColumnRenamed</span><span class="p">(</span><span class="s1">'count(1)'</span><span class="p">,</span> <span class="s1">'countNum2016'</span><span class="p">)</span> + +<span class="n">df_2001</span> <span class="o">=</span> <span class="n">df_final</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">df_final</span><span class="o">.</span><span class="n">yearID</span><span class="o">==</span><span class="mi">2001</span><span class="p">)</span><span class="o">.</span><span class="n">withColumnRenamed</span><span class="p">(</span><span class="s1">'count(1)'</span><span class="p">,</span> <span class="s1">'countNum2001'</span><span class="p">)</span><span class="o">.</span>\ + <span class="n">withColumnRenamed</span><span class="p">(</span><span class="s1">'birthCountry'</span><span class="p">,</span> <span class="s1">'country2001'</span> <span class="p">)</span> +<span class="n">df_2016</span> <span class="o">=</span> <span class="n">df_final</span><span class="o">.</span><span 
class="n">filter</span><span class="p">(</span><span class="n">df_final</span><span class="o">.</span><span class="n">yearID</span><span class="o">==</span><span class="mi">2016</span><span class="p">)</span><span class="o">.</span><span class="n">withColumnRenamed</span><span class="p">(</span><span class="s1">'count(1)'</span><span class="p">,</span> <span class="s1">'countNum2016'</span><span class="p">)</span><span class="o">.</span>\ + <span class="n">withColumnRenamed</span><span class="p">(</span><span class="s1">'birthCountry'</span><span class="p">,</span> <span class="s1">'country2016'</span> <span class="p">)</span> + -<span class="n">df_change</span> <span class="o">=</span> <span class="n">df_2016</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">df_2001</span><span class="p">,</span> <span class="n">df_2016</span><span class="o">.</span><span class="n">birthCountry</span><span class="o">==</span><span class="n">df_2001</span><span class="o">.</span><span class="n">birthCountry</span><span class="p">,</span> <span class="s1">'inner'</span><span class="p">)</span><span class="o">.</span>\ + +<span class="n">df_change</span> <span class="o">=</span> <span class="n">df_2016</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">df_2001</span><span class="p">,</span> <span class="n">df_2016</span><span class="o">.</span><span class="n">country2016</span><span class="o">==</span><span class="n">df_2001</span><span class="o">.</span><span class="n">country2001</span><span class="p">,</span> <span class="s1">'inner'</span><span class="p">)</span><span class="o">.</span>\ <span class="n">withColumn</span><span class="p">(</span><span class="s2">"diff"</span><span class="p">,</span> <span class="n">df_2016</span><span class="o">.</span><span class="n">countNum2016</span><span class="o">-</span><span class="n">df_2001</span><span class="o">.</span><span 
class="n">countNum2001</span><span class="p">)</span> -<span class="n">df_perc_change</span> <span class="o">=</span> <span class="n">df_change</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s2">"percentChange"</span><span class="p">,</span> <span class="p">(</span><span class="n">df_change</span><span class="o">.</span><span class="n">diff</span><span class="o">/</span><span class="n">df_change</span><span class="o">.</span><span class="n">countNum2001</span><span class="p">)</span><span class="o">*</span><span class="mi">100</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> +<span class="n">df_perc_change</span> <span class="o">=</span> <span class="n">df_change</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s2">"percentChange"</span><span class="p">,</span> <span class="p">(</span><span class="n">df_change</span><span class="o">.</span><span class="n">diff</span><span class="o">/</span><span class="n">df_change</span><span class="o">.</span><span class="n">countNum2001</span><span class="p">)</span><span class="o">*</span><span class="mi">100</span><span class="p">)</span> + +<span class="n">df_perc_change</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> </pre></div> </div> @@ -12323,27 +12330,27 @@ only showing top 20 rows <div class="output_area"><div class="prompt"></div> <div class="output_subarea output_stream output_stdout output_text"> -<pre>+------+------------+------------+------+------------+------------+----+-------------------+ -|yearID|birthCountry|countNum2016|yearID|birthCountry|countNum2001|diff| percentChange| -+------+------------+------------+------+------------+------------+----+-------------------+ -| 2016| Germany| 4| 2001| Germany| 1| 3| 300.0| -| 2016| D.R.| 134| 2001| D.R.| 109| 25| 22.93577981651376| -| 2016| Nicaragua| 3| 2001| Nicaragua| 2| 
1| 50.0| -| 2016| Curacao| 4| 2001| Curacao| 2| 2| 100.0| -| 2016| Cuba| 30| 2001| Cuba| 15| 15| 100.0| -| 2016| Panama| 6| 2001| Panama| 10| -4| -40.0| -| 2016| Venezuela| 102| 2001| Venezuela| 50| 52| 104.0| -| 2016| USA| 967| 2001| USA| 899| 68| 7.563959955506118| -| 2016| South Korea| 9| 2001| South Korea| 3| 6| 200.0| -| 2016| Mexico| 15| 2001| Mexico| 17| -2| -11.76470588235294| -| 2016| Aruba| 1| 2001| Aruba| 3| -2| -66.66666666666666| -| 2016| P.R.| 26| 2001| P.R.| 53| -27|-50.943396226415096| -| 2016| CAN| 13| 2001| CAN| 13| 0| 0.0| -| 2016| V.I.| 2| 2001| V.I.| 2| 0| 0.0| -| 2016| Japan| 9| 2001| Japan| 11| -2|-18.181818181818183| -| 2016| Australia| 4| 2001| Australia| 6| -2| -33.33333333333333| -| 2016| Colombia| 6| 2001| Colombia| 3| 3| 100.0| -+------+------------+------------+------+------------+------------+----+-------------------+ +<pre>+------+-----------+------------+------+-----------+------------+----+-------------------+ +|yearID|country2016|countNum2016|yearID|country2001|countNum2001|diff| percentChange| ++------+-----------+------------+------+-----------+------------+----+-------------------+ +| 2016| Germany| 4| 2001| Germany| 1| 3| 300.0| +| 2016| D.R.| 134| 2001| D.R.| 109| 25| 22.93577981651376| +| 2016| Nicaragua| 3| 2001| Nicaragua| 2| 1| 50.0| +| 2016| Curacao| 4| 2001| Curacao| 2| 2| 100.0| +| 2016| Cuba| 30| 2001| Cuba| 15| 15| 100.0| +| 2016| Panama| 6| 2001| Panama| 10| -4| -40.0| +| 2016| Venezuela| 102| 2001| Venezuela| 50| 52| 104.0| +| 2016| USA| 967| 2001| USA| 899| 68| 7.563959955506118| +| 2016|South Korea| 9| 2001|South Korea| 3| 6| 200.0| +| 2016| Mexico| 15| 2001| Mexico| 17| -2| -11.76470588235294| +| 2016| Aruba| 1| 2001| Aruba| 3| -2| -66.66666666666666| +| 2016| P.R.| 26| 2001| P.R.| 53| -27|-50.943396226415096| +| 2016| CAN| 13| 2001| CAN| 13| 0| 0.0| +| 2016| V.I.| 2| 2001| V.I.| 2| 0| 0.0| +| 2016| Japan| 9| 2001| Japan| 11| -2|-18.181818181818183| +| 2016| Australia| 4| 2001| Australia| 6| -2| 
-33.33333333333333| +| 2016| Colombia| 6| 2001| Colombia| 3| 3| 100.0| ++------+-----------+------------+------+-----------+------------+----+-------------------+ </pre> </div> @@ -12352,15 +12359,90 @@ only showing top 20 rows </div> </div> +</div> +<div class="cell border-box-sizing text_cell rendered"> +<div class="prompt input_prompt"> +</div> +<div class="inner_cell"> +<div class="text_cell_render border-box-sizing rendered_html"> +<h4 id="Pyspark-Data-Operations-to-Track-the-Growth-of-a-Sport-in-a-Single-Country">Pyspark Data Operations to Track the Growth of a Sport in a Single Country<a class="anchor-link" href="#Pyspark-Data-Operations-to-Track-the-Growth-of-a-Sport-in-a-Single-Country">¶</a></h4><p>We can also slice the Dataframe to look at the number of players represented, from a specific country, over a specific time period. We can do this to track the growth of the sport in the country. The example below shows the growth of the sport in Venezuela from after the year 2000.</p> + +</div> +</div> </div> <div class="cell border-box-sizing code_cell rendered"> <div class="input"> -<div class="prompt input_prompt">In [595]:</div> +<div class="prompt input_prompt">In [11]:</div> <div class="inner_cell"> <div class="input_area"> -<div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># Examples to show how to print the results to an output file</span> +<div class=" highlight hl-ipython3"><pre><span></span><span class="n">df_ven_last_15</span> <span class="o">=</span> <span class="n">df_final</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">df_final</span><span class="o">.</span><span class="n">yearID</span><span class="o">></span><span class="mi">2000</span><span class="p">)</span><span class="o">.</span>\ + <span class="nb">filter</span><span class="p">(</span><span class="n">df_final</span><span class="o">.</span><span class="n">birthCountry</span><span class="o">==</span><span 
class="s2">"Venezuela"</span><span class="p">)</span><span class="o">.</span>\ + <span class="n">withColumnRenamed</span><span class="p">(</span><span class="s1">'count(1)'</span><span class="p">,</span> <span class="s1">'count'</span><span class="p">)</span> + +<span class="n">df_ven_last_15</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> +</pre></div> + +</div> +</div> +</div> + +<div class="output_wrapper"> +<div class="output"> + -<span class="n">df_final</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">csv</span><span class="p">(</span><span class="s1">'Question1.csv'</span><span class="p">)</span> +<div class="output_area"><div class="prompt"></div> +<div class="output_subarea output_stream output_stdout output_text"> +<pre>+------+------------+-----+ +|yearID|birthCountry|count| ++------+------------+-----+ +| 2001| Venezuela| 50| +| 2002| Venezuela| 54| +| 2003| Venezuela| 61| +| 2004| Venezuela| 66| +| 2005| Venezuela| 65| +| 2006| Venezuela| 70| +| 2007| Venezuela| 73| +| 2008| Venezuela| 83| +| 2009| Venezuela| 87| +| 2010| Venezuela| 81| +| 2011| Venezuela| 84| +| 2012| Venezuela| 90| +| 2013| Venezuela| 94| +| 2014| Venezuela| 97| +| 2015| Venezuela| 99| +| 2016| Venezuela| 102| ++------+------------+-----+ + +</pre> +</div> +</div> + +</div> +</div> + +</div> +<div class="cell border-box-sizing text_cell rendered"> +<div class="prompt input_prompt"> +</div> +<div class="inner_cell"> +<div class="text_cell_render border-box-sizing rendered_html"> +<h4 id="Pyspark-Test-Results">Pyspark Test Results<a class="anchor-link" href="#Pyspark-Test-Results">¶</a></h4><p>We convert our spark data frames to pandas data frames, so it is easy to save them in a human readable csv format. 
These files contain the answers to the questions we posed.</p> + +</div> +</div> +</div> +<div class="cell border-box-sizing code_cell rendered"> +<div class="input"> +<div class="prompt input_prompt">In [12]:</div> +<div class="inner_cell"> + <div class="input_area"> +<div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># Examples to show how to print the results to an output file</span> +<span class="n">pandas_final</span> <span class="o">=</span> <span class="n">df_final</span><span class="o">.</span><span class="n">toPandas</span><span class="p">()</span> +<span class="n">pandas_perc_change</span> <span class="o">=</span> <span class="n">df_perc_change</span><span class="o">.</span><span class="n">toPandas</span><span class="p">()</span> +<span class="n">pandas_ven_last_15</span> <span class="o">=</span> <span class="n">df_ven_last_15</span><span class="o">.</span><span class="n">toPandas</span><span class="p">()</span> +<span class="n">pandas_final</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="s1">'spark_question1_global_representation.csv'</span><span class="p">)</span> +<span class="n">pandas_perc_change</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="s1">'spark_question1_global_change_last_15.csv'</span><span class="p">)</span> +<span class="n">pandas_ven_last_15</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="s1">'spark_question1_venezuela_last_15.csv'</span><span class="p">)</span> </pre></div> </div> @@ -12370,7 +12452,7 @@ only showing top 20 rows </div> <div class="cell border-box-sizing code_cell rendered"> <div class="input"> -<div class="prompt input_prompt">In [596]:</div> +<div class="prompt input_prompt">In [13]:</div> <div class="inner_cell"> <div class="input_area"> <div class=" highlight hl-ipython3"><pre><span></span><span class="n">sc</span><span class="o">.</span><span 
class="n">stop</span><span class="p">()</span> diff --git a/notebooks/Question1.ipynb b/notebooks/Question1.ipynb index 906179d..7f5713d 100644 --- a/notebooks/Question1.ipynb +++ b/notebooks/Question1.ipynb @@ -82,7 +82,7 @@ }, { "cell_type": "code", - "execution_count": 585, + "execution_count": 1, "metadata": { "collapsed": false }, @@ -99,6 +99,7 @@ "from pyspark.sql.functions import count\n", "\n", "\n", + "\n", "\n" ] }, @@ -114,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 586, + "execution_count": 2, "metadata": { "collapsed": true }, @@ -129,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 587, + "execution_count": 3, "metadata": { "collapsed": false }, @@ -152,7 +153,7 @@ }, { "cell_type": "code", - "execution_count": 588, + "execution_count": 4, "metadata": { "collapsed": true }, @@ -192,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": 589, + "execution_count": 5, "metadata": { "collapsed": false }, @@ -269,7 +270,7 @@ }, { "cell_type": "code", - "execution_count": 590, + "execution_count": 6, "metadata": { "collapsed": false }, @@ -280,7 +281,7 @@ "19105" ] }, - "execution_count": 590, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -295,7 +296,7 @@ }, { "cell_type": "code", - "execution_count": 591, + "execution_count": 7, "metadata": { "collapsed": false }, @@ -306,7 +307,7 @@ "53" ] }, - "execution_count": 591, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -321,7 +322,7 @@ }, { "cell_type": "code", - "execution_count": 592, + "execution_count": 8, "metadata": { "collapsed": false }, @@ -332,7 +333,7 @@ "1343" ] }, - "execution_count": 592, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -347,7 +348,7 @@ }, { "cell_type": "code", - "execution_count": 593, + "execution_count": 9, "metadata": { "collapsed": false }, @@ -358,7 +359,7 @@ "22" ] }, - "execution_count": 593, + "execution_count": 9, "metadata": {}, "output_type": 
"execute_result" } @@ -380,12 +381,12 @@ "\n", "From the Data it is obvious that USA produces the most players. It has 967 players in 2016 and 899 players in 2011. The Dominican Republic and Venezuela also had large representations with 134 and 102 players respectively. \n", "\n", - "In terms of a statistically significant increase in players, Venezuela saw a 104% increase in players (50 to 102) represented from 2001 to 2016. Puerto Rico surprisingly showed a 51% decrease in players (53 to 26) represented from 2001 to 2016. " + "In terms of a statistically significant increase in players, Venezuela saw a 104% increase in players (50 to 102) represented from 2001 to 2016. Puerto Rico surprisingly showed a 51% decrease in players (53 to 26) represented from 2001 to 2016. \n" ] }, { "cell_type": "code", - "execution_count": 594, + "execution_count": 10, "metadata": { "collapsed": false }, @@ -394,27 +395,27 @@ "name": "stdout", "output_type": "stream", "text": [ - "+------+------------+------------+------+------------+------------+----+-------------------+\n", - "|yearID|birthCountry|countNum2016|yearID|birthCountry|countNum2001|diff| percentChange|\n", - "+------+------------+------------+------+------------+------------+----+-------------------+\n", - "| 2016| Germany| 4| 2001| Germany| 1| 3| 300.0|\n", - "| 2016| D.R.| 134| 2001| D.R.| 109| 25| 22.93577981651376|\n", - "| 2016| Nicaragua| 3| 2001| Nicaragua| 2| 1| 50.0|\n", - "| 2016| Curacao| 4| 2001| Curacao| 2| 2| 100.0|\n", - "| 2016| Cuba| 30| 2001| Cuba| 15| 15| 100.0|\n", - "| 2016| Panama| 6| 2001| Panama| 10| -4| -40.0|\n", - "| 2016| Venezuela| 102| 2001| Venezuela| 50| 52| 104.0|\n", - "| 2016| USA| 967| 2001| USA| 899| 68| 7.563959955506118|\n", - "| 2016| South Korea| 9| 2001| South Korea| 3| 6| 200.0|\n", - "| 2016| Mexico| 15| 2001| Mexico| 17| -2| -11.76470588235294|\n", - "| 2016| Aruba| 1| 2001| Aruba| 3| -2| -66.66666666666666|\n", - "| 2016| P.R.| 26| 2001| P.R.| 53| -27|-50.943396226415096|\n", 
- "| 2016| CAN| 13| 2001| CAN| 13| 0| 0.0|\n", - "| 2016| V.I.| 2| 2001| V.I.| 2| 0| 0.0|\n", - "| 2016| Japan| 9| 2001| Japan| 11| -2|-18.181818181818183|\n", - "| 2016| Australia| 4| 2001| Australia| 6| -2| -33.33333333333333|\n", - "| 2016| Colombia| 6| 2001| Colombia| 3| 3| 100.0|\n", - "+------+------------+------------+------+------------+------------+----+-------------------+\n", + "+------+-----------+------------+------+-----------+------------+----+-------------------+\n", + "|yearID|country2016|countNum2016|yearID|country2001|countNum2001|diff| percentChange|\n", + "+------+-----------+------------+------+-----------+------------+----+-------------------+\n", + "| 2016| Germany| 4| 2001| Germany| 1| 3| 300.0|\n", + "| 2016| D.R.| 134| 2001| D.R.| 109| 25| 22.93577981651376|\n", + "| 2016| Nicaragua| 3| 2001| Nicaragua| 2| 1| 50.0|\n", + "| 2016| Curacao| 4| 2001| Curacao| 2| 2| 100.0|\n", + "| 2016| Cuba| 30| 2001| Cuba| 15| 15| 100.0|\n", + "| 2016| Panama| 6| 2001| Panama| 10| -4| -40.0|\n", + "| 2016| Venezuela| 102| 2001| Venezuela| 50| 52| 104.0|\n", + "| 2016| USA| 967| 2001| USA| 899| 68| 7.563959955506118|\n", + "| 2016|South Korea| 9| 2001|South Korea| 3| 6| 200.0|\n", + "| 2016| Mexico| 15| 2001| Mexico| 17| -2| -11.76470588235294|\n", + "| 2016| Aruba| 1| 2001| Aruba| 3| -2| -66.66666666666666|\n", + "| 2016| P.R.| 26| 2001| P.R.| 53| -27|-50.943396226415096|\n", + "| 2016| CAN| 13| 2001| CAN| 13| 0| 0.0|\n", + "| 2016| V.I.| 2| 2001| V.I.| 2| 0| 0.0|\n", + "| 2016| Japan| 9| 2001| Japan| 11| -2|-18.181818181818183|\n", + "| 2016| Australia| 4| 2001| Australia| 6| -2| -33.33333333333333|\n", + "| 2016| Colombia| 6| 2001| Colombia| 3| 3| 100.0|\n", + "+------+-----------+------------+------+-----------+------------+----+-------------------+\n", "\n" ] } @@ -428,32 +429,103 @@ "# Percentage Increase - Germany (300%) from 1 to 4. [Not Statistically significant] \n", "# Percentage Decrease - Aruba (-67%) from 3 to 1. 
[Not Statistically significant]\n", "\n", - "df_2001 = df_final.filter(df_final.yearID==2001).withColumnRenamed('count(1)', 'countNum2001')\n", - "df_2016 = df_final.filter(df_final.yearID==2016).withColumnRenamed('count(1)', 'countNum2016')\n", + "\n", + "df_2001 = df_final.filter(df_final.yearID==2001).withColumnRenamed('count(1)', 'countNum2001').\\\n", + " withColumnRenamed('birthCountry', 'country2001' )\n", + "df_2016 = df_final.filter(df_final.yearID==2016).withColumnRenamed('count(1)', 'countNum2016').\\\n", + " withColumnRenamed('birthCountry', 'country2016' )\n", + " \n", "\n", "\n", - "df_change = df_2016.join(df_2001, df_2016.birthCountry==df_2001.birthCountry, 'inner').\\\n", + "\n", + "df_change = df_2016.join(df_2001, df_2016.country2016==df_2001.country2001, 'inner').\\\n", " withColumn(\"diff\", df_2016.countNum2016-df_2001.countNum2001)\n", " \n", - "df_perc_change = df_change.withColumn(\"percentChange\", (df_change.diff/df_change.countNum2001)*100).show()\n" + "df_perc_change = df_change.withColumn(\"percentChange\", (df_change.diff/df_change.countNum2001)*100)\n", + "\n", + "df_perc_change.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Pyspark Data Operations to Track the Growth of a Sport in a Single Country\n", + "\n", + "We can also slice the Dataframe to look at the number of players represented, from a specific country, over a specific time period. We can do this to track the growth of the sport in the country. The example below shows the growth of the sport in Venezuela from after the year 2000. 
" ] }, { "cell_type": "code", - "execution_count": 595, + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------+------------+-----+\n", + "|yearID|birthCountry|count|\n", + "+------+------------+-----+\n", + "| 2001| Venezuela| 50|\n", + "| 2002| Venezuela| 54|\n", + "| 2003| Venezuela| 61|\n", + "| 2004| Venezuela| 66|\n", + "| 2005| Venezuela| 65|\n", + "| 2006| Venezuela| 70|\n", + "| 2007| Venezuela| 73|\n", + "| 2008| Venezuela| 83|\n", + "| 2009| Venezuela| 87|\n", + "| 2010| Venezuela| 81|\n", + "| 2011| Venezuela| 84|\n", + "| 2012| Venezuela| 90|\n", + "| 2013| Venezuela| 94|\n", + "| 2014| Venezuela| 97|\n", + "| 2015| Venezuela| 99|\n", + "| 2016| Venezuela| 102|\n", + "+------+------------+-----+\n", + "\n" + ] + } + ], + "source": [ + "df_ven_last_15 = df_final.filter(df_final.yearID>2000).\\\n", + " filter(df_final.birthCountry==\"Venezuela\").\\\n", + " withColumnRenamed('count(1)', 'count')\n", + " \n", + "df_ven_last_15.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Pyspark Test Results\n", + "We convert our spark data frames to pandas data frames, so it is easy to save them in a human readable csv format. These files contain the answers to the questions we posed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Examples to show how to print the results to an output file\n", - "\n", - "df_final.write.csv('Question1.csv')" + "pandas_final = df_final.toPandas()\n", + "pandas_perc_change = df_perc_change.toPandas()\n", + "pandas_ven_last_15 = df_ven_last_15.toPandas()\n", + "pandas_final.to_csv('spark_question1_global_representation.csv')\n", + "pandas_perc_change.to_csv('spark_question1_global_change_last_15.csv')\n", + "pandas_ven_last_15.to_csv('spark_question1_venezuela_last_15.csv')" ] }, { "cell_type": "code", - "execution_count": 596, + "execution_count": 13, "metadata": { "collapsed": false }, diff --git a/notebooks/Question1.md b/notebooks/Question1.md index 2ca5ce9..542de15 100644 --- a/notebooks/Question1.md +++ b/notebooks/Question1.md @@ -84,6 +84,7 @@ from pyspark.sql.functions import count + ``` #### Pyspark Configuration & Instantiation @@ -277,6 +278,7 @@ From the Data it is obvious that USA produces the most players. It has 967 playe In terms of a statistically significant increase in players, Venezuela saw a 104% increase in players (50 to 102) represented from 2001 to 2016. Puerto Rico surprisingly showed a 51% decrease in players (53 to 26) represented from 2001 to 2016. + ```python # Additional Examples showing how to get additional statistics # Highest growth and Highest Decline in the Last 15 years @@ -286,46 +288,96 @@ In terms of a statistically significant increase in players, Venezuela saw a 104 # Percentage Increase - Germany (300%) from 1 to 4. [Not Statistically significant] # Percentage Decrease - Aruba (-67%) from 3 to 1. 
[Not Statistically significant] -df_2001 = df_final.filter(df_final.yearID==2001).withColumnRenamed('count(1)', 'countNum2001') -df_2016 = df_final.filter(df_final.yearID==2016).withColumnRenamed('count(1)', 'countNum2016') + +df_2001 = df_final.filter(df_final.yearID==2001).withColumnRenamed('count(1)', 'countNum2001').\ + withColumnRenamed('birthCountry', 'country2001' ) +df_2016 = df_final.filter(df_final.yearID==2016).withColumnRenamed('count(1)', 'countNum2016').\ + withColumnRenamed('birthCountry', 'country2016' ) + + -df_change = df_2016.join(df_2001, df_2016.birthCountry==df_2001.birthCountry, 'inner').\ +df_change = df_2016.join(df_2001, df_2016.country2016==df_2001.country2001, 'inner').\ withColumn("diff", df_2016.countNum2016-df_2001.countNum2001) -df_perc_change = df_change.withColumn("percentChange", (df_change.diff/df_change.countNum2001)*100).show() +df_perc_change = df_change.withColumn("percentChange", (df_change.diff/df_change.countNum2001)*100) + +df_perc_change.show() ``` - +------+------------+------------+------+------------+------------+----+-------------------+ - |yearID|birthCountry|countNum2016|yearID|birthCountry|countNum2001|diff| percentChange| - +------+------------+------------+------+------------+------------+----+-------------------+ - | 2016| Germany| 4| 2001| Germany| 1| 3| 300.0| - | 2016| D.R.| 134| 2001| D.R.| 109| 25| 22.93577981651376| - | 2016| Nicaragua| 3| 2001| Nicaragua| 2| 1| 50.0| - | 2016| Curacao| 4| 2001| Curacao| 2| 2| 100.0| - | 2016| Cuba| 30| 2001| Cuba| 15| 15| 100.0| - | 2016| Panama| 6| 2001| Panama| 10| -4| -40.0| - | 2016| Venezuela| 102| 2001| Venezuela| 50| 52| 104.0| - | 2016| USA| 967| 2001| USA| 899| 68| 7.563959955506118| - | 2016| South Korea| 9| 2001| South Korea| 3| 6| 200.0| - | 2016| Mexico| 15| 2001| Mexico| 17| -2| -11.76470588235294| - | 2016| Aruba| 1| 2001| Aruba| 3| -2| -66.66666666666666| - | 2016| P.R.| 26| 2001| P.R.| 53| -27|-50.943396226415096| - | 2016| CAN| 13| 2001| CAN| 13| 0| 
0.0| - | 2016| V.I.| 2| 2001| V.I.| 2| 0| 0.0| - | 2016| Japan| 9| 2001| Japan| 11| -2|-18.181818181818183| - | 2016| Australia| 4| 2001| Australia| 6| -2| -33.33333333333333| - | 2016| Colombia| 6| 2001| Colombia| 3| 3| 100.0| - +------+------------+------------+------+------------+------------+----+-------------------+ + +------+-----------+------------+------+-----------+------------+----+-------------------+ + |yearID|country2016|countNum2016|yearID|country2001|countNum2001|diff| percentChange| + +------+-----------+------------+------+-----------+------------+----+-------------------+ + | 2016| Germany| 4| 2001| Germany| 1| 3| 300.0| + | 2016| D.R.| 134| 2001| D.R.| 109| 25| 22.93577981651376| + | 2016| Nicaragua| 3| 2001| Nicaragua| 2| 1| 50.0| + | 2016| Curacao| 4| 2001| Curacao| 2| 2| 100.0| + | 2016| Cuba| 30| 2001| Cuba| 15| 15| 100.0| + | 2016| Panama| 6| 2001| Panama| 10| -4| -40.0| + | 2016| Venezuela| 102| 2001| Venezuela| 50| 52| 104.0| + | 2016| USA| 967| 2001| USA| 899| 68| 7.563959955506118| + | 2016|South Korea| 9| 2001|South Korea| 3| 6| 200.0| + | 2016| Mexico| 15| 2001| Mexico| 17| -2| -11.76470588235294| + | 2016| Aruba| 1| 2001| Aruba| 3| -2| -66.66666666666666| + | 2016| P.R.| 26| 2001| P.R.| 53| -27|-50.943396226415096| + | 2016| CAN| 13| 2001| CAN| 13| 0| 0.0| + | 2016| V.I.| 2| 2001| V.I.| 2| 0| 0.0| + | 2016| Japan| 9| 2001| Japan| 11| -2|-18.181818181818183| + | 2016| Australia| 4| 2001| Australia| 6| -2| -33.33333333333333| + | 2016| Colombia| 6| 2001| Colombia| 3| 3| 100.0| + +------+-----------+------------+------+-----------+------------+----+-------------------+ +#### Pyspark Data Operations to Track the Growth of a Sport in a Single Country + +We can also slice the Dataframe to look at the number of players represented, from a specific country, over a specific time period. We can do this to track the growth of the sport in the country. The example below shows the growth of the sport in Venezuela from after the year 2000. 
+ ```python -# Examples to show how to print the results to an output file +df_ven_last_15 = df_final.filter(df_final.yearID>2000).\ + filter(df_final.birthCountry=="Venezuela").\ + withColumnRenamed('count(1)', 'count') + +df_ven_last_15.show() +``` + + +------+------------+-----+ + |yearID|birthCountry|count| + +------+------------+-----+ + | 2001| Venezuela| 50| + | 2002| Venezuela| 54| + | 2003| Venezuela| 61| + | 2004| Venezuela| 66| + | 2005| Venezuela| 65| + | 2006| Venezuela| 70| + | 2007| Venezuela| 73| + | 2008| Venezuela| 83| + | 2009| Venezuela| 87| + | 2010| Venezuela| 81| + | 2011| Venezuela| 84| + | 2012| Venezuela| 90| + | 2013| Venezuela| 94| + | 2014| Venezuela| 97| + | 2015| Venezuela| 99| + | 2016| Venezuela| 102| + +------+------------+-----+ + + -df_final.write.csv('Question1.csv') +#### Pyspark Test Results +We convert our spark data frames to pandas data frames, so it is easy to save them in a human readable csv format. These files contain the answers to the questions we posed. + + +```python +# Examples to show how to print the results to an output file +pandas_final = df_final.toPandas() +pandas_perc_change = df_perc_change.toPandas() +pandas_ven_last_15 = df_ven_last_15.toPandas() +pandas_final.to_csv('spark_question1_global_representation.csv') +pandas_perc_change.to_csv('spark_question1_global_change_last_15.csv') +pandas_ven_last_15.to_csv('spark_question1_venezuela_last_15.csv') ``` diff --git a/src/question_1_pyspark.py b/src/question_1_pyspark.py index 3d7d876..99d2fd0 100644 --- a/src/question_1_pyspark.py +++ b/src/question_1_pyspark.py @@ -1,42 +1,168 @@ -#------------------------------------------------------------------------------- -# Name: module1 -# Purpose: +# ## How has the Global Representation of Baseball Players changed over time? What countries produce the most baseball +# players in number? What countries have showed the highest increase and Highest Decline in players in the last 15 years. 
+# ____ # -# Author: antaonn # -# Created: 21/04/2018 -# Copyright: (c) antaonn 2018 -# Licence: <your licence> -#------------------------------------------------------------------------------- +# In order to determine how the global representation of MLB Players has changed from 1870 to 2016, we look at Historical +# Baseball Data available on the Internet. The specific source of data chosen here is a database of baseball statistics over +# the years 1870 to 2016. http://www.seanlahman.com/baseball-database.html +# +# This database has 27 tables. However to obtain the answer for our query above, we need to cross reference data from 2 +# tables in this database. The Master.csv table lists every player that has played the game from 1870 to 2016, along with +# their country of origin. Its schema is listed below. +# +# #### Table 1: Master Table Schema +# +# +# | Field | Description | +# | ---------- | -------------------------------------- | +# | playerID | A unique code asssigned to each player | +# | birthYear | Year player was born | +# | birthMonth | Month player was born | +# | birthDay | Day player was born | +# | birthCount | Country where player was born | +# | birthState | State where player was born | +# | birthCity | City where player was born | +# | deathYear | Year player died | +# | deathMonth | Month player died | +# | deathDay | Day player died | +# | deathCount | Country where player died | +# | deathState | State where player died | +# | deathCity | City where player died | +# | nameFirst | Player's first name | +# | nameLast | Player's last name | +# | nameGiven | Player's given name | +# | weight | Player's weight in pounds | +# | height | Player's height in inches | +# | bats | Player's batting hand (left, right) | +# | throws | Player's throwing hand (left or right) | +# | debut | Date that player made first appearance | +# | finalGame | Date that player made last appearance | +# | retroID | ID used by retrosheet | +# | bbrefID | ID used by 
Baseball Reference website | +# +# The Fielding.csv table lists the Fielding statistics for every player, who has played the game of baseball from 1870 to +# 2016, along with the year those statistics were recorded. Its schema is listed below +# +# #### Table 2 Fielding Table schema +# +# +# | Field | Description | +# | --------- | -------------------------------------- | +# | playerID | A unique code asssigned to each player | +# | yearID | Year | +# | stint | players stint | +# | teamID | Team | +# | lgID | League | +# | Pos | Position | +# | G | Games | +# | GS | Games Started | +# | InnOuts | Time Played (As Outs) | +# | PO | PutOuts | +# | A | Assists | +# | E | Errors | +# | DP | Double Plays | +# | PB | Passed Balls (Catcher) | +# | WP | Wild Pitches (Catcher) | +# | SB | Opponent Stolen Bases | +# | CS | Opponent Caught Stealing | +# | ZR | Zone Rating | +# +# +# We Utilize Apache Spark to perform the required database operations to answer our questions. The Code below explains the +# process of answering these questions, and shows how easy it is to use Spark to analyze Big Data. The Code to implement this +# query is implemented in Python, and can either be run on a local server or a cluster of servers. The example below was run +# on an Amazon EC2 Free Tier Ubuntu Server instance. The EC2 instance was set up with Python (Anaconda 3-4.1.1), Java, +# Scala, py4j, Spark and Hadoop. The code was written and executed in a Jupyter Notebook. Several guides are available on the +# internet describing how to install and run spark on an EC2 instance. One that particularly covers all these facets is +# https://medium.com/@josemarcialportilla/getting-spark-python-and-jupyter-notebook-running-on-amazon-ec2-dec599e1c297 + +# #### Pyspark Libraries +# Import the pyspark libraries to allow python to interact with spark. A description of the basic functionality of each of +# these libaries is provided in the code comments below. 
A more detailed explanation of the functionality of each of these +# libraries can be found in Apache's documentation on Spark https://spark.apache.org/docs/latest/api/python/index.html + +# In[1]: # Import SparkContext. This is the main entry point for Spark functionality -# Import Sparkconf. We use Spark Conf to easily change the configuration settings when changing between local mode cluster mode. -# Import SQLContext from pyspark.sql. We use the libraries here to read in data in csv format. The format of our native database +# Import Sparkconf. We use Spark Conf to easily change the configuration settings when changing between local mode cluster +# mode. +# Import SQLContext from pyspark.sql. We use the libraries here to read in data in csv format. The format of our native +# database. # Import count from pyspark.sql.functions. This is used for the count operations needed to answer our questions -import time -starttime = time.time() from pyspark import SparkContext, SparkConf from pyspark.sql import SQLContext from pyspark.sql.functions import count + + + + + +# #### Pyspark Configuration & Instantiation +# We configure spark for local mode or cluster mode, configure our application name, and configure logging. Several other +# configuration settings can be programmed as well. A detailed explanation of these can be found at +# https://spark.apache.org/docs/latest/configuration.html +# +# We pass the configuration to an instance of a SparkContext object, so that we can begin using Apache Spark + +# In[2]: + # The Master will need to change when running on a cluster. -# If we need to specify multiple cores we can list something like local[2] for 2 cores, or local[*] to use all available cores. -# All the available Configuration settings can be found at https://spark.apache.org/docs/latest/configuration.html +# If we need to specify multiple cores we can list something like local[2] for 2 cores, or local[*] to use all available +# cores. 
All the available Configuration settings can be found at https://spark.apache.org/docs/latest/configuration.html sc_conf = SparkConf().setMaster('local[*]').setAppName('Question1').set('spark.logConf', True) + +# In[3]: + # We instantiate a SparkContext object with the SparkConfig sc = SparkContext(conf=sc_conf) + +# #### Pyspark CSV file Processing +# We use the SQLContext library to easily allow us to read the csv files 'Master.csv' and 'Fielding.csv'. These files are +# currently stored in Amazon s3 storage (s3://cs498ccafinalproject/) and are publicly available for download. They were +# copied over to a local EC2 instance by using the AWS command line interface command +# +# ```aws s3 cp s3://cs498ccafinalproject . --recursive``` + +# In[4]: + # We create a sql context object, so that we can read in csv files easily, and create a data frame sqlContext = SQLContext(sc) df_master = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('Master.csv') df_field = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('Fielding.csv') + +# #### Pyspark Data Operations to Determine how the Global Representation of Baseball players have changed from 1870 to 2016 +# +# In order to determine how the Global representation of Major League Baseball players has changed over time, we perform the +# following operations +# +# 1) We perform an inner join on the Fielding.csv and Master.csv tables, using the playerID as a unique key. +# +# 2) We select only the columns that we need (playerID, birthCountry and yearID) to answer our question +# +# 3) We drop duplicate entries in the joined table. These can arise from players who played on multiple teams in the same +# year, or players who were called up to the majors, and dropped down to the minors multiple times a year. +# +# 4) We clean the database to remove any Null entries, for when the player's country of origin was unknown. 
This is especially +# common for the years between 1870 and 1912 +# +# 5) We group the cleaned data by yearID and birthCountry, then perform an aggregation operation to determine the count. +# +# 6) We then sort the data by yearID +# +# This gives us a dataframe that lists the number of players born in a specific country, for every year from 1870 to 2016. + +# In[5]: + # Join the two tables, and filter the colums we need. # Remove duplicates # Clean Null Entries @@ -47,25 +173,135 @@ df_field = sqlContext.read.format('com.databricks.spark.csv').options(header='tr keep = [df_field.playerID, df_field.yearID, df_master.birthCountry ] df_merge = df_field.join(df_master, df_field.playerID==df_master.playerID, 'inner').select(*keep).dropDuplicates() df_clean = df_merge.filter(df_merge.birthCountry != "") -df_final = df_clean.groupBy(df_clean.yearID, df_clean.birthCountry).\ - agg(count("*")).\ - orderBy(df_clean.yearID) +df_final = df_clean.groupBy(df_clean.yearID, df_clean.birthCountry). agg(count("*")). orderBy(df_clean.yearID) + +df_final.show() + + + + +# #### Pyspark Additional Statistics +# To put our data into context, we can also look up the following information +# +# 1) How many people have played in major league baseball from 1870 to 2016 +# +# 2) How many unique countries have been represented by players in Major League Baseball from 1870 to 2016 +# +# 3) How many people played Major League Baseball in the Year 2016 +# +# 4) How many unique countries were represented by players in Major League Baseball in the Year 2016 +# +# + +# In[6]: + +# Additional Examples showing how to get additional statistics +# Number of players in MLB from 1870 to 2016. 
+# Answer: 19105 + +df_master.count() + + +# In[7]: + +# Additional Examples showing how to get additional statistics +# Number of Unique Countries that have had players in MLB from 1870 to 2016 +# Answer: 53 + +df_clean.select(df_clean.birthCountry).distinct().count() + + +# In[8]: + +# Additional Examples showing how to get additional statistics +# Number of MLB Players in 2016 +# Answer: 1343 + +df_merge.filter(df_merge.yearID==2016).count() + + +# In[9]: + +# Additional Examples showing how to get additional statistics +# Number of Countries represented in 2016 +# Answer: 22 +df_merge.filter(df_merge.yearID==2016).groupBy(df_merge.birthCountry).agg(count("*")).count() + -#df_final.show() +# #### Pyspark Data Operations to show what Countries produce the most Major League Baseball players and which countries have +# shown the greatest increase and greatest decline in Major league players between 2001 and 2016. +# +# To determine which countries have produced the most baseball players in 2016, we slice the dataframe we obtained to +# determine global representation of players, for the year 2016. We can additionally look at a slice of this dataframe from +# 2001. If we join the two slices, and compute the difference between players represented in 2016 and 2001, we can determine +# the corresponding percentage increase/decrease, as well as get a snapshot of which countries produce the most baseball players. +# +# From the Data it is obvious that USA produces the most players. It has 967 players in 2016 and 899 players in 2001. The +# Dominican Republic and Venezuela also had large representations with 134 and 102 players respectively. +# +# In terms of a statistically significant increase in players, Venezuela saw a 104% increase in players (50 to 102) +# represented from 2001 to 2016. Puerto Rico surprisingly showed a 51% decrease in players (53 to 26) represented from 2001 +# to 2016. 
+# + + +# In[10]: -df_2001 = df_final.filter(df_final.yearID==2001).withColumnRenamed('count(1)', 'countNum2001') -df_2016 = df_final.filter(df_final.yearID==2016).withColumnRenamed('count(1)', 'countNum2016') +# Additional Examples showing how to get additional statistics +# Highest growth and Highest Decline in the Last 15 years +# Answer: +# Significant Increase - Venezuela (104%) from 50 to 102 +# Significant Decrease - Puerto Rico (-51%) from 53 to 26 +# Percentage Increase - Germany (300%) from 1 to 4. [Not Statistically significant] +# Percentage Decrease - Aruba (-67%) from 3 to 1. [Not Statistically significant] -df_change = df_2016.join(df_2001, df_2016.birthCountry==df_2001.birthCountry, 'inner').\ +df_2001 = df_final.filter(df_final.yearID==2001).withColumnRenamed('count(1)', 'countNum2001').\ + withColumnRenamed('birthCountry', 'country2001' ) +df_2016 = df_final.filter(df_final.yearID==2016).withColumnRenamed('count(1)', 'countNum2016').\ + withColumnRenamed('birthCountry', 'country2016' ) + + + + +df_change = df_2016.join(df_2001, df_2016.country2016==df_2001.country2001, 'inner').\ withColumn("diff", df_2016.countNum2016-df_2001.countNum2001) df_perc_change = df_change.withColumn("percentChange", (df_change.diff/df_change.countNum2001)*100) -#df_perc_change.show() +df_perc_change.show() + + +# #### Pyspark Data Operations to Track the Growth of a Sport in a Single Country +# +# We can also slice the Dataframe to look at the number of players represented, from a specific country, over a specific time +# period. We can do this to track the growth of the sport in the country. The example below shows the growth of the sport in +# Venezuela from after the year 2000. + +# In[11]: + +df_ven_last_15 = df_final.filter(df_final.yearID>2000). 
filter(df_final.birthCountry=="Venezuela").\ + withColumnRenamed('count(1)', 'count') + +df_ven_last_15.show() + + +# #### Pyspark Test Results +# We convert our spark data frames to pandas data frames, so it is easy to save them in a human readable csv format. These +# files contain the answers to the questions we posed. + +# In[12]: + +# Examples to show how to print the results to an output file +pandas_final = df_final.toPandas() +pandas_perc_change = df_perc_change.toPandas() +pandas_ven_last_15 = df_ven_last_15.toPandas() +pandas_final.to_csv('spark_question1_global_representation.csv') +pandas_perc_change.to_csv('spark_question1_global_change_last_15.csv') +pandas_ven_last_15.to_csv('spark_question1_venezuela_last_15.csv') + -#df_final.write.csv('Question1.csv') +# In[13]: sc.stop() -print (time.time()-starttime) -- GitLab