Question 3a

At what age do players provide the most value? After how many years in the league are players most productive, and when do their skills start to decline?

Approach

For this question we will use the value the player provided as a batter. The value calculation is simply the number of successful batting attempts divided by the number of batting attempts:

Hits / At Bats = Batting Value

# Start from an empty workspace so objects left over from earlier runs cannot affect this analysis
rm(list = ls(all.names = FALSE))

Load the necessary libraries

We will use the following libraries for this analysis:

Package Name Description / Reason for use
pacman Package manager - automatically installs the packages if we need them
sqldf Ability to perform simple SQL queries on dataframes
highcharter Produce Highcharts / D3 graphs
knitr Transform analysis into html for viewing
SparkR Connections to retrieve data from our Spark instance
tictoc Ability to measure performance of the environment
RCurl Retrieve remote files
# Load the necessary libraries, using pacman as a package manager.
# requireNamespace() only checks whether pacman is installed: unlike require() it does not
# attach the package or warn when it is missing. Attaching is unnecessary because the call
# below is namespace-qualified.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
# Load every package this analysis relies on in one call
pacman::p_load(sqldf, highcharter, knitr, SparkR, tictoc, RCurl)

Start the benchmark

# Start the tictoc timer; the toc() call at the end of the script reports the elapsed time
tic()

Load the data files

# Base location of the project's CSV data files
dataBaseUrl <- "https://s3.amazonaws.com/cs498ccafinalproject/"

# Other CSV files published at the same location (not needed for this question):
#   AllstarFull, Appearances, AwardsManagers, AwardsPlayers, AwardsShareManagers,
#   AwardsSharePlayers, BattingPost, CollegePlaying, Fielding, FieldingOF,
#   FieldingOFsplit, FieldingPost, HallOfFame, HomeGames, Managers, ManagersHalf,
#   Parks, Pitching, PitchingPost, Salaries, Schools, SeriesPost, Teams,
#   TeamsFranchises, TeamsHalf

# Download one CSV file and parse it into a data frame.
#   fileName: name of the CSV file, e.g. "Batting.csv"
#   baseUrl:  location the file is served from
# Returns the data frame produced by read.csv().
readRemoteCsv <- function(fileName, baseUrl = dataBaseUrl) {
  csvConnection <- textConnection(getURL(paste0(baseUrl, fileName)))
  # textConnection() returns an already-open connection that read.csv() will not close,
  # so close it explicitly (also when parsing fails) to avoid leaking it
  on.exit(close(csvConnection), add = TRUE)
  read.csv(csvConnection)
}

# Retrieve the data we need
battingData <- readRemoteCsv("Batting.csv")    # Question 3a, 3b, 5
masterData <- readRemoteCsv("Master.csv")      # Question 3a, 3b, 4, 5, 6

# Remove the download helpers so only the two data frames remain in the workspace
rm(dataBaseUrl, readRemoteCsv)

Shape the data

# Left-join the master data onto the batting data by player ID, then drop the two source
# data frames because everything needed from here on lives in questionData
questionData <- merge(battingData, masterData, by = "playerID", all.x = TRUE)
rm(battingData, masterData)

# Keep only rows with at least one at bat (which() also discards rows where AB is NA)
questionData <- questionData[which(questionData$AB > 0), , drop = FALSE]

# Player age for the season, from years only: batting rows are per-season summaries,
# so a finer-grained age would add nothing
questionData$playerAge <- with(questionData, yearID - birthYear)

# Drop rows whose age is unknown because the birth year was NA
questionData <- questionData[!is.na(questionData$playerAge), , drop = FALSE]

# Quick look at the distribution of player ages
summary(questionData$playerAge)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   16.00   25.00   28.00   28.22   31.00   59.00
# Batting value = hits divided by at bats
questionData$battingValue <- with(questionData, as.numeric(H) / as.numeric(AB))

# Quick look at the batting values just calculated
summary(questionData$battingValue)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1471  0.2308  0.2089  0.2743  1.0000
# Drop rows where the batting value could not be calculated, and treat any player aged
# 50 or older as an outlier and drop those rows too
questionData <- questionData[
  which(!is.na(questionData$battingValue) & questionData$playerAge < 50),
  ,
  drop = FALSE
]

Draw the visualization

# Box plot of batting value grouped by player age, drawn without outlier points
chart <- hcboxplot(
  x = questionData$battingValue,
  var = questionData$playerAge,
  name = "Batting Value",
  outliers = FALSE
)

# Apply title, zoom setting, theme and source credit one step at a time
chart <- hc_title(chart, text = "Player Batting Value by Age")
chart <- hc_chart(chart, zoomType = "xy")
chart <- hc_add_theme(chart, hc_theme_elementary())
chart <- hc_credits(
  chart,
  enabled = TRUE,
  text = "Source: Sean Lahman, Baseball Database Website",
  href = "http://www.seanlahman.com/baseball-archive/statistics/",
  style = list(fontSize = "10px")
)

# Render the chart
chart

Display the total processing time

# Print a label, then stop the timer started by tic() and report the elapsed time
print("Total Processing time: ")
## [1] "Total Processing time: "
toc()
## 5.168 sec elapsed