At What Age Do Players Provide the Most Value? After how many years in the league are players most productive, and when do their skills start to decline?
For this question we will use the value the player provided as a batter. The value calculation is simply the number of successful batting attempts divided by the number of batting attempts:
Hits / At Bats = Batting Value
# Clear the environment so that objects left over from an earlier run cannot
# leak into this analysis.
# NOTE(review): when this file is run interactively this also deletes every
# object in the user's workspace; running it in a fresh R session gives the
# same isolation without that side effect.
rm(list = ls())
We will use the following libraries for this analysis:
Package Name | Description / Reason for use |
---|---|
pacman | Package manager - automatically installs the packages if we need them |
sqldf | Ability to perform simple SQL queries on dataframes |
highcharter | Produce Highcharts / D3 graphs |
knitr | Transform analysis into html for viewing |
SparkR | Connections to retrieve data from our Spark instance |
tictoc | Ability to measure performance of the environment |
RCurl | Retrieve remote files |
# Load the necessary libraries, using pacman as a package manager.
# requireNamespace() only checks whether pacman is installed (it returns FALSE
# instead of erroring when it is not), so it is the right tool for an
# "install if missing" guard. pacman is called through `::` below, so it does
# not need to be attached to the search path.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
# p_load() installs any of the listed packages that are missing and then
# attaches all of them
pacman::p_load(sqldf, highcharter, knitr, SparkR, tictoc, RCurl)
# Start the timer; the matching toc() call at the end of the analysis reports
# the total elapsed time
tic()

# Every project data file lives under the same S3 prefix. Only the batting and
# master tables are read for this question, so only those two locations are
# defined rather than one variable per file in the bucket.
dataLocation <- 'https://s3.amazonaws.com/cs498ccafinalproject/'
battingDataFile <- paste0(dataLocation, 'Batting.csv')
masterDataFile <- paste0(dataLocation, 'Master.csv')

# Retrieve the data we need.
# read.csv(text = ...) parses the downloaded text through a connection that it
# opens and closes itself; wrapping the text in textConnection() by hand leaves
# that connection open after the read.
battingData <- read.csv(text = getURL(battingDataFile)) # Question 3a, 3b, 5
masterData <- read.csv(text = getURL(masterDataFile)) # Question 3a, 3b, 4, 5, 6

# Remove the data location variables now that both data frames are loaded
rm(dataLocation, battingDataFile, masterDataFile)
# Join the master data onto the batting data by playerID. all.x = TRUE keeps
# every batting row even when no master row matches it.
questionData <- merge(battingData, masterData, by = 'playerID', all.x = TRUE)
# Calculate the player's age for each season. Only the year is used because
# the batting figures are already summarized per playing year.
questionData$playerAge <- questionData$yearID - questionData$birthYear
# Keep only the rows where the player had at least one at bat and where the
# age is known (it is NA whenever the birth year is missing)
questionData <- subset(questionData, AB > 0 & !is.na(playerAge))
# Take a brief look at the distribution of player ages
summary(questionData$playerAge)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16.00 25.00 28.00 28.22 31.00 59.00
# The source tables are no longer needed once they have been merged
rm(masterData, battingData)
# Calculate the batting value: hits divided by at bats
questionData$battingValue <- with(questionData, as.numeric(H) / as.numeric(AB))
# Take a brief look at the batting values we just calculated
summary(questionData$battingValue)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.1471 0.2308 0.2089 0.2743 1.0000
# Drop the rows whose batting value could not be calculated, and treat any
# player aged 50 or older as an outlier and drop those rows as well
questionData <- subset(questionData, !is.na(battingValue) & playerAge < 50)
# Build a box plot of batting value grouped by player age, then apply the
# title, zoom setting, theme, and source credit one step at a time
chart <- hcboxplot(
  x = questionData$battingValue,
  var = questionData$playerAge,
  name = 'Batting Value',
  outliers = FALSE
)
chart <- hc_title(chart, text = 'Player Batting Value by Age')
chart <- hc_chart(chart, zoomType = 'xy')
chart <- hc_add_theme(chart, hc_theme_elementary())
chart <- hc_credits(
  chart,
  enabled = TRUE,
  text = "Source: Sean Lahman, Baseball Database Website",
  href = "http://www.seanlahman.com/baseball-archive/statistics/",
  style = list(fontSize = '10px')
)
# Evaluating the object at top level renders the chart in the output
chart
# Print a label, then stop the timer started by tic() and report the elapsed time
print('Total Processing time: ')
## [1] "Total Processing time: "
toc()
## 5.168 sec elapsed