Created
January 2, 2017 18:17
-
-
Save darh78/a8a30e6d03c74c5cd0d1fe5911fcab7f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#title: Overall rank of MLB teams in regular season | |
#output: html_document | |
# author: D. Hernández | |
#tags: [R, baseball, lahman, MLB, postseason] | |
# Loading packages | |
library(Lahman) # Data source | |
library(dplyr) # To ease the data manipulation | |
### Data preparation | |
# getting the data from Lahman package: Since 1995 to 2015 | |
Teams_data <- tbl_df(Teams) %>% | |
select(yearID, name, teamIDBR, W, L, DivWin, WCWin, WSWin) %>% | |
filter(yearID >= 1994) | |
#Let's add a variable with the winning percentage, `WLP`, and order the tibble by `yearID` and `WLP` (in descending order). | |
Teams_data <- mutate(Teams_data, | |
WLP = W/(W+L)) %>% | |
arrange(yearID, desc(WLP)) | |
# Now that `Teams_data` is ordered, let's add a new variable with the overall rank (called `OverallRank`) for each team on each regular season, defining number 1 to the team with the best `WLP`, 2 for the team with the best second `WLP` and so on. | |
Teams_data <- mutate(Teams_data, | |
OverallRank = ave(WLP, yearID, FUN = seq_along)) | |
# By the time of building this article, the R Lahman Package (in version 5.0-0) did not have the 2016 data. So, I had to build an Excel file to get the 2016 season's info [baseball-reference](http://www.baseball-reference.com/leagues/MLB/2016-standings.shtml). Then, bind this 2016 data to the `Teams_data` tibble. | |
# get 2016 data from baseball-reference | |
library(readxl) # To read Excel files into R | |
T2016 <- tbl_df(read_excel("C:/Users/1328/Documents/R projects/darh78.github.io/data/T2016.xlsx")) | |
# binding both tibbles | |
Teams_data <- rbind(Teams_data, T2016) | |
#During this period (1994-2016), four MLB franchises changed their names at least once, so I standardized each team ID with the ID of MLB franchises that played the 2016 season. | |
Teams_data <- mutate(Teams_data, | |
FranchID = ifelse(teamIDBR == "ANA" | teamIDBR == "CAL" | teamIDBR == "LAA", "LAA", # Anaheim Angels | |
ifelse(teamIDBR == "FLA" | teamIDBR == "MIA", "MIA", # Miami Marlins | |
ifelse(teamIDBR == "MON" | teamIDBR == "WAS" | teamIDBR == "WSN", "WSN", # Washington Nationals | |
ifelse(teamIDBR == "TBD" | teamIDBR == "TBR", "TBR", # Tampa Bay Rays | |
teamIDBR) | |
) | |
) | |
) | |
) | |
### Data cleaning | |
#Additionally, to better prepare `Teams_data` for the analysis, let's modify some of the `classes` of the variables and give better names to some of them: | |
Teams_data$WLP <- as.numeric(Teams_data$WLP) | |
Teams_data$yearID <- as.integer(Teams_data$yearID) | |
Teams_data$W <- as.integer(Teams_data$W) | |
Teams_data$L <- as.integer(Teams_data$L) | |
Teams_data$OverallRank <- as.integer(Teams_data$OverallRank) | |
Teams_data <- Teams_data %>% rename(Season = yearID, Team = name) | |
#Resulting on a tibble with this preview: | |
Teams_data %>% slice(c(1, 100, 200, 300, 400, 500)) | |
### Exploratory Data Analysis | |
#Before analyzing or ploting anything, let's see a summary of the variables in the tibble `TeamsStd`: | |
summary(Teams_data) | |
#From this summary we can extract that the maximum number of wins by a team in a single season is 116, while the maximum number of losses is 119. Let's check who were those teams: | |
knitr::kable(Teams_data %>% | |
filter(W == 116 | L == 119) %>% | |
select(Season, Team, W, L, WLP, OverallRank)) | |
#These two records correspond also to the minumim and maximum `WLP` registered in this period. | |
#Now, let's visualize the winning percentage of each team between 1994 and 2016. | |
library(ggplot2) # To visualize results | |
library(ggthemes)# To format vizes | |
Linegraph <- ggplot(Teams_data, aes(x = Season, y = OverallRank)) + | |
geom_point(aes(color = OverallRank), size = 1.35) + #color = "cadetblue3", | |
scale_colour_gradient2(low = "darkgreen", mid = "gold", high = "red", midpoint = 10) + | |
geom_smooth(color = "black", size = .7, se = F) + | |
guides(color = FALSE) + | |
scale_y_reverse(breaks = c(1,30)) + | |
facet_wrap(~ FranchID, ncol = 5) + | |
labs(title = "Overall rank of MLB teams in regular season", | |
subtitle = "based on WLP in the Wild Card Era (since 1994)", | |
caption = "Data from Lahman R package 5.0-0")+ | |
theme_tufte() + | |
theme(axis.ticks = element_blank(), | |
panel.grid.major.y = element_line(colour = "gray86", linetype = "dotted", size = 0.1), | |
panel.grid.minor.y = element_blank(), | |
strip.text.x = element_text(size = 10, family = "serif", face = "bold", colour = "black", angle = 0), | |
axis.text.x=element_text(angle = 90, hjust = 0, vjust = 1, size = 7), | |
axis.text.y=element_text(angle = 0, hjust = 1, vjust = 0.5, size = 6)) + | |
scale_x_continuous(breaks = c(1994, 1998, 2002, 2006, 2010, 2014)) | |
Linegraph | |
#Now, let's see how teams clinched the postseason and who of them became World Champs based on their overall rank during regular season. | |
#First let's add a new variable, called `clinch` with this info. | |
Teams_data <- mutate(Teams_data, | |
clinch = ifelse((DivWin == "Y" | WCWin == "Y") & WSWin == "N", "Clinched Playoff", | |
ifelse(WSWin == "Y", "World Champion", NA))) | |
#And let's add those results to the previous sparkline plot. | |
Linegraph_ps <- ggplot(Teams_data, aes(x = Season, y = OverallRank)) + | |
geom_line(color = "cadetblue3", size = .8) + | |
geom_point(aes(shape = clinch, color = clinch, fill = clinch)) + | |
scale_color_manual(name = "Team's performance", | |
breaks = c("Clinched Playoff", "World Champion"), | |
values = c("darkblue", "red3"), | |
labels = c("Clinched Playoff", "World Champion")) + | |
scale_shape_manual(name = "Team's performance", | |
breaks = c("Clinched Playoff", "World Champion"), | |
values = c(21, 18), | |
labels = c("Clinched Playoff", "World Champion")) + | |
scale_fill_manual(name = "Team's performance", | |
breaks = c("Clinched Playoff", "World Champion"), | |
values = c("white", "red3"), | |
labels = c("Clinched Playoff", "World Champion")) + | |
scale_y_reverse(breaks = c(1,30)) + | |
facet_wrap(~ FranchID, ncol = 5) + | |
labs(title = "Overall rank of MLB teams in regular season", | |
subtitle = "based on WLP in Wild Card Era (since 1995)", | |
caption = "Data from Lahman R package 5.0-0")+ | |
theme_tufte() + | |
theme(axis.ticks = element_blank(), | |
panel.grid.major.y = element_line(colour = "gray86", linetype = "dotted", size = 0.1), | |
panel.grid.minor.y = element_blank(), | |
strip.text.x = element_text(size = 10, family = "serif", face = "bold", colour = "black", angle = 0), | |
axis.text.x=element_text(angle = 90, hjust = -2, vjust = 1, size = 7), | |
axis.text.y=element_text(angle = 0, hjust = 1, vjust = 0.5, size = 6)) + | |
scale_x_continuous(breaks = c(1994, 1998, 2002, 2006, 2010, 2014)) | |
Linegraph_ps |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment