Created
December 31, 2019 13:19
-
-
Save bayesball/8d6719dea05a5ea4e8d5930bdfa4a6cf to your computer and use it in GitHub Desktop.
# R work for the "Why are baseball games so long?" post
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse)
library(CalledStrike)
library(broom)
######################################
# function retro_work()
######################################
# Summarizes a Retrosheet play-by-play data frame for one season
# to one row per game.
#
# d: data frame with (at least) the columns GAME_ID, PITCH_SEQ_TX
#    and INN_CT
#
# Returns one row for every game whose largest INN_CT is exactly 9:
#   GAME_ID   - game identifier, converted to character
#   PA        - number of rows of d that belong to the game
#   N_Pitches - total pitch count over those rows
#   maxINNING - largest INN_CT in the game (always 9 after the filter)
#
# NOTE(review): PA is a plain row count, so it equals the number of
# plate appearances only if d holds one row per plate appearance --
# confirm against the data.
retro_work <- function(d) {
  per_game <- d %>%
    # a row's pitch count is the length of its pitch sequence once the
    # symbols . > 1 2 3 N + * have been removed
    mutate(pitch_count = nchar(gsub("[.>123N+*]", "", PITCH_SEQ_TX))) %>%
    group_by(GAME_ID) %>%
    summarize(
      PA = n(),
      N_Pitches = sum(pitch_count),
      maxINNING = max(INN_CT)
    )
  # keep only the games that ended in the 9th inning
  nine_inning <- filter(per_game, maxINNING == 9)
  nine_inning$GAME_ID <- as.character(nine_inning$GAME_ID)
  nine_inning
}
########################################
# function one_season()
########################################
# Attaches the game duration from the Retrosheet game logs to the
# per-game summary produced by retro_work().
#
# season:      season number; used to build the game log file name
#              gl<season>.txt and stored in the Season column
# retroS:      output of retro_work() (needs a GAME_ID column)
# gamelog_dir: directory holding the gl<season>.txt game logs and
#              game_log_header.csv; defaults to the location this
#              script has always read from
#
# The game log file is read without a header row; its column names
# are taken from game_log_header.csv.  GAME_ID is rebuilt by pasting
# HomeTeam, Date and DoubleHeader together so it can be matched with
# the play-by-play GAME_ID.
#
# Returns retroS restricted to the games found in the game log
# (inner join on GAME_ID), with the added columns Duration and Season.
one_season <- function(
    season, retroS,
    gamelog_dir = "~/Dropbox/Google Drive/gamelogs/gamelogs") {
  gamelog_file <- file.path(gamelog_dir, paste0("gl", season, ".txt"))
  header_file <- file.path(gamelog_dir, "game_log_header.csv")

  gdata <- read_csv(gamelog_file, col_names = FALSE)
  header <- read_csv(header_file)
  names(gdata) <- names(header)

  gdata <- mutate(gdata, GAME_ID = paste0(HomeTeam, Date, DoubleHeader))

  retroS <- inner_join(
    retroS,
    select(gdata, GAME_ID, Duration),
    by = "GAME_ID"
  )
  retroS$Season <- season
  retroS
}
# Example for one season: load the 2019 Retrosheet play-by-play data
# from disk, summarize it by game with retro_work(), then attach the
# game durations with one_season().
# (the .Rdata file presumably supplies the object d2019 used below)
load("~/Dropbox/Google Drive/Retrosheet/pbp.2019.Rdata")
S2019 <- one_season(2019, retro_work(d2019))
# The same steps were repeated for every season from 2000 through
# 2019, and the per-season data frames were row-bound into the data
# frame Sall that the rest of this script uses.
# plot_season_mean()
# Plots the per-season mean of a game-level quantity against season.
#
# data:    data frame with a Season column and the columns used in value
# value:   unquoted column name or expression (e.g. N_Pitches / PA)
#          that is averaged within each season
# y_label: label for the vertical axis
# title:   plot title
#
# Returns the ggplot object: red points for the season means plus a
# loess smooth without a confidence band.  increasefont() and
# centertitle() are not defined in this file (presumably they come
# from the CalledStrike package).
plot_season_mean <- function(data, value, y_label, title) {
  data %>%
    group_by(Season) %>%
    summarize(M = mean({{ value }})) %>%
    ggplot(aes(Season, M)) +
    geom_point(size = 4, color = "red") +
    geom_smooth(method = "loess", se = FALSE) +
    increasefont() +
    ylab(y_label) +
    ggtitle(title) +
    centertitle()
}

# graph of the mean duration of the games against season
plot_season_mean(
  Sall, Duration,
  "Mean Duration (Min)",
  "Mean Length of a 9 Inning Game"
)

# graph of the mean number of PAs against season
plot_season_mean(
  Sall, PA,
  "Mean PA",
  "Mean Number of PA of a 9 Inning Game"
)

# graph of the mean number of pitches per PA
plot_season_mean(
  Sall, N_Pitches / PA,
  "Mean Pitches / PA",
  "Mean Number of Pitches per PA"
)

# graph of the mean number of pitches per game
plot_season_mean(
  Sall, N_Pitches,
  "Mean Pitches per Game",
  "Mean Number of Pitches per Game"
)
# Fit a separate linear regression of Duration on N_Pitches for each
# season.  tidy() turns each fit into a data frame of coefficient rows
# (the code below reads its term and estimate columns), and the result
# stays grouped by Season.
# group_modify() is used in place of the superseded do().
regressions <- Sall %>%
  group_by(Season) %>%
  group_modify(function(season_games, season_key) {
    tidy(lm(Duration ~ N_Pitches, data = season_games))
  })
# Plot each season's fitted N_Pitches slope against season: keep only
# the slope rows of the regression table, then draw red points with a
# loess smooth (no confidence band).
ggplot(
  filter(regressions, term == "N_Pitches"),
  aes(Season, estimate)
) +
  geom_point(size = 4, color = "red") +
  geom_smooth(method = "loess", se = FALSE) +
  increasefont() +
  labs(
    y = "Mean Time Per Pitch (Min)",
    title = "Mean Time Per Pitch"
  ) +
  centertitle()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment