Last active
May 16, 2021 22:37
-
-
Save ChiBearsStats/dac3266037797032a23f38fd9d64d6a8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(nflscrapR) | |
library(tidyverse) | |
library(ggrepel) | |
library(ggimage) | |
#Load the PBP Data from 2009-2018 (based on Ben Baldwin's post) | |
first <- 2009 #first season to grab. min available=2009 | |
last <- 2018 # most recent season | |
datalist = list() | |
for (yr in first:last) { | |
pbp <- read_csv(url(paste0("https://github.com/ryurko/nflscrapR-data/raw/master/play_by_play_data/regular_season/reg_pbp_", yr, ".csv"))) | |
games <- read_csv(url(paste0("https://raw.githubusercontent.com/ryurko/nflscrapR-data/master/games_data/regular_season/reg_games_", yr, ".csv"))) | |
pbp <- pbp %>% inner_join(games %>% distinct(game_id, week, season)) %>% select(-fumble_recovery_2_yards) | |
datalist[[yr]] <- pbp # add it to your list | |
} | |
pbp_all <- dplyr::bind_rows(datalist) | |
#Fix teams with inconsistent acronyms | |
pbp_all <- pbp_all %>% | |
mutate_at(vars(home_team, away_team, posteam, defteam), funs(case_when( | |
. %in% "JAX" ~ "JAC", | |
. %in% "STL" ~ "LA", | |
. %in% "SD" ~ "LAC", | |
TRUE ~ . | |
))) | |
#Save the data for future use | |
saveRDS(pbp_all, file="pbpdata2009to2019.rds") | |
pbp_all <- readRDS("pbpdata2009to2019.rds") | |
#Using Ben Baldwin's method to clean the data. Some columns are unneccesary to create, but I just standardly copy+paste it all | |
pbp_all_rp <- pbp_all %>% | |
filter(!is.na(epa), !is.na(posteam), play_type=="no_play" | play_type=="pass" | play_type=="run") %>% | |
mutate( | |
pass = if_else(str_detect(desc, "(pass)|(sacked)|(scramble)"), 1, 0), | |
rush = if_else(str_detect(desc, "(left end)|(left tackle)|(left guard)|(up the middle)|(right guard)|(right tackle)|(right end)") & pass == 0, 1, 0), | |
success = ifelse(epa>0, 1 , 0), | |
passer_player_name = ifelse(play_type == "no_play" & pass == 1, | |
str_extract(desc, "(?<=\\s)[A-Z][a-z]*\\.\\s?[A-Z][A-z]+(\\s(I{2,3})|(IV))?(?=\\s((pass)|(sack)|(scramble)))"), | |
passer_player_name), | |
receiver_player_name = ifelse(play_type == "no_play" & str_detect(desc, "pass"), | |
str_extract(desc, "(?<=to\\s)[A-Z][a-z]*\\.\\s?[A-Z][A-z]+(\\s(I{2,3})|(IV))?"), | |
receiver_player_name), | |
rusher_player_name = ifelse(play_type == "no_play" & rush == 1, | |
str_extract(desc, "(?<=\\s)[A-Z][a-z]*\\.\\s?[A-Z][A-z]+(\\s(I{2,3})|(IV))?(?=\\s((left end)|(left tackle)|(left guard)| (up the middle)|(right guard)|(right tackle)|(right end)))"), | |
rusher_player_name), | |
name = ifelse(!is.na(passer_player_name), passer_player_name, rusher_player_name), | |
yards_gained=ifelse(play_type=="no_play",NA,yards_gained), | |
play=1 | |
) %>% | |
filter(pass==1 | rush==1) | |
#Let's select only the columns of interest | |
final_pbp_data <- pbp_all_rp %>% select(c(posteam, season, down, first_down_rush, first_down_pass, first_down_penalty, ydstogo, play_type, desc)) | |
#First let's calculate the expected third down conversion rate per yards to go for all of the teams | |
final_pbp_data_expected <- final_pbp_data %>% | |
filter(down == 3) %>% | |
group_by(ydstogo) %>% | |
summarise(totalthirddowns = n(), | |
firstdowns = sum(first_down_rush, first_down_pass, first_down_penalty), | |
expected_success_rate = (firstdowns/totalthirddowns)) | |
final_pbp_data_expected %>% filter(ydstogo > 9) | |
#Expectedly, the sample size of third downs plummets as you get farther away from the sticks. Let's bin so the avgs don't get totally wonky. | |
# E.G. Third and 19 is probably not easier to convert than Third and 18, just a small sample | |
final_pbp_data_expected <- final_pbp_data %>% | |
filter(down == 3) %>% | |
mutate(binnedydstogo = ifelse(ydstogo > 24, 25, ydstogo)) %>% | |
group_by(binnedydstogo) %>% | |
summarise(totalthirddowns = n(), | |
firstdowns = sum(first_down_rush, first_down_pass, first_down_penalty), | |
expected_success_rate = (firstdowns/totalthirddowns)) | |
#Sweet! Now we have our expected conversion rates for each distance. | |
ggplot(final_pbp_data_expected, aes(x = binnedydstogo, y = expected_success_rate)) + | |
geom_point() + | |
ylab("Conversion Rate") + | |
xlab("Yards to Go") + | |
labs(title = "3rd Down Conversion Rates At Different Yards To Go", subtitle = "@ChiBearsStats") + | |
annotate("text", x = 20, y = .4, label = "Data courtesy: nflscrapR \n Years 2009-2018", size = 4, color = "black") | |
ggsave('thirddownconversionratesytg.png', dpi=1000) | |
#Now let's see how each team ACTUALLY did | |
final_pbp_data_actual <- final_pbp_data %>% | |
filter(down == 3) %>% | |
mutate(binnedydstogo = ifelse(ydstogo > 24, 25, ydstogo)) %>% | |
group_by(posteam, season, binnedydstogo) %>% | |
summarise(teamthirddowns = n(), | |
teamfirstdowns = sum(first_down_rush, first_down_pass, first_down_penalty), | |
team_success_rate = (teamfirstdowns/teamthirddowns)) | |
#Let's bring over those expected values | |
final_pbp_data_actual <- final_pbp_data_actual %>% inner_join(final_pbp_data_expected, by = "binnedydstogo") | |
#Now we can calculate how teams did relative to their expected value | |
final_pbp_data_actual <- final_pbp_data_actual %>% | |
mutate(expectedfirstdowns = teamthirddowns*expected_success_rate) %>% | |
group_by(posteam, season) %>% | |
summarise(sum(teamfirstdowns), | |
sum(teamthirddowns), | |
sum(expectedfirstdowns), | |
difference = (sum(teamfirstdowns)-sum(expectedfirstdowns)), | |
difference_percentage = ((sum(teamfirstdowns)-sum(expectedfirstdowns))/sum(expectedfirstdowns))*100) | |
#Let's quickly calculate offensive EPA/play | |
pbp_offenses <- pbp_all_rp %>% | |
group_by(posteam, season) %>% | |
summarise(epa_per_play = (sum(epa)/n())) | |
#Join it with the previous data to create our final graphing data | |
final_graph_data <- final_pbp_data_actual %>% inner_join(pbp_offenses, by = c("posteam", "season")) | |
#Let's take a look at the r squared so we can add it to the graph | |
success_lm = lm(difference_percentage ~ epa_per_play, data=final_graph_data) | |
rsquared <- summary(success_lm)$r.squared | |
rsquared <- signif(rsquared, digits = 2) | |
#Finally! We have our final graph data | |
ggplot(final_graph_data, aes(x=difference_percentage, y=epa_per_play)) + | |
geom_point() + | |
geom_text_repel( | |
data = subset(final_graph_data, (epa_per_play > .2) | |
| difference_percentage > 25 | |
| epa_per_play < -.21 | |
| difference_percentage < -25), | |
aes(label = paste(posteam, substr(season, 3, 4)))) + | |
annotate("text", x = 20, y= -.15,label = paste("R^2=", rsquared, sep = " "), size = 3.5, color = "black") + | |
annotate("text", x = 20, y= -.2, angle = 0,label = "Data courtesy: nflscrapR, Years 2009-2018", size = 4, color = "black") + | |
annotate("text", x = -10, y= .3, angle = 0,label = "<- Worse Conversion Rate than Expected", size = 2.5, color = "blue") + | |
annotate("text", x = 10, y= .3, angle = 0,label = "Better Conversion Rate than Expected ->", size = 2.5, color = "blue") + | |
ylab("EPA per Play (All Plays)") + | |
xlab("Difference of Conversion Rate vs Expected Based On Yards To Go (%)") + | |
scale_x_continuous(breaks = c(-30, -20, -10, 0, 10, 20, 30)) + | |
labs(title = "Do Better Offenses Convert More Third Downs? (Controlling for Distance)", subtitle = "@ChiBearsStats") + | |
geom_hline(yintercept = 0, lty = 4, color = "black") + | |
geom_vline(xintercept = 0, lty = 4, color = "black") + | |
geom_smooth(lty = 2) | |
ggsave('epavsadjthirddowns.png', dpi=1000) | |
#Hmmmm it looks like 3rd down expectancy is actually correlated with offense skill, not based solely on distance to go. Let's make sure league wide trends over time aren't skewing the numbers here | |
season_values <- final_pbp_data %>% | |
filter(down == 3) %>% | |
mutate(binnedydstogo = ifelse(ydstogo > 24, 25, ydstogo)) %>% | |
group_by(binnedydstogo, season) %>% | |
summarise(totalthirddowns = n(), | |
firstdowns = sum(first_down_rush, first_down_pass, first_down_penalty), | |
expected_success_rate = (firstdowns/totalthirddowns)) | |
final_pbp_data_actual_by_season <- final_pbp_data %>% | |
filter(down == 3) %>% | |
mutate(binnedydstogo = ifelse(ydstogo > 24, 25, ydstogo)) %>% | |
group_by(posteam, season, binnedydstogo) %>% | |
summarise(teamthirddowns = n(), | |
teamfirstdowns = sum(first_down_rush, first_down_pass, first_down_penalty), | |
team_success_rate = (teamfirstdowns/teamthirddowns)) %>% | |
inner_join(season_values, by = c("season", "binnedydstogo")) %>% | |
mutate(expectedfirstdowns = teamthirddowns*expected_success_rate) %>% | |
group_by(posteam, season) %>% | |
summarise(sum(teamfirstdowns), | |
sum(teamthirddowns), | |
sum(expectedfirstdowns), | |
difference = (sum(teamfirstdowns)-sum(expectedfirstdowns)), | |
difference_percentage = ((sum(teamfirstdowns)-sum(expectedfirstdowns))/sum(expectedfirstdowns))*100) %>% | |
inner_join(pbp_offenses, by = c("posteam", "season")) | |
successlm_byseason = lm(difference_percentage ~ epa_per_play, data=final_pbp_data_actual_by_season) | |
rsquared_byseason <- summary(successlm_byseason)$r.squared | |
rsquared_byseason <- signif(rsquared_byseason, digits = 2) | |
#Graph | |
ggplot(final_pbp_data_actual_by_season, aes(x=difference_percentage, y=epa_per_play)) + | |
geom_point() + | |
geom_text_repel( | |
data = subset(final_pbp_data_actual_by_season, (epa_per_play > .2) | |
| difference_percentage > 25 | |
| epa_per_play < -.21 | |
| difference_percentage < -25), | |
aes(label = paste(posteam, substr(season, 3, 4)))) + | |
annotate("text", x = 20, y= -.15,label = paste("R^2=", rsquared_byseason, sep = " "), size = 3.5, color = "black") + | |
annotate("text", x = 20, y= -.2, angle = 0,label = "Data courtesy: nflscrapR, Years 2009-2018", size = 4, color = "black") + | |
annotate("text", x = -10, y= .3, angle = 0,label = "<- Worse Conversion Rate than Expected", size = 2.5, color = "blue") + | |
annotate("text", x = 10, y= .3, angle = 0,label = "Better Conversion Rate than Expected ->", size = 2.5, color = "blue") + | |
ylab("EPA per Play (All Plays)") + | |
xlab("Difference of Conversion Rate vs Expected Based On Yards To Go (%)") + | |
scale_x_continuous(breaks = c(-30, -20, -10, 0, 10, 20, 30)) + | |
labs(title = "Do Better Offenses Convert More Third Downs? (Controlling for Distance)", subtitle = "@ChiBearsStats") + | |
geom_hline(yintercept = 0, lty = 4, color = "black") + | |
geom_vline(xintercept = 0, lty = 4, color = "black") + | |
geom_smooth(lty = 2) | |
ggsave('epavsadjthirddownsperseason.png', dpi=1000) | |
#No significant difference between the two | |
#Maybe Points per Game doesn't have the same relationship? | |
#Let's write a function to get each team's points per game using nflscrapR's season_games function | |
get_ppg <- function(year){ | |
ppg_stats <- season_games(year) | |
ppg_stats1 <- ppg_stats %>% | |
mutate(team = home) %>% | |
group_by(team) %>% | |
summarise(points = sum(homescore)) | |
ppg_stats2 <- ppg_stats %>% | |
mutate(team = away) %>% | |
group_by(team) %>% | |
summarise(points = sum(awayscore)) | |
ppg_stats3 <- bind_rows(ppg_stats1, ppg_stats2) %>% | |
group_by(team) %>% | |
summarise(total_points = sum(points)) %>% | |
mutate(season = year) | |
return(ppg_stats3) | |
} | |
#I'm sure there's a quicker way to do this, but I'll do this way for now and learn a quicker way as I go | |
ppg_2018 <- get_ppg(2018) | |
ppg_2017 <- get_ppg(2017) | |
ppg_2016 <- get_ppg(2016) | |
ppg_2015 <- get_ppg(2015) | |
ppg_2014 <- get_ppg(2014) | |
ppg_2013 <- get_ppg(2013) | |
ppg_2012 <- get_ppg(2012) | |
ppg_2011 <- get_ppg(2011) | |
ppg_2010 <- get_ppg(2010) | |
ppg_2009 <- get_ppg(2009) | |
ppg <- bind_rows(ppg_2009, ppg_2010, ppg_2011, ppg_2012, ppg_2013, ppg_2014, ppg_2015, ppg_2016, ppg_2017, ppg_2018) %>% | |
mutate(teamppg = total_points/16) %>% | |
mutate_at(vars(team), funs(case_when( | |
. %in% "JAX" ~ "JAC", | |
. %in% "STL" ~ "LA", | |
. %in% "SD" ~ "LAC", | |
TRUE ~ . | |
))) | |
#We'll change team to posteam to more easily join it with the earlier data set | |
ppg <- ppg %>% rename(posteam = team) | |
final_graph_data_by_epa <- final_pbp_data_actual %>% inner_join(ppg, by = c("posteam", "season")) | |
successlm_byepa = lm(difference_percentage ~ teamppg, data=final_graph_data_by_epa) | |
rsquared_byepa <- summary(successlm_byepa)$r.squared | |
rsquared_byepa <- signif(rsquared_byepa, digits = 2) | |
#Graph | |
ggplot(final_graph_data_by_epa, aes(x=difference_percentage, y=teamppg)) + | |
geom_point() + | |
geom_text_repel( | |
data = subset(final_graph_data_by_epa, (teamppg > 34) | |
| difference_percentage > 25 | |
| teamppg < 12.5 | |
| difference_percentage < -25), | |
aes(label = paste(posteam, substr(season, 3, 4))), | |
nudge_y = -1.5) + | |
annotate("text", x = 20, y= 15,label = paste("R^2=", rsquared_byepa, sep = " "), size = 3.5, color = "black") + | |
annotate("text", x = 20, y= 10.5, angle = 0,label = "Data courtesy: nflscrapR, Years 2009-2018", size = 4, color = "black") + | |
annotate("text", x = -10, y= 37.5, angle = 0,label = "<- Worse Conversion Rate than Expected", size = 2.5, color = "blue") + | |
annotate("text", x = 10, y= 37.5, angle = 0,label = "Better Conversion Rate than Expected ->", size = 2.5, color = "blue") + | |
ylab("Points Per Game") + | |
xlab("Difference of Conversion Rate vs Expected Based On Yards To Go (%)") + | |
scale_x_continuous(breaks = c(-30, -20, -10, 0, 10, 20, 30)) + | |
scale_y_continuous(breaks = c(10, 15, 20, 25, 30, 35)) + | |
labs(title = "Do Better Offenses Convert More Third Downs? (Controlling for Distance)", subtitle = "@ChiBearsStats") + | |
geom_hline(yintercept = mean(final_graph_data_by_epa$teamppg), lty = 4, color = "black") + | |
geom_vline(xintercept = 0, lty = 4, color = "black") + | |
geom_smooth(lty = 2) | |
ggsave('epavsadjthirddownsperppg.png', dpi=1000) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment