Skip to content

Instantly share code, notes, and snippets.

@ChiBearsStats
Last active May 16, 2021 22:37
Show Gist options
  • Save ChiBearsStats/dac3266037797032a23f38fd9d64d6a8 to your computer and use it in GitHub Desktop.
Save ChiBearsStats/dac3266037797032a23f38fd9d64d6a8 to your computer and use it in GitHub Desktop.
library(dplyr)
library(nflscrapR)
library(tidyverse)
library(ggrepel)
library(ggimage)
#Load the PBP Data from 2009-2018 (based on Ben Baldwin's post)
first <- 2009 #first season to grab. min available=2009
last <- 2018 # most recent season
datalist = list()
for (yr in first:last) {
pbp <- read_csv(url(paste0("https://github.com/ryurko/nflscrapR-data/raw/master/play_by_play_data/regular_season/reg_pbp_", yr, ".csv")))
games <- read_csv(url(paste0("https://raw.githubusercontent.com/ryurko/nflscrapR-data/master/games_data/regular_season/reg_games_", yr, ".csv")))
pbp <- pbp %>% inner_join(games %>% distinct(game_id, week, season)) %>% select(-fumble_recovery_2_yards)
datalist[[yr]] <- pbp # add it to your list
}
pbp_all <- dplyr::bind_rows(datalist)
#Fix teams with inconsistent acronyms
pbp_all <- pbp_all %>%
mutate_at(vars(home_team, away_team, posteam, defteam), funs(case_when(
. %in% "JAX" ~ "JAC",
. %in% "STL" ~ "LA",
. %in% "SD" ~ "LAC",
TRUE ~ .
)))
#Save the data for future use
saveRDS(pbp_all, file="pbpdata2009to2019.rds")
pbp_all <- readRDS("pbpdata2009to2019.rds")
#Using Ben Baldwin's method to clean the data. Some columns are unneccesary to create, but I just standardly copy+paste it all
pbp_all_rp <- pbp_all %>%
filter(!is.na(epa), !is.na(posteam), play_type=="no_play" | play_type=="pass" | play_type=="run") %>%
mutate(
pass = if_else(str_detect(desc, "(pass)|(sacked)|(scramble)"), 1, 0),
rush = if_else(str_detect(desc, "(left end)|(left tackle)|(left guard)|(up the middle)|(right guard)|(right tackle)|(right end)") & pass == 0, 1, 0),
success = ifelse(epa>0, 1 , 0),
passer_player_name = ifelse(play_type == "no_play" & pass == 1,
str_extract(desc, "(?<=\\s)[A-Z][a-z]*\\.\\s?[A-Z][A-z]+(\\s(I{2,3})|(IV))?(?=\\s((pass)|(sack)|(scramble)))"),
passer_player_name),
receiver_player_name = ifelse(play_type == "no_play" & str_detect(desc, "pass"),
str_extract(desc, "(?<=to\\s)[A-Z][a-z]*\\.\\s?[A-Z][A-z]+(\\s(I{2,3})|(IV))?"),
receiver_player_name),
rusher_player_name = ifelse(play_type == "no_play" & rush == 1,
str_extract(desc, "(?<=\\s)[A-Z][a-z]*\\.\\s?[A-Z][A-z]+(\\s(I{2,3})|(IV))?(?=\\s((left end)|(left tackle)|(left guard)| (up the middle)|(right guard)|(right tackle)|(right end)))"),
rusher_player_name),
name = ifelse(!is.na(passer_player_name), passer_player_name, rusher_player_name),
yards_gained=ifelse(play_type=="no_play",NA,yards_gained),
play=1
) %>%
filter(pass==1 | rush==1)
#Let's select only the columns of interest
final_pbp_data <- pbp_all_rp %>% select(c(posteam, season, down, first_down_rush, first_down_pass, first_down_penalty, ydstogo, play_type, desc))
#First let's calculate the expected third down conversion rate per yards to go for all of the teams
final_pbp_data_expected <- final_pbp_data %>%
filter(down == 3) %>%
group_by(ydstogo) %>%
summarise(totalthirddowns = n(),
firstdowns = sum(first_down_rush, first_down_pass, first_down_penalty),
expected_success_rate = (firstdowns/totalthirddowns))
final_pbp_data_expected %>% filter(ydstogo > 9)
#Expectedly, the sample size of third downs plummets as you get farther away from the sticks. Let's bin so the avgs don't get totally wonky.
# E.G. Third and 19 is probably not easier to convert than Third and 18, just a small sample
final_pbp_data_expected <- final_pbp_data %>%
filter(down == 3) %>%
mutate(binnedydstogo = ifelse(ydstogo > 24, 25, ydstogo)) %>%
group_by(binnedydstogo) %>%
summarise(totalthirddowns = n(),
firstdowns = sum(first_down_rush, first_down_pass, first_down_penalty),
expected_success_rate = (firstdowns/totalthirddowns))
#Sweet! Now we have our expected conversion rates for each distance.
ggplot(final_pbp_data_expected, aes(x = binnedydstogo, y = expected_success_rate)) +
geom_point() +
ylab("Conversion Rate") +
xlab("Yards to Go") +
labs(title = "3rd Down Conversion Rates At Different Yards To Go", subtitle = "@ChiBearsStats") +
annotate("text", x = 20, y = .4, label = "Data courtesy: nflscrapR \n Years 2009-2018", size = 4, color = "black")
ggsave('thirddownconversionratesytg.png', dpi=1000)
#Now let's see how each team ACTUALLY did
final_pbp_data_actual <- final_pbp_data %>%
filter(down == 3) %>%
mutate(binnedydstogo = ifelse(ydstogo > 24, 25, ydstogo)) %>%
group_by(posteam, season, binnedydstogo) %>%
summarise(teamthirddowns = n(),
teamfirstdowns = sum(first_down_rush, first_down_pass, first_down_penalty),
team_success_rate = (teamfirstdowns/teamthirddowns))
#Let's bring over those expected values
final_pbp_data_actual <- final_pbp_data_actual %>% inner_join(final_pbp_data_expected, by = "binnedydstogo")
#Now we can calculate how teams did relative to their expected value
final_pbp_data_actual <- final_pbp_data_actual %>%
mutate(expectedfirstdowns = teamthirddowns*expected_success_rate) %>%
group_by(posteam, season) %>%
summarise(sum(teamfirstdowns),
sum(teamthirddowns),
sum(expectedfirstdowns),
difference = (sum(teamfirstdowns)-sum(expectedfirstdowns)),
difference_percentage = ((sum(teamfirstdowns)-sum(expectedfirstdowns))/sum(expectedfirstdowns))*100)
#Let's quickly calculate offensive EPA/play
pbp_offenses <- pbp_all_rp %>%
group_by(posteam, season) %>%
summarise(epa_per_play = (sum(epa)/n()))
#Join it with the previous data to create our final graphing data
final_graph_data <- final_pbp_data_actual %>% inner_join(pbp_offenses, by = c("posteam", "season"))
#Let's take a look at the r squared so we can add it to the graph
success_lm = lm(difference_percentage ~ epa_per_play, data=final_graph_data)
rsquared <- summary(success_lm)$r.squared
rsquared <- signif(rsquared, digits = 2)
#Finally! We have our final graph data
ggplot(final_graph_data, aes(x=difference_percentage, y=epa_per_play)) +
geom_point() +
geom_text_repel(
data = subset(final_graph_data, (epa_per_play > .2)
| difference_percentage > 25
| epa_per_play < -.21
| difference_percentage < -25),
aes(label = paste(posteam, substr(season, 3, 4)))) +
annotate("text", x = 20, y= -.15,label = paste("R^2=", rsquared, sep = " "), size = 3.5, color = "black") +
annotate("text", x = 20, y= -.2, angle = 0,label = "Data courtesy: nflscrapR, Years 2009-2018", size = 4, color = "black") +
annotate("text", x = -10, y= .3, angle = 0,label = "<- Worse Conversion Rate than Expected", size = 2.5, color = "blue") +
annotate("text", x = 10, y= .3, angle = 0,label = "Better Conversion Rate than Expected ->", size = 2.5, color = "blue") +
ylab("EPA per Play (All Plays)") +
xlab("Difference of Conversion Rate vs Expected Based On Yards To Go (%)") +
scale_x_continuous(breaks = c(-30, -20, -10, 0, 10, 20, 30)) +
labs(title = "Do Better Offenses Convert More Third Downs? (Controlling for Distance)", subtitle = "@ChiBearsStats") +
geom_hline(yintercept = 0, lty = 4, color = "black") +
geom_vline(xintercept = 0, lty = 4, color = "black") +
geom_smooth(lty = 2)
ggsave('epavsadjthirddowns.png', dpi=1000)
#Hmmmm it looks like 3rd down expectancy is actually correlated with offense skill, not based solely on distance to go. Let's make sure league wide trends over time aren't skewing the numbers here
season_values <- final_pbp_data %>%
filter(down == 3) %>%
mutate(binnedydstogo = ifelse(ydstogo > 24, 25, ydstogo)) %>%
group_by(binnedydstogo, season) %>%
summarise(totalthirddowns = n(),
firstdowns = sum(first_down_rush, first_down_pass, first_down_penalty),
expected_success_rate = (firstdowns/totalthirddowns))
final_pbp_data_actual_by_season <- final_pbp_data %>%
filter(down == 3) %>%
mutate(binnedydstogo = ifelse(ydstogo > 24, 25, ydstogo)) %>%
group_by(posteam, season, binnedydstogo) %>%
summarise(teamthirddowns = n(),
teamfirstdowns = sum(first_down_rush, first_down_pass, first_down_penalty),
team_success_rate = (teamfirstdowns/teamthirddowns)) %>%
inner_join(season_values, by = c("season", "binnedydstogo")) %>%
mutate(expectedfirstdowns = teamthirddowns*expected_success_rate) %>%
group_by(posteam, season) %>%
summarise(sum(teamfirstdowns),
sum(teamthirddowns),
sum(expectedfirstdowns),
difference = (sum(teamfirstdowns)-sum(expectedfirstdowns)),
difference_percentage = ((sum(teamfirstdowns)-sum(expectedfirstdowns))/sum(expectedfirstdowns))*100) %>%
inner_join(pbp_offenses, by = c("posteam", "season"))
successlm_byseason = lm(difference_percentage ~ epa_per_play, data=final_pbp_data_actual_by_season)
rsquared_byseason <- summary(successlm_byseason)$r.squared
rsquared_byseason <- signif(rsquared_byseason, digits = 2)
#Graph
ggplot(final_pbp_data_actual_by_season, aes(x=difference_percentage, y=epa_per_play)) +
geom_point() +
geom_text_repel(
data = subset(final_pbp_data_actual_by_season, (epa_per_play > .2)
| difference_percentage > 25
| epa_per_play < -.21
| difference_percentage < -25),
aes(label = paste(posteam, substr(season, 3, 4)))) +
annotate("text", x = 20, y= -.15,label = paste("R^2=", rsquared_byseason, sep = " "), size = 3.5, color = "black") +
annotate("text", x = 20, y= -.2, angle = 0,label = "Data courtesy: nflscrapR, Years 2009-2018", size = 4, color = "black") +
annotate("text", x = -10, y= .3, angle = 0,label = "<- Worse Conversion Rate than Expected", size = 2.5, color = "blue") +
annotate("text", x = 10, y= .3, angle = 0,label = "Better Conversion Rate than Expected ->", size = 2.5, color = "blue") +
ylab("EPA per Play (All Plays)") +
xlab("Difference of Conversion Rate vs Expected Based On Yards To Go (%)") +
scale_x_continuous(breaks = c(-30, -20, -10, 0, 10, 20, 30)) +
labs(title = "Do Better Offenses Convert More Third Downs? (Controlling for Distance)", subtitle = "@ChiBearsStats") +
geom_hline(yintercept = 0, lty = 4, color = "black") +
geom_vline(xintercept = 0, lty = 4, color = "black") +
geom_smooth(lty = 2)
ggsave('epavsadjthirddownsperseason.png', dpi=1000)
#No significant difference between the two
#Maybe Points per Game doesn't have the same relationship?
#Let's write a function to get each team's points per game using nflscrapR's season_games function
get_ppg <- function(year){
ppg_stats <- season_games(year)
ppg_stats1 <- ppg_stats %>%
mutate(team = home) %>%
group_by(team) %>%
summarise(points = sum(homescore))
ppg_stats2 <- ppg_stats %>%
mutate(team = away) %>%
group_by(team) %>%
summarise(points = sum(awayscore))
ppg_stats3 <- bind_rows(ppg_stats1, ppg_stats2) %>%
group_by(team) %>%
summarise(total_points = sum(points)) %>%
mutate(season = year)
return(ppg_stats3)
}
#I'm sure there's a quicker way to do this, but I'll do this way for now and learn a quicker way as I go
ppg_2018 <- get_ppg(2018)
ppg_2017 <- get_ppg(2017)
ppg_2016 <- get_ppg(2016)
ppg_2015 <- get_ppg(2015)
ppg_2014 <- get_ppg(2014)
ppg_2013 <- get_ppg(2013)
ppg_2012 <- get_ppg(2012)
ppg_2011 <- get_ppg(2011)
ppg_2010 <- get_ppg(2010)
ppg_2009 <- get_ppg(2009)
ppg <- bind_rows(ppg_2009, ppg_2010, ppg_2011, ppg_2012, ppg_2013, ppg_2014, ppg_2015, ppg_2016, ppg_2017, ppg_2018) %>%
mutate(teamppg = total_points/16) %>%
mutate_at(vars(team), funs(case_when(
. %in% "JAX" ~ "JAC",
. %in% "STL" ~ "LA",
. %in% "SD" ~ "LAC",
TRUE ~ .
)))
#We'll change team to posteam to more easily join it with the earlier data set
ppg <- ppg %>% rename(posteam = team)
final_graph_data_by_epa <- final_pbp_data_actual %>% inner_join(ppg, by = c("posteam", "season"))
successlm_byepa = lm(difference_percentage ~ teamppg, data=final_graph_data_by_epa)
rsquared_byepa <- summary(successlm_byepa)$r.squared
rsquared_byepa <- signif(rsquared_byepa, digits = 2)
#Graph
ggplot(final_graph_data_by_epa, aes(x=difference_percentage, y=teamppg)) +
geom_point() +
geom_text_repel(
data = subset(final_graph_data_by_epa, (teamppg > 34)
| difference_percentage > 25
| teamppg < 12.5
| difference_percentage < -25),
aes(label = paste(posteam, substr(season, 3, 4))),
nudge_y = -1.5) +
annotate("text", x = 20, y= 15,label = paste("R^2=", rsquared_byepa, sep = " "), size = 3.5, color = "black") +
annotate("text", x = 20, y= 10.5, angle = 0,label = "Data courtesy: nflscrapR, Years 2009-2018", size = 4, color = "black") +
annotate("text", x = -10, y= 37.5, angle = 0,label = "<- Worse Conversion Rate than Expected", size = 2.5, color = "blue") +
annotate("text", x = 10, y= 37.5, angle = 0,label = "Better Conversion Rate than Expected ->", size = 2.5, color = "blue") +
ylab("Points Per Game") +
xlab("Difference of Conversion Rate vs Expected Based On Yards To Go (%)") +
scale_x_continuous(breaks = c(-30, -20, -10, 0, 10, 20, 30)) +
scale_y_continuous(breaks = c(10, 15, 20, 25, 30, 35)) +
labs(title = "Do Better Offenses Convert More Third Downs? (Controlling for Distance)", subtitle = "@ChiBearsStats") +
geom_hline(yintercept = mean(final_graph_data_by_epa$teamppg), lty = 4, color = "black") +
geom_vline(xintercept = 0, lty = 4, color = "black") +
geom_smooth(lty = 2)
ggsave('epavsadjthirddownsperppg.png', dpi=1000)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment