Last active
June 29, 2021 01:35
-
-
Save z-feldman/b9d9cda9bb4a142fc6e64a19e2385f50 to your computer and use it in GitHub Desktop.
Create simple SRS/Opponent Adjusted Point Differential using NFL data in R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SRS, simple rating system, is a way to control point differential for opponent and location | |
# this is a great primer: https://www.pro-football-reference.com/blog/index4837.html?p=37 | |
# To do this, we create a matrix with one row for each game, and columns of the 32 teams, plus a 33rd column for location | |
# We then solve the matrix/system of equations with the right hand side being the result of the game (scoring margin) | |
# Our matrix is populated with 1's, -1's, and 0's. 1 for home team, -1 for away team, 0 for teams not playing in the game | |
# In the location column 1 for home, 0 for neutral | |
# The equation for margin of victory for a single game is mov = home_field + home_team_rating - away_team_rating | |
# Our 1's and -1's are identifiers for the teams, and then solving the matrix will give us the ratings for each team | |
# that minimizes the error | |
# Since schedules are inbalanced (each team doesn't play each team the same number of times), we cannot fully solve the matrix, | |
# we are just finding the ratings that minimize the overall error | |
# Load (install if needed) libraries -------------------------------------- | |
library(tidyverse) | |
library(magrittr) | |
library(limSolve) | |
library(broom) | |
# Read in games data from Lee Sharpe's website, filter for games w -------- | |
games <- read_csv("http://nflgamedata.com/games.csv") %>% filter(season == 2019, game_type == "REG") | |
# Store team names for column naming purposes | |
all_teams <- unique(games$home_team[order(games$home_team)]) | |
# Create matrix of games -------------------------------------------------- | |
srs_matrix <- data.frame(matrix(NA, ncol = 32, nrow = nrow(games))) | |
colnames(srs_matrix) <- all_teams | |
# Populate matrix --------------------------------------------------------- | |
# 1 for home, -1 for away, 0 for everything else | |
# Add column that's 1 for home, 0 for neutral (this games file is respective to home team so there won't be any Away) | |
srs_matrix %<>% mutate( | |
across( | |
.cols = everything(), | |
.fns = ~ case_when( | |
games$home_team == cur_column() ~ 1, | |
games$away_team == cur_column() ~ -1, | |
TRUE ~ 0 | |
) | |
), | |
home_field_advantage = if_else(games$location == "Home", 1, 0) | |
) | |
# Store results, respective to home team | |
srs_mov <- games %>% select(result) | |
# Solve matrix with least squares ------------------------------------------------------------ | |
srs_ratings <- limSolve::lsei(srs_matrix, srs_mov) | |
# Store ratings in data frame | |
ratings_tibble <- tibble( | |
Team = colnames(srs_matrix), | |
SRS = srs_ratings[[1]]) %>% arrange(desc(SRS)) | |
ratings_tibble | |
# Function to predict game with ratings ----------------------------------- | |
predict_game <- function(team, opp, home_team = team, rating) { | |
location <- case_when( | |
home_team == team ~ 1, | |
home_team == opp ~ -1, | |
home_team == "neutral" ~ 0) | |
if(rating == "SRS"){ | |
pluck(srs_ratings, "X", team) - pluck(srs_ratings, "X", opp) + location*pluck(srs_ratings, "X", "home_field_advantage") | |
} else if(rating == "linear"){ | |
ratings_linear_tibble %>% filter(Team == team) %>% select(Rating) %>% as.numeric() - | |
ratings_linear_tibble %>% filter(Team == opp) %>% select(Rating) %>% as.numeric() + | |
location*ratings_linear_tibble %>% filter(Team == "home_field_advantage") %>% select(Rating) %>% as.numeric() | |
} else | |
return("You entered the rating system incorrectly") | |
} | |
predict_game("BAL", "KC", rating = "SRS") | |
# If Baltimore hosted Kansas City after week 17 they'd be ~6.5 point favorites, according to these ratings | |
# Don't try to use linear until we create the linear ratings below | |
# Use Linear Regression --------------------------------------------------- | |
# We can also do this with linear regression. Our equation is result ~ location + team + opp | |
# Double Games ------------------------------------------------------------ | |
# We'll utilize Lee Sharpe's double games function to give us each game in two rows | |
# One with respect to the home team, one to the away | |
double_games <- function(g) | |
{ | |
g1 <- g %>% | |
rename(team=away_team,team_score=away_score, | |
opp=home_team,opp_score=home_score, | |
team_moneyline=away_moneyline,opp_moneyline=home_moneyline, | |
team_spread_odds=away_spread_odds,opp_spread_odds=home_spread_odds, | |
team_coach=away_coach,opp_coach=home_coach) %>% | |
mutate(location=ifelse(location == "Home","Away",location), | |
result=-1*result,spread_line=-1*spread_line) | |
g2 <- g %>% | |
rename(team=home_team,team_score=home_score, | |
opp=away_team,opp_score=away_score, | |
team_moneyline=home_moneyline,opp_moneyline=away_moneyline, | |
team_spread_odds=home_spread_odds,opp_spread_odds=away_spread_odds, | |
team_coach=home_coach,opp_coach=away_coach) | |
g <- bind_rows(g1,g2) %>% | |
arrange(gameday,gametime,old_game_id,location) | |
return(g) | |
} | |
games %<>% double_games() | |
# Train Linear Model ------------------------------------------------------ | |
# First we will set the location to be a factor with Neutral as the first level | |
# This will set Neutral to 0 and give us our home and away rating relative to it, which is what we want | |
games %<>% mutate(location = factor(location, levels = c("Neutral", "Home", "Away"))) | |
# Now we can train our model | |
ratings_linear <- lm(data = games, formula = result ~ team + opp + location) | |
ratings_linear %>% summary() | |
# Just like with location, one team is set equal to zero and all other ratings are relative to it | |
# Since Arizona is first alphabetically that is the team held out, and why it is not in the summary output | |
# We can take the average of the team ratings and subtract it from every rating, including a zero for Arizona | |
# This will make it so the average of all teams is 0, which is how we want to interpret the ratings... | |
# points above/below an average team | |
# Linearly Adjust Ratings ------------------------------------------------- | |
# tidy() will pull the summary output and make it easier to work with | |
ratings_linear_tidy <- broom::tidy(ratings_linear) | |
linear_adjust_teams <- | |
ratings_linear_tidy %>% | |
select(term, estimate) %>% | |
filter(str_detect(term, "^team")) %>% # we only need the terms that start with 'team' | |
summarise(mean(estimate)) %>% | |
as.numeric() | |
home_field <- pluck(ratings_linear, "coefficients", "locationHome") | |
ratings_linear_tibble <- tibble("Team" = c(all_teams, "home_field_advantage"), | |
"Rating" = c(c(0, ratings_linear_tidy$estimate[2:32]) - linear_adjust_teams, home_field)) %>% arrange(desc(Rating)) | |
ratings_linear_tibble | |
# above we combine 0 (Arizona's rating) and the other 31 ratings, subtract all by the average (the linear_adjust number) and throw in home field | |
predict_game("BAL", "KC", rating = "linear") | |
# we get the same, or about the same, result as with SRS | |
# under the hood, basically the same thing is being calculated, there will be very small discrepencies between the two |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment