Skip to content

Instantly share code, notes, and snippets.

@jddbucknole
Forked from z-feldman/power_ratings.R
Created June 29, 2021 01:35
Show Gist options
  • Save jddbucknole/51fc3d922a7fe6a8806e55d68b1e80d7 to your computer and use it in GitHub Desktop.
Save jddbucknole/51fc3d922a7fe6a8806e55d68b1e80d7 to your computer and use it in GitHub Desktop.
Create simple SRS/Opponent Adjusted Point Differential using NFL data in R
# SRS, simple rating system, is a way to control point differential for opponent and location
# this is a great primer: https://www.pro-football-reference.com/blog/index4837.html?p=37
# To do this, we create a matrix with one row for each game, and columns of the 32 teams, plus a 33rd column for location
# We then solve the matrix/system of equations with the right hand side being the result of the game (scoring margin)
# Our matrix is populated with 1's, -1's, and 0's. 1 for home team, -1 for away team, 0 for teams not playing in the game
# In the location column 1 for home, 0 for neutral
# The equation for margin of victory for a single game is mov = home_field + home_team_rating - away_team_rating
# Our 1's and -1's are identifiers for the teams, and then solving the matrix will give us the ratings for each team
# that minimizes the error
# Since schedules are inbalanced (each team doesn't play each team the same number of times), we cannot fully solve the matrix,
# we are just finding the ratings that minimize the overall error
# Load (install if needed) libraries --------------------------------------
library(tidyverse)
library(magrittr)
library(limSolve)
library(broom)
# Read in games data from Lee Sharpe's website, filter for games w --------
games <- read_csv("http://nflgamedata.com/games.csv") %>% filter(season == 2019, game_type == "REG")
# Store team names for column naming purposes
all_teams <- unique(games$home_team[order(games$home_team)])
# Create matrix of games --------------------------------------------------
srs_matrix <- data.frame(matrix(NA, ncol = 32, nrow = nrow(games)))
colnames(srs_matrix) <- all_teams
# Populate matrix ---------------------------------------------------------
# 1 for home, -1 for away, 0 for everything else
# Add column that's 1 for home, 0 for neutral (this games file is respective to home team so there won't be any Away)
srs_matrix %<>% mutate(
across(
.cols = everything(),
.fns = ~ case_when(
games$home_team == cur_column() ~ 1,
games$away_team == cur_column() ~ -1,
TRUE ~ 0
)
),
home_field_advantage = if_else(games$location == "Home", 1, 0)
)
# Store results, respective to home team
srs_mov <- games %>% select(result)
# Solve matrix with least squares ------------------------------------------------------------
srs_ratings <- limSolve::lsei(srs_matrix, srs_mov)
# Store ratings in data frame
ratings_tibble <- tibble(
Team = colnames(srs_matrix),
SRS = srs_ratings[[1]]) %>% arrange(desc(SRS))
ratings_tibble
# Function to predict game with ratings -----------------------------------
predict_game <- function(team, opp, home_team = team, rating) {
location <- case_when(
home_team == team ~ 1,
home_team == opp ~ -1,
home_team == "neutral" ~ 0)
if(rating == "SRS"){
pluck(srs_ratings, "X", team) - pluck(srs_ratings, "X", opp) + location*pluck(srs_ratings, "X", "home_field_advantage")
} else if(rating == "linear"){
ratings_linear_tibble %>% filter(Team == team) %>% select(Rating) %>% as.numeric() -
ratings_linear_tibble %>% filter(Team == opp) %>% select(Rating) %>% as.numeric() +
location*ratings_linear_tibble %>% filter(Team == "home_field_advantage") %>% select(Rating) %>% as.numeric()
} else
return("You entered the rating system incorrectly")
}
predict_game("BAL", "KC", rating = "SRS")
# If Baltimore hosted Kansas City after week 17 they'd be ~6.5 point favorites, according to these ratings
# Don't try to use linear until we create the linear ratings below
# Use Linear Regression ---------------------------------------------------
# We can also do this with linear regression. Our equation is result ~ location + team + opp
# Double Games ------------------------------------------------------------
# We'll utilize Lee Sharpe's double games function to give us each game in two rows
# One with respect to the home team, one to the away
double_games <- function(g)
{
g1 <- g %>%
rename(team=away_team,team_score=away_score,
opp=home_team,opp_score=home_score,
team_moneyline=away_moneyline,opp_moneyline=home_moneyline,
team_spread_odds=away_spread_odds,opp_spread_odds=home_spread_odds,
team_coach=away_coach,opp_coach=home_coach) %>%
mutate(location=ifelse(location == "Home","Away",location),
result=-1*result,spread_line=-1*spread_line)
g2 <- g %>%
rename(team=home_team,team_score=home_score,
opp=away_team,opp_score=away_score,
team_moneyline=home_moneyline,opp_moneyline=away_moneyline,
team_spread_odds=home_spread_odds,opp_spread_odds=away_spread_odds,
team_coach=home_coach,opp_coach=away_coach)
g <- bind_rows(g1,g2) %>%
arrange(gameday,gametime,old_game_id,location)
return(g)
}
games %<>% double_games()
# Train Linear Model ------------------------------------------------------
# First we will set the location to be a factor with Neutral as the first level
# This will set Neutral to 0 and give us our home and away rating relative to it, which is what we want
games %<>% mutate(location = factor(location, levels = c("Neutral", "Home", "Away")))
# Now we can train our model
ratings_linear <- lm(data = games, formula = result ~ team + opp + location)
ratings_linear %>% summary()
# Just like with location, one team is set equal to zero and all other ratings are relative to it
# Since Arizona is first alphabetically that is the team held out, and why it is not in the summary output
# We can take the average of the team ratings and subtract it from every rating, including a zero for Arizona
# This will make it so the average of all teams is 0, which is how we want to interpret the ratings...
# points above/below an average team
# Linearly Adjust Ratings -------------------------------------------------
# tidy() will pull the summary output and make it easier to work with
ratings_linear_tidy <- broom::tidy(ratings_linear)
linear_adjust_teams <-
ratings_linear_tidy %>%
select(term, estimate) %>%
filter(str_detect(term, "^team")) %>% # we only need the terms that start with 'team'
summarise(mean(estimate)) %>%
as.numeric()
home_field <- pluck(ratings_linear, "coefficients", "locationHome")
ratings_linear_tibble <- tibble("Team" = c(all_teams, "home_field_advantage"),
"Rating" = c(c(0, ratings_linear_tidy$estimate[2:32]) - linear_adjust_teams, home_field)) %>% arrange(desc(Rating))
ratings_linear_tibble
# above we combine 0 (Arizona's rating) and the other 31 ratings, subtract all by the average (the linear_adjust number) and throw in home field
predict_game("BAL", "KC", rating = "linear")
# we get the same, or about the same, result as with SRS
# under the hood, basically the same thing is being calculated, there will be very small discrepencies between the two
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment