z-feldman · June 29, 2021 01:35
diff --git a/power_ratings.R b/power_ratings.R
 # SRS, simple rating system, is a way to control point differential for opponent and location
 # this is a great primer: https://www.pro-football-reference.com/blog/index4837.html?p=37
 # To do this, we create a matrix with one row for each game, and columns of the 32 teams, plus a 33rd column for location
 # We then solve the matrix/system of equations with the right hand side being the result of the game (scoring margin)
 # Our matrix is populated with 1's, -1's, and 0's. 1 for home team, -1 for away team, 0 for teams not playing in the game
 # In the location column 1 for home, 0 for neutral
 # The equation for margin of victory for a single game is mov = home_field + home_team_rating - away_team_rating
 # Our 1's and -1's are identifiers for the teams, and then solving the matrix will give us the ratings for each team
 # that minimizes the error
 # Since schedules are inbalanced (each team doesn't play each team the same number of times), we cannot fully solve the matrix,
 # we are just finding the ratings that minimize the overall error



 # Load (install if needed) libraries --------------------------------------

 library(tidyverse)
 library(magrittr)
 library(limSolve)
 library(broom)

 # Read in games data from Lee Sharpe's website, filter for games w --------

 games <- read_csv("http://nflgamedata.com/games.csv") %>% filter(season == 2019, game_type == "REG")

 # Store team names for column naming purposes
 all_teams <- unique(games$home_team[order(games$home_team)])


 # Create matrix of games --------------------------------------------------

 srs_matrix <- data.frame(matrix(NA, ncol = 32, nrow = nrow(games)))
 colnames(srs_matrix) <- all_teams


 # Populate matrix ---------------------------------------------------------
 # 1 for home, -1 for away, 0 for everything else
 # Add column that's 1 for home, 0 for neutral (this games file is respective to home team so there won't be any Away)
 srs_matrix %<>% mutate(
  across(
    .cols = everything(),
    .fns = ~ case_when(
      games$home_team == cur_column()  ~ 1,
      games$away_team == cur_column() ~ -1,
      TRUE ~ 0
    )
  ),
  home_field_advantage = if_else(games$location == "Home", 1, 0)
 ) 

 # Store results, respective to home team
 srs_mov <- games %>% select(result)


 # Solve matrix with least squares ------------------------------------------------------------
 srs_ratings <- limSolve::lsei(srs_matrix, srs_mov)


 # Store ratings in data frame
 ratings_tibble <- tibble(
  Team = colnames(srs_matrix),
  SRS = srs_ratings[[1]]) %>% arrange(desc(SRS))

 ratings_tibble



 # Function to predict game with ratings -----------------------------------

 predict_game <- function(team, opp, home_team = team, rating) {
  location <- case_when(
    home_team == team ~ 1,
    home_team == opp ~ -1,
    home_team == "neutral" ~ 0)
  
  if(rating == "SRS"){
    pluck(srs_ratings, "X", team) - pluck(srs_ratings, "X", opp) + location*pluck(srs_ratings, "X", "home_field_advantage")
  } else if(rating == "linear"){
    ratings_linear_tibble %>% filter(Team == team) %>% select(Rating) %>% as.numeric() - 
      ratings_linear_tibble %>% filter(Team == opp) %>% select(Rating) %>% as.numeric() +
      location*ratings_linear_tibble %>% filter(Team == "home_field_advantage") %>% select(Rating) %>% as.numeric()
  } else 
    return("You entered the rating system incorrectly")
  
 }

 predict_game("BAL", "KC", rating = "SRS")

 # If Baltimore hosted Kansas City after week 17 they'd be ~6.5 point favorites, according to these ratings
 # Don't try to use linear until we create the linear ratings below



 # Use Linear Regression ---------------------------------------------------

 # We can also do this with linear regression. Our equation is result ~ location + team + opp


 # Double Games ------------------------------------------------------------
 # We'll utilize Lee Sharpe's double games function to give us each game in two rows
 # One with respect to the home team, one to the away

 double_games <- function(g)
 {
  g1 <- g %>% 
    rename(team=away_team,team_score=away_score,
           opp=home_team,opp_score=home_score,
           team_moneyline=away_moneyline,opp_moneyline=home_moneyline,
           team_spread_odds=away_spread_odds,opp_spread_odds=home_spread_odds,
           team_coach=away_coach,opp_coach=home_coach) %>% 
    mutate(location=ifelse(location == "Home","Away",location),
           result=-1*result,spread_line=-1*spread_line)
  g2 <- g %>% 
    rename(team=home_team,team_score=home_score,
           opp=away_team,opp_score=away_score,
           team_moneyline=home_moneyline,opp_moneyline=away_moneyline,
           team_spread_odds=home_spread_odds,opp_spread_odds=away_spread_odds,           
           team_coach=home_coach,opp_coach=away_coach)
  g <- bind_rows(g1,g2) %>% 
    arrange(gameday,gametime,old_game_id,location)
  return(g)
 }

 games %<>% double_games()



 # Train Linear Model ------------------------------------------------------

 # First we will set the location to be a factor with Neutral as the first level
 # This will set Neutral to 0 and give us our home and away rating relative to it, which is what we want
 games %<>% mutate(location = factor(location, levels = c("Neutral", "Home", "Away")))

 # Now we can train our model
 ratings_linear <- lm(data = games, formula = result ~ team + opp + location)
 ratings_linear %>% summary()

 # Just like with location, one team is set equal to zero and all other ratings are relative to it
 # Since Arizona is first alphabetically that is the team held out, and why it is not in the summary output
 # We can take the average of the team ratings and subtract it from every rating, including a zero for Arizona
 # This will make it so the average of all teams is 0, which is how we want to interpret the ratings...
 # points above/below an average team


 # Linearly Adjust Ratings -------------------------------------------------

 # tidy() will pull the summary output and make it easier to work with
 ratings_linear_tidy <- broom::tidy(ratings_linear)

 linear_adjust_teams <- 
  ratings_linear_tidy %>% 
  select(term, estimate) %>% 
  filter(str_detect(term, "^team")) %>% # we only need the terms that start with 'team'
  summarise(mean(estimate)) %>% 
  as.numeric()

 home_field <- pluck(ratings_linear, "coefficients", "locationHome")

 ratings_linear_tibble <- tibble("Team" = c(all_teams, "home_field_advantage"),
                  "Rating" = c(c(0, ratings_linear_tidy$estimate[2:32]) - linear_adjust_teams, home_field)) %>% arrange(desc(Rating))
 ratings_linear_tibble
 # above we combine 0 (Arizona's rating) and the other 31 ratings, subtract all by the average (the linear_adjust number) and throw in home field


 predict_game("BAL", "KC", rating = "linear") 
 # we get the same, or about the same, result as with SRS
 # under the hood, basically the same thing is being calculated, there will be very small discrepencies between the two
	# SRS, simple rating system, is a way to control point differential for opponent and location
	# this is a great primer: https://www.pro-football-reference.com/blog/index4837.html?p=37
	# To do this, we create a matrix with one row for each game, and columns of the 32 teams, plus a 33rd column for location
	# We then solve the matrix/system of equations with the right hand side being the result of the game (scoring margin)
	# Our matrix is populated with 1's, -1's, and 0's. 1 for home team, -1 for away team, 0 for teams not playing in the game
	# In the location column 1 for home, 0 for neutral
	# The equation for margin of victory for a single game is mov = home_field + home_team_rating - away_team_rating
	# Our 1's and -1's are identifiers for the teams, and then solving the matrix will give us the ratings for each team
	# that minimizes the error
	# Since schedules are inbalanced (each team doesn't play each team the same number of times), we cannot fully solve the matrix,
	# we are just finding the ratings that minimize the overall error



	# Load (install if needed) libraries --------------------------------------

	library(tidyverse)
	library(magrittr)
	library(limSolve)
	library(broom)

	# Read in games data from Lee Sharpe's website, filter for games w --------

	games <- read_csv("http://nflgamedata.com/games.csv") %>% filter(season == 2019, game_type == "REG")

	# Store team names for column naming purposes
	all_teams <- unique(games$home_team[order(games$home_team)])


	# Create matrix of games --------------------------------------------------

	srs_matrix <- data.frame(matrix(NA, ncol = 32, nrow = nrow(games)))
	colnames(srs_matrix) <- all_teams


	# Populate matrix ---------------------------------------------------------
	# 1 for home, -1 for away, 0 for everything else
	# Add column that's 1 for home, 0 for neutral (this games file is respective to home team so there won't be any Away)
	srs_matrix %<>% mutate(
	across(
	.cols = everything(),
	.fns = ~ case_when(
	games$home_team == cur_column() ~ 1,
	games$away_team == cur_column() ~ -1,
	TRUE ~ 0
	)
	),
	home_field_advantage = if_else(games$location == "Home", 1, 0)
	)

	# Store results, respective to home team
	srs_mov <- games %>% select(result)


	# Solve matrix with least squares ------------------------------------------------------------
	srs_ratings <- limSolve::lsei(srs_matrix, srs_mov)


	# Store ratings in data frame
	ratings_tibble <- tibble(
	Team = colnames(srs_matrix),
	SRS = srs_ratings[[1]]) %>% arrange(desc(SRS))

	ratings_tibble



	# Function to predict game with ratings -----------------------------------

	predict_game <- function(team, opp, home_team = team, rating) {
	location <- case_when(
	home_team == team ~ 1,
	home_team == opp ~ -1,
	home_team == "neutral" ~ 0)

	if(rating == "SRS"){
	pluck(srs_ratings, "X", team) - pluck(srs_ratings, "X", opp) + location*pluck(srs_ratings, "X", "home_field_advantage")
	} else if(rating == "linear"){
	ratings_linear_tibble %>% filter(Team == team) %>% select(Rating) %>% as.numeric() -
	ratings_linear_tibble %>% filter(Team == opp) %>% select(Rating) %>% as.numeric() +
	location*ratings_linear_tibble %>% filter(Team == "home_field_advantage") %>% select(Rating) %>% as.numeric()
	} else
	return("You entered the rating system incorrectly")

	}

	predict_game("BAL", "KC", rating = "SRS")

	# If Baltimore hosted Kansas City after week 17 they'd be ~6.5 point favorites, according to these ratings
	# Don't try to use linear until we create the linear ratings below



	# Use Linear Regression ---------------------------------------------------

	# We can also do this with linear regression. Our equation is result ~ location + team + opp


	# Double Games ------------------------------------------------------------
	# We'll utilize Lee Sharpe's double games function to give us each game in two rows
	# One with respect to the home team, one to the away

	double_games <- function(g)
	{
	g1 <- g %>%
	rename(team=away_team,team_score=away_score,
	opp=home_team,opp_score=home_score,
	team_moneyline=away_moneyline,opp_moneyline=home_moneyline,
	team_spread_odds=away_spread_odds,opp_spread_odds=home_spread_odds,
	team_coach=away_coach,opp_coach=home_coach) %>%
	mutate(location=ifelse(location == "Home","Away",location),
	result=-1result,spread_line=-1spread_line)
	g2 <- g %>%
	rename(team=home_team,team_score=home_score,
	opp=away_team,opp_score=away_score,
	team_moneyline=home_moneyline,opp_moneyline=away_moneyline,
	team_spread_odds=home_spread_odds,opp_spread_odds=away_spread_odds,
	team_coach=home_coach,opp_coach=away_coach)
	g <- bind_rows(g1,g2) %>%
	arrange(gameday,gametime,old_game_id,location)
	return(g)
	}

	games %<>% double_games()



	# Train Linear Model ------------------------------------------------------

	# First we will set the location to be a factor with Neutral as the first level
	# This will set Neutral to 0 and give us our home and away rating relative to it, which is what we want
	games %<>% mutate(location = factor(location, levels = c("Neutral", "Home", "Away")))

	# Now we can train our model
	ratings_linear <- lm(data = games, formula = result ~ team + opp + location)
	ratings_linear %>% summary()

	# Just like with location, one team is set equal to zero and all other ratings are relative to it
	# Since Arizona is first alphabetically that is the team held out, and why it is not in the summary output
	# We can take the average of the team ratings and subtract it from every rating, including a zero for Arizona
	# This will make it so the average of all teams is 0, which is how we want to interpret the ratings...
	# points above/below an average team


	# Linearly Adjust Ratings -------------------------------------------------

	# tidy() will pull the summary output and make it easier to work with
	ratings_linear_tidy <- broom::tidy(ratings_linear)

	linear_adjust_teams <-
	ratings_linear_tidy %>%
	select(term, estimate) %>%
	filter(str_detect(term, "^team")) %>% # we only need the terms that start with 'team'
	summarise(mean(estimate)) %>%
	as.numeric()

	home_field <- pluck(ratings_linear, "coefficients", "locationHome")

	ratings_linear_tibble <- tibble("Team" = c(all_teams, "home_field_advantage"),
	"Rating" = c(c(0, ratings_linear_tidy$estimate[2:32]) - linear_adjust_teams, home_field)) %>% arrange(desc(Rating))
	ratings_linear_tibble
	# above we combine 0 (Arizona's rating) and the other 31 ratings, subtract all by the average (the linear_adjust number) and throw in home field


	predict_game("BAL", "KC", rating = "linear")
	# we get the same, or about the same, result as with SRS
	# under the hood, basically the same thing is being calculated, there will be very small discrepencies between the two