Last active
September 26, 2022 00:43
-
-
Save bayesball/4a7d75314b75ac91121885567e9b5b9a to your computer and use it in GitHub Desktop.
R work for the Predicting Home Runs using a Multilevel Model post
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| get_hr_data <- function(pred_season, | |
| retro_data, | |
| n_prev_seasons = 4, | |
| mPA = 1000, | |
| mPA_season = 200){ | |
| # n_prev_seasons is number of previous seasons | |
| # mPA is the minimum number of cumulative PA | |
| # retrodata - Retrosheet data for current season | |
| # mPA_season - minimum number of PA in both | |
| # halves of current season | |
| require(lubridate) | |
| require(Lahman) | |
| require(dplyr) | |
| get_players <- function(seasons, mPA){ | |
| Batting %>% | |
| filter(yearID %in% seasons) %>% | |
| group_by(playerID, yearID) %>% | |
| summarize(PA = sum(AB + BB + HBP), | |
| HR = sum(HR), | |
| .groups = "drop") -> S | |
| S %>% | |
| group_by(playerID) %>% | |
| summarize(HR = sum(HR), | |
| PA = sum(PA)) %>% | |
| filter(PA >= mPA) %>% | |
| inner_join(People, by = "playerID") %>% | |
| select(retroID, HR, PA) | |
| } | |
| seasons <- (pred_season - n_prev_seasons): | |
| (pred_season - 1) | |
| get_players(seasons, mPA) -> players1000 | |
| retro_data %>% | |
| filter(BAT_EVENT_FL == TRUE) %>% | |
| mutate(Year = substr(GAME_ID, 4, 7), | |
| MonthDay = substr(GAME_ID, 8, 11), | |
| Date = mdy(paste(MonthDay, Year, sep = ""))) -> | |
| retro_data | |
| midseason <- mdy(paste("07-01-", | |
| retro_data$Year[1], | |
| sep = "")) | |
| da <- filter(retro_data, Date < midseason) | |
| db <- filter(retro_data, Date >= midseason) | |
| da %>% | |
| group_by(BAT_ID) %>% | |
| summarize(PA = n(), | |
| HR = sum(EVENT_CD == 23)) %>% | |
| mutate(Period = "First Half") -> Sa | |
| db %>% | |
| group_by(BAT_ID) %>% | |
| summarize(PA = n(), | |
| HR = sum(EVENT_CD == 23)) %>% | |
| mutate(Period = "Second Half") -> Sb | |
| Sab <- inner_join(Sa, Sb, by = "BAT_ID") %>% | |
| filter(PA.x >= mPA_season, | |
| PA.y >= mPA_season) | |
| inner_join(Sab, players1000, | |
| by = c("BAT_ID" = "retroID")) | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| multilevel_reg <- function(d1){ | |
| library(tidyverse) | |
| library(runjags) | |
| library(coda) | |
| # multilevel modeling part | |
| modelString = " | |
| model { | |
| for (i in 1:J){ | |
| y[i] ~ dbin(p[i], n[i]) | |
| logit(p[i]) <- a[player[i]] + b[player[i]] * x[i] | |
| } | |
| for (j in 1:J){ | |
| a[j] <- B[j,1] | |
| b[j] <- B[j,2] | |
| B[j,1:2] ~ dmnorm (mu.beta[], Tau.B[,]) | |
| } | |
| mu.beta[1:2] ~ dmnorm(mean[1:2],prec[1:2 ,1:2]) | |
| Tau.B[1:2 , 1:2] ~ dwish(Omega[1:2 ,1:2 ], 2) | |
| } | |
| " | |
| # data for JAGS | |
| logit <- function(y){ log(y) - log(1 - y)} | |
| b.data <- list(y = d1$HR.x, | |
| n = d1$PA.x, | |
| player = d1$Player, | |
| x = logit(d1$HR / d1$PA), | |
| J = dim(d1)[1], | |
| mean = c(0, 0), | |
| Omega = diag(c(.1, .1)), | |
| prec = diag(c(1.0E-6, 1.0E-6))) | |
| # initialization function | |
| initsfunction <- function(chain){ | |
| .RNG.seed <- c(1,2)[chain] | |
| .RNG.name <- c("base::Super-Duper", | |
| "base::Wichmann-Hill")[chain] | |
| return(list(.RNG.seed=.RNG.seed, | |
| .RNG.name=.RNG.name)) | |
| } | |
| # implement MCMC | |
| posterior <- run.jags(modelString, | |
| n.chains = 1, | |
| data = b.data, | |
| monitor = c("B", "mu.beta"), | |
| adapt = 1000, | |
| burnin = 5000, | |
| sample = 20000, | |
| inits = initsfunction) | |
| # convert simulated draws to a data frame | |
| post <- as.mcmc(posterior) | |
| post <- as.data.frame(post) | |
| # compute posterior means, use these to get estimated | |
| # probabilities and add those to the original data frame | |
| est <- apply(post, 2, mean)[1:(2 * b.data$J)] | |
| est <- as.data.frame(matrix(est, b.data$J, 2, | |
| byrow = FALSE)) | |
| names(est) <- c("b0", "b1") | |
| est$Player <- 1:b.data$J | |
| d1 <- inner_join(d1, est, by = "Player") | |
| d1 %>% mutate(lp = b0 + b1 * logit(HR / PA), | |
| p = exp(lp) / (1 + exp(lp))) | |
| } | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| run_and_evaluate <- function(season, retrodata){ | |
| library(dplyr) | |
| d <- get_hr_data(season, retrodata) | |
| d %>% filter(HR.x > 0, | |
| HR > 0) -> d | |
| d$Player <- 1:dim(d)[1] | |
| # multilevel fit | |
| d <- multilevel_reg(d) | |
| # compare prediction errors | |
| d %>% | |
| mutate(Season = season, | |
| Rate.x = HR.x / PA.x, | |
| Rate.y = HR.y / PA.y, | |
| Rate.old = HR / PA, | |
| Rate.20 = .20 * Rate.old + .80 * Rate.x, | |
| Rate.50 = .50 * Rate.old + .50 * Rate.x, | |
| Rate.80 = .80 * Rate.old + .20 * Rate.x) -> d | |
| d %>% | |
| summarize(N = n(), | |
| Season = first(Season), | |
| Current = mean(abs(Rate.x - Rate.y)), | |
| Old = mean(abs(Rate.old - Rate.y)), | |
| Mix.20 = mean(abs(Rate.20 - Rate.y)), | |
| Mix.50 = mean(abs(Rate.50 - Rate.y)), | |
| Mix.80 = mean(abs(Rate.80 - Rate.y)), | |
| Bayes = mean(abs(p - Rate.y))) %>% | |
| select(Season, N, Current, Old, | |
| Mix.20, Mix.50, Mix.80, Bayes) | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment