Last active
May 22, 2020 23:21
-
-
Save bayesball/8892999 to your computer and use it in GitHub Desktop.
R function for computing runs values for all plays in a play-by-play Retrosheet file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compute run values for every play in one season of Retrosheet
# play-by-play data.
#
# changed -- plyr function replaced with dplyr
# (increases speed from 114 to 30 sec for 2013 data)
#
# Arguments:
#   season - value pasted into the data file name: the file
#            "all<season>.csv" must be in the current working folder
#            (for example, if season = 1961, all1961.csv should be
#            available).  The file is read without a header row.
#
# Column names come from the "Header" column of fields.csv, which is
# downloaded from GitHub on every call, so network access is required.
# The dplyr package must be installed; it is used through its namespace
# and is not attached.
#
# Returns the play-by-play data frame, keeping only plays that changed
# the runners/outs state or scored a run, with new variables
#   RUNS           - sum of AWAY_SCORE_CT and HOME_SCORE_CT
#   HALF.INNING    - key built from GAME_ID, INN_CT and BAT_HOME_ID
#   RUNS.SCORED    - runs scored on the play
#   MAX.RUNS       - RUNS at the first play of the half inning plus all
#                    runs scored in that half inning
#   RUNS.ROI       - runs scored in remainder of inning
#   STATE          - current runners/outs state
#   NEW.STATE      - new runners/outs state (after play)
#   Outs.Inning    - outs recorded in the half inning (kept plays only)
#   RUNS.STATE     - runs value of current runners/outs state
#   RUNS.NEW.STATE - runs value of new runners/outs state
#   RUNS.VALUE     - runs value of play event
# RUNS.STATE / RUNS.NEW.STATE are NA for a state that never occurs in a
# complete (three-out) half inning of the supplied data.
compute.runs.expectancy <- function(season){
  # Fail fast, before any file is read or downloaded.  require() merely
  # returns FALSE when dplyr is missing, which would only show up much
  # later as a "could not find function" error.
  if (!requireNamespace("dplyr", quietly = TRUE)) {
    stop("package 'dplyr' is required by compute.runs.expectancy()",
         call. = FALSE)
  }

  data.file <- paste0("all", season, ".csv")
  data <- read.csv(data.file, header = FALSE)
  # fields <- read.csv("fields.csv")
  fields <- read.csv("https://raw.githubusercontent.com/beanumber/baseball_R/master/data/fields.csv")
  names(data) <- fields[, "Header"]

  data$RUNS <- with(data, AWAY_SCORE_CT + HOME_SCORE_CT)
  data$HALF.INNING <- with(data, paste(GAME_ID, INN_CT, BAT_HOME_ID))

  # The batter and each runner count as a run when their destination
  # code is greater than 3.
  data$RUNS.SCORED <- with(data, (BAT_DEST_ID > 3) + (RUN1_DEST_ID > 3) +
                             (RUN2_DEST_ID > 3) + (RUN3_DEST_ID > 3))

  # Per half inning: total runs scored plus RUNS at its first play.
  # Both aggregates group on the same key, so their rows line up.
  by.half.inning <- list(HALF.INNING = data$HALF.INNING)
  runs.inning <- aggregate(data$RUNS.SCORED, by.half.inning, sum)
  runs.start <- aggregate(data$RUNS, by.half.inning, "[", 1)
  max.runs <- data.frame(HALF.INNING = runs.start$HALF.INNING,
                         MAX.RUNS = runs.inning$x + runs.start$x)
  # The lookup column is named up front rather than by its position
  # after the merge.
  data <- merge(data, max.runs)
  data$RUNS.ROI <- data$MAX.RUNS - data$RUNS

  # A state is three 0/1 base flags followed by the outs, e.g. "101 2".
  get.state <- function(runner1, runner2, runner3, outs){
    paste(paste0(runner1, runner2, runner3), outs)
  }
  # 1 when a runner ID is present, 0 when it is empty or NA.
  is.occupied <- function(runner.id){
    runner.id <- as.character(runner.id)
    as.numeric(!is.na(runner.id) & runner.id != "")
  }

  data$STATE <- get.state(is.occupied(data[, "BASE1_RUN_ID"]),
                          is.occupied(data[, "BASE2_RUN_ID"]),
                          is.occupied(data[, "BASE3_RUN_ID"]),
                          data$OUTS_CT)

  # A base is occupied after the play when the batter or any runner who
  # could reach it has that base as destination.
  new.runner1 <- with(data, as.numeric(RUN1_DEST_ID == 1 | BAT_DEST_ID == 1))
  new.runner2 <- with(data, as.numeric(RUN1_DEST_ID == 2 | RUN2_DEST_ID == 2 |
                                         BAT_DEST_ID == 2))
  new.runner3 <- with(data, as.numeric(RUN1_DEST_ID == 3 | RUN2_DEST_ID == 3 |
                                         RUN3_DEST_ID == 3 | BAT_DEST_ID == 3))
  new.outs <- with(data, OUTS_CT + EVENT_OUTS_CT)
  data$NEW.STATE <- get.state(new.runner1, new.runner2, new.runner3, new.outs)

  # Keep only plays that change the state or score a run.
  data <- subset(data, (STATE != NEW.STATE) | (RUNS.SCORED > 0))

  # Outs recorded per half inning (dplyr replaced the slower plyr::ddply).
  data.outs <- dplyr::summarize(dplyr::group_by(data, HALF.INNING),
                                Outs.Inning = sum(EVENT_OUTS_CT))
  data <- merge(data, data.outs)

  # for expected runs computation, only consider complete innings
  dataC <- subset(data, Outs.Inning == 3)
  RUNS <- dplyr::summarize(dplyr::group_by(dataC, STATE),
                           Mean = mean(RUNS.ROI))

  # Run expectancy by state as a named vector.  It is sized from the
  # states actually observed instead of assuming all 24 are present.
  # The eight three-out states are worth 0.
  three.out.states <- paste(c("000", "001", "010", "011",
                              "100", "101", "110", "111"), 3)
  RUNS.POTENTIAL <- c(setNames(RUNS$Mean, RUNS$STATE),
                      setNames(rep(0, length(three.out.states)),
                               three.out.states))

  data$RUNS.STATE <- RUNS.POTENTIAL[data$STATE]
  data$RUNS.NEW.STATE <- RUNS.POTENTIAL[data$NEW.STATE]
  data$RUNS.VALUE <- data$RUNS.NEW.STATE - data$RUNS.STATE +
    data$RUNS.SCORED
  data
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment