Last active
October 29, 2022 21:29
-
-
Save MichaelChirico/0fb318392d1b18d6a966e4785d49cac9 to your computer and use it in GitHub Desktop.
Scraping & munging to answer: Has any team before the 2022 Phillies won all of their playoff game 1s on the road?
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # SOURCE: retrosheet.org - https://www.retrosheet.org/gamelogs/index.html | |
| library(withr) | |
| library(data.table) | |
# Retrosheet ships one zip archive per playoff round, each covering that
# round's full history. Every archive lives under the same gamelogs/ path, so
# only the file stem differs between rounds.
playoff_logs <- local({
  archive_stems <- c(
    wild_card = "glwc",
    lds = "gldv",
    lcs = "gllc",
    world_series = "glws"
  )
  # re-attach the round names explicitly so they survive the formatting call
  structure(
    sprintf("https://www.retrosheet.org/gamelogs/%s.zip", archive_stems),
    names = names(archive_stems)
  )
})
# Build one table of playoff games across all rounds.
#
# For each archive URL in `playoff_logs`: download the zip, extract it, read
# the single game log it contains, and keep only the six fields used below.
# The per-round tables are then stacked, with the names of `playoff_logs`
# stored in a `round` column.
playoff_data <- lapply(playoff_logs, function(url) {
  # withr removes both temporary paths when this function returns
  zip_path <- withr::local_tempfile()
  extract_dir <- withr::local_tempdir()

  # zip archives are binary; request binary mode explicitly rather than
  # relying on download.file()'s platform-dependent default
  download.file(url, zip_path, quiet = TRUE, mode = "wb")
  unzip(zip_path, exdir = extract_dir)

  log_files <- list.files(extract_dir, full.names = TRUE)
  if (length(log_files) != 1L) {
    stop(
      "expected exactly one game log in ", url,
      ", found ", length(log_files),
      call. = FALSE
    )
  }

  # The renaming below keys on fread's auto-generated V<n> column names, so
  # state that there is no header row instead of leaving it to detection.
  series_data <- fread(log_files, header = FALSE)

  # field positions per the data dictionary:
  #   https://www.retrosheet.org/gamelogs/glfields.txt
  field_names <- c(
    V1 = "date",
    V4 = "visitor",
    V6 = "game_number",
    V7 = "home",
    V10 = "visitor_score",
    V11 = "home_score"
  )
  setnames(series_data, names(field_names), unname(field_names))

  # drop every column that still carries an auto-generated name
  series_data[, grep("^V", names(series_data)) := NULL]
  series_data
}) |>
  rbindlist(idcol = "round")
# Lookup table translating Retrosheet team abbreviations into a readable
# "<city> <team>" label.
team_lookup <- fread("https://www.retrosheet.org/TEAMABR.TXT")
setnames(
  team_lookup,
  old = c("V1", "V3", "V4"),
  new = c("abbr", "city", "team")
)

# Update joins: attach the full name for each side of the matchup by
# reference, matching the side's abbreviation against the lookup.
playoff_data[
  team_lookup,
  home_full := paste(i.city, i.team),
  on = c(home = "abbr")
]
playoff_data[
  team_lookup,
  visitor_full := paste(i.city, i.team),
  on = c(visitor = "abbr")
]

# Parse the yyyymmdd date field into an IDate and derive the calendar year
# from the parsed value in the same pass.
playoff_data[
  ,
  c("date", "year") := {
    parsed <- as.IDate(as.character(date), format = "%Y%m%d")
    list(parsed, year(parsed))
  }
]
# Tally, per visiting team and year, the game 1s won by the visitor, then keep
# only the team-years with four or more such wins.
playoff_data[game_number == 1][
  visitor_score > home_score,
  .(N = .N),
  by = c("visitor_full", "year")
][N >= 4]
# visitor_full year N
# 1: San Francisco Giants 2014 4
# Relaxed version: which teams played game 1 as the visitor at least 3 times
# in a year and won every one of those games?
playoff_data[
  game_number == 1,
  .N >= 3 && all(visitor_score > home_score),
  by = c("visitor_full", "year")
][V1 == TRUE]
# visitor_full year V1
# 1: San Francisco Giants 2014 TRUE
# 2: San Francisco Giants 2002 TRUE
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment