Skip to content

Instantly share code, notes, and snippets.

@tokestermw
Created January 19, 2014 03:07
Show Gist options
  • Save tokestermw/8500014 to your computer and use it in GitHub Desktop.
Save tokestermw/8500014 to your computer and use it in GitHub Desktop.
Download pitchfx XML data (haven't tested in years).
# DownloadPitchFX.R
# downloads the massive MLB Gameday data.
# Version 0.4
# Version History
# 0.5 ~ grab player data, both pitchers and batters, ability to pick team
# 0.4 ~ get team data, and ability to grab team info, checks to see if regular season
# 0.3 ~ updated so 2010 works, fixed some bugs, and saves as tab delimited file
# 0.2 ~ inputs are start and end dates
# 0.1 ~ grab Pitch f/x data from MLB Gameday, specify date ranges (takes half a minute for a day's worth of data on my 2.5Ghz machine)
# Future Versions:
# ~ ability to pick pitchers, batters, teams
# - ability to grab matchups
# - better searching instead of tediously parsing through each XML file
# ~ connect to mysql database
# ~ don't overheat computer!
# ~ document Gameday Code
# downloading pitch f/x data from MLB website
# Get data from http://gd2.mlb.com/components/game/mlb/
# XML package http://www.omegahat.org/RSXML/shortIntro.html
# Perl script of same application by Mike Fast:
# http://fastballs.files.wordpress.com/2007/09/hack_28_parser_mikefast_test_pl.txt
# Less general R code from Erik Iverson of Blogistic Reflections:
# http://blogisticreflections.wordpress.com/2009/10/04/using-r-to-analyze-baseball-games-in-real-time/
# listing of pitch f/x tools by Baseball Analysts
# http://baseballanalysts.com/archives/2010/03/how_can_i_get_m.php
# downloadable pitch f/x database from Darrell Zimmerman
# http://www.wantlinux.net/category/baseball-data/
# I think gameday data starts 2005
# I think enhanced gameday (pitch fx) has all of 2009, most of 2008, some 2007, tiny bit 2006
# required libraries:
library(XML)
# code for <game type> in game.xml (input game.type in code)
# "S" ~ spring training, "R" ~ regular season, "D" ~ Division Series
# "L" ~ League Championship Series "W" ~ World Series
# code for <game gameday_sw> in game.xml
# http://sports.dir.groups.yahoo.com/group/RetroSQL/message/320
# "N" ~ missing, no pitch info
# "Y" ~ standard w/ pitch locations
# "E" ~ w/ pitch f/x
# "P" ~ for 2010, whatever that's supposed to mean
# code for teams
# code for players
# code for gameday
# code for pitch type
# code for atbat type
# checks for:
# gameday type
# home, away
# player, batter, pitch type
# -----------------------------------------------------------
DownloadPitchFX <- function(fileloc = "./pitchfx.txt",
start.date = "2009-05-02", end.date = start.date,
URL.base = "http://gd2.mlb.com/components/game/mlb/",
game.type = "R",
grab.pitch = c("des", "type", "x", "y",
"start_speed", "end_speed",
"sz_top", "sz_bot", "pfx_x", "pfx_z", "px", "pz",
"x0", "y0", "z0", "vx0", "vy0", "vz0", "ax", "ay", "az",
"break_y", "break_angle", "break_length", "pitch_type",
"type_confidence"),
grab.atbat = c("b", "s", "o", "batter", "pitcher", "b_height",
"stand", "p_throws", "event")) {
# write initial variables on file
meta <- c("Year", "Month", "Day", "Inning", "Home", "Away")
write(c(meta, grab.atbat, grab.pitch), file = fileloc,
ncol = length(c(grab.atbat, grab.pitch)) + length(meta), sep = "\t")
# transfer date info
start.date <- as.POSIXlt(start.date); end.date <- as.POSIXlt(end.date);
diff.date <- as.numeric(difftime(end.date, start.date))
date.range <- as.POSIXlt(seq(start.date, by = "days",
length = 1 + diff.date))
for (i in 1:(diff.date+1)) {
year <- date.range[i]$year + 1900
month <- date.range[i]$mon + 1
day <- date.range[i]$mday
URL.date <- paste(URL.base, "year_", year, "/",
ifelse(month >= 10, "month_", "month_0"), month, "/",
ifelse(day >= 10, "day_", "day_0"), day, "/", sep = "")
# grab matchups for today
## URL.scoreboard <- paste(URL.date, "miniscoreboard.xml", sep = "")
## XML.scoreboard <- xmlInternalTreeParse(URL.scoreboard)
## parse.scoreboard <- xpathSApply(XML.scoreboard, "//game[@gameday_link]",
## xmlGetAttr, "gameday_link")
HTML.day <- htmlParse(URL.date)
parse.day <- xpathSApply(HTML.day, "//a[@*]", xmlGetAttr, "href")
parse.day <- parse.day[grep("^gid_*", parse.day)]
# if games exists today
if (length(parse.day) >= 1) {
# for each game
for (game in 1:length(parse.day)) {
print(game)
URL.game <- paste(URL.date, parse.day[game], sep = "")
HTML.game <- htmlParse(URL.game)
parse.game.exists <- xpathSApply(HTML.game, "//a[@*]", xmlGetAttr, "href")
# if game.xml exists
if (sum(match(parse.game.exists, "game.xml"), na.rm = T) > 0) {
# grab game type (regular season, etc.) and gameday type (pitch f/x, etc.)
XML.game <- xmlInternalTreeParse(paste(URL.game, "game.xml", sep = ""))
parse.game <- sapply(c("type", "gameday_sw"), function (x)
xpathSApply(XML.game, "//game[@*]", xmlGetAttr, x))
# if proper game type: "R" ~ regular season, "S" ~ spring, "D" ~ divison series
# "L" ~ league chamption series, "W" ~ world series
if (parse.game['type'] == game.type) {
# grab team names
parse.teams <- sapply(c("abbrev"), function (x)
xpathSApply(XML.game, "//team[@*]", xmlGetAttr, x))
home <- parse.teams[1]; away <- parse.teams[2]
# if pitch f/x data exists
if (parse.game["gameday_sw"] == "E" | parse.game["gameday_sw"] == "P") {
# grab number of innings played
HTML.Ninnings <- htmlParse(paste(URL.game, "inning/", sep = ""))
parse.Ninnings <- xpathSApply(HTML.Ninnings, "//a[@*]", xmlGetAttr, "href")
# check to see if game exists data by checking innings > 1
if (length(grep("^inning_[0-9]", parse.Ninnings)) > 1) {
# for each inning
for (inning in 1:length(grep("^inning_[0-9]", parse.Ninnings))) {
# grab inning info
URL.inning <- paste(URL.game, "inning/", "inning_", inning,
".xml", sep = "")
XML.inning <- xmlInternalTreeParse(URL.inning)
parse.atbat <- xpathSApply(XML.inning, "//atbat[@*]")
parse.Npitches.atbat <- sapply(parse.atbat, function(x)
sum(names(xmlChildren(x)) == "pitch"))
# check to see if atbat exists
if (length(parse.atbat) > 0) {
print(paste(parse.day[game], "inning =", inning))
# parse attributes from pitch and atbat (ugh, ugly)
parse.pitch <- sapply(grab.pitch, function(x)
as.character(xpathSApply(XML.inning, "//pitch[@*]",
xmlGetAttr, x)))
parse.pitch <- if (class(parse.pitch) == "character") {
t(parse.pitch)
} else apply(parse.pitch, 2, as.character)
results.atbat <- t(sapply(parse.atbat, function(x)
xmlAttrs(x)[grab.atbat]))
results.atbat <- results.atbat[rep(seq(nrow(results.atbat)),
times = parse.Npitches.atbat),]
results.atbat <- if (class(results.atbat) == "character") {
t(results.atbat)
} else results.atbat
## parse.pitch <- sapply(grab.pitch, function(x)
## xpathSApply(XML.inning, "//pitch[@*]",
## xmlGetAttr, x))
## parse.pitch <- apply(parse.pitch, 2, as.character)
## results.atbat <- t(sapply(parse.atbat, function(x)
## xmlAttrs(x)[grab.atbat]))
## results.atbat <- results.atbat[rep(seq(nrow(results.atbat)),
## times = parse.Npitches.atbat),]
# write results
write(t(cbind(year, month, day, inning, home, away,
results.atbat, parse.pitch)), file = fileloc,
ncol = length(c(grab.atbat, grab.pitch)) + length(meta),
append = T, sep = "\t")
}
}
}
}
}
}
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment