# Skip to content
#
# Instantly share code, notes, and snippets.
#
# @MonkmanMH
# Created June 5, 2013 04:21
# Show Gist options
#   - Save MonkmanMH/5711584 to your computer and use it in GitHub Desktop.
# Save MonkmanMH/5711584 to your computer and use it in GitHub Desktop.
# MLB runs per game (Lahman database)
# Load the Lahman package and its "Teams" data set.
# The package is installed only when it is not already available, so that
# re-running the script does not download and reinstall it every time.
if (!requireNamespace("Lahman", quietly = TRUE)) {
  install.packages("Lahman")
}
library("Lahman")
data(Teams)
#
#
# CREATE LEAGUE SUMMARY TABLES
# ============================
#
# Select the team seasons from 1901 [the establishment of the American
# League] through 2012. The upper bound is stated explicitly because the
# chart titles below are fixed to "1901-2012", while the Teams table may hold
# later seasons depending on the installed version of the Lahman package.
Teams_sub <- as.data.frame(subset(Teams, yearID > 1900 & yearID <= 2012))
#
# calculate each team's average runs and runs allowed per game
Teams_sub$RPG <- Teams_sub$R / Teams_sub$G
Teams_sub$RAPG <- Teams_sub$RA / Teams_sub$G
#
# create new data frame with season totals (runs, runs allowed, games)
# for each league
LG_RPG <- aggregate(cbind(R, RA, G) ~ yearID + lgID,
                    data = Teams_sub, FUN = sum)
# calculate league + season runs and runs allowed per game
# (league total divided by league total games)
LG_RPG$LG_RPG <- LG_RPG$R / LG_RPG$G
LG_RPG$LG_RAPG <- LG_RPG$RA / LG_RPG$G
#
# Split the league summary into separate league tables. LG_RPG is built from
# Teams_sub, which is already restricted to the seasons of interest, so only
# the league filter is needed here.
ALseason <- subset(LG_RPG, lgID == "AL")
NLseason <- subset(LG_RPG, lgID == "NL")
#
# +++++++++++++++++++++++++++++++++++++++++++++++++++
#
# RUNS SCORED PER GAME
# ====================
#
# web references:
# http://princeofslides.blogspot.ca/2011/05/sab-r-metrics-basics-of-loess.html
# http://research.stowers-institute.org/efg/R/Statistics/loess.htm
#
# AMERICAN LEAGUE
# Fit LOESS models of league runs per game against season. The formulas name
# the columns and receive the table through `data =` (instead of
# `ALseason$...` terms) so that each model records plain variable names and
# can also be used with predict(..., newdata = ).
#
# ALRunScore.LO: loess model with the default span
ALRunScore.LO <- loess(LG_RPG ~ yearID, data = ALseason)
# predict() without newdata returns the fitted value for each season
ALRunScore.LO.predict <- predict(ALRunScore.LO)
#
# ALRunScore.LO.XX: loess models with explicit "span" control
ALRunScore.LO.25 <- loess(LG_RPG ~ yearID, data = ALseason, span = 0.25)
ALRunScore.LO.25.predict <- predict(ALRunScore.LO.25)
#
ALRunScore.LO.5 <- loess(LG_RPG ~ yearID, data = ALseason, span = 0.5)
ALRunScore.LO.5.predict <- predict(ALRunScore.LO.5)
#
# plot the season values, then add the three loess curves
ylim <- c(3, 6)
plot(LG_RPG ~ yearID, data = ALseason,
     ylim = ylim,
     main = "American League: runs per team per game, 1901-2012",
     xlab = "year", ylab = "runs per game")
# loess fitted value lines: default span, span = 0.25, span = 0.50
lines(ALseason$yearID, ALRunScore.LO.predict,
      lty = "solid", col = "red", lwd = 2)
lines(ALseason$yearID, ALRunScore.LO.25.predict,
      lty = "dashed", col = "blue", lwd = 2)
lines(ALseason$yearID, ALRunScore.LO.5.predict,
      lty = "dotdash", col = "black", lwd = 2)
# chart tidying: legend entries are listed in the same order as the lines
legend(1980, 3.5,
       c("default", "span=0.25", "span=0.50"),
       lty = c("solid", "dashed", "dotdash"),
       col = c("red", "blue", "black"),
       lwd = c(2, 2, 2))
grid()
#
# NATIONAL LEAGUE
# Fit LOESS models of league runs per game against season. The formulas name
# the columns and receive the table through `data =` (instead of
# `NLseason$...` terms) so that each model records plain variable names and
# can also be used with predict(..., newdata = ).
#
# NLRunScore.LO: loess model with the default span
NLRunScore.LO <- loess(LG_RPG ~ yearID, data = NLseason)
# predict() without newdata returns the fitted value for each season
NLRunScore.LO.predict <- predict(NLRunScore.LO)
#
# NLRunScore.LO.XX: loess models with explicit "span" control
NLRunScore.LO.25 <- loess(LG_RPG ~ yearID, data = NLseason, span = 0.25)
NLRunScore.LO.25.predict <- predict(NLRunScore.LO.25)
NLRunScore.LO.5 <- loess(LG_RPG ~ yearID, data = NLseason, span = 0.5)
NLRunScore.LO.5.predict <- predict(NLRunScore.LO.5)
#
# plot the season values (triangles), then add the three loess curves
ylim <- c(3, 6)
plot(LG_RPG ~ yearID, data = NLseason,
     pch = 2, col = "black",
     ylim = ylim,
     main = "National League: runs per team per game, 1901-2012",
     xlab = "year", ylab = "runs per game")
# loess fitted value lines: default span, span = 0.25, span = 0.50
lines(NLseason$yearID, NLRunScore.LO.predict,
      lty = "solid", col = "blue", lwd = 2)
lines(NLseason$yearID, NLRunScore.LO.25.predict,
      lty = "dashed", col = "red", lwd = 2)
lines(NLseason$yearID, NLRunScore.LO.5.predict,
      lty = "dotdash", col = "black", lwd = 2)
# chart tidying: legend entries are listed in the same order as the lines
legend(1980, 3.5,
       c("default", "span=0.25", "span=0.50"),
       lty = c("solid", "dashed", "dotdash"),
       col = c("blue", "red", "black"),
       lwd = c(2, 2, 2))
grid()
#
#
# COMBINED CHART -- AL AND NL ON THE SAME AXES
# season-by-season runs per game, one line per league
ylim <- c(3, 6)
# open the chart with the AL series
plot(x = ALseason$yearID, y = ALseason$LG_RPG,
     type = "l", lty = "solid", col = "red", lwd = 2,
     ylim = ylim,
     main = "Runs per team per game, 1901-2012",
     xlab = "year", ylab = "runs per game")
# overlay the NL series
lines(x = NLseason$yearID, y = NLseason$LG_RPG,
      lty = "solid", col = "blue", lwd = 2)
# grid and legend
grid()
legend(x = 1900, y = 3.5,
       legend = c("AL", "NL"),
       lty = rep("solid", 2),
       col = c("red", "blue"),
       lwd = rep(2, 2))
#
#
# both leagues' loess curves fitted with span=0.25
ylim <- c(3, 6)
# open the chart with the AL curve
plot(x = ALseason$yearID, y = ALRunScore.LO.25.predict,
     type = "l", lty = "solid", col = "red", lwd = 2,
     ylim = ylim,
     main = "Runs per team per game, 1901-2012",
     xlab = "year", ylab = "runs per game")
# overlay the NL curve
lines(x = NLseason$yearID, y = NLRunScore.LO.25.predict,
      lty = "dashed", col = "blue", lwd = 2)
# legend and grid
legend(x = 1900, y = 3.5,
       legend = c("AL (span=0.25)", "NL (span=0.25)"),
       lty = c("solid", "dashed"),
       col = c("red", "blue"),
       lwd = rep(2, 2))
grid()
#
#
# both leagues' loess curves fitted with span=0.50
ylim <- c(3, 6)
# open the chart with the AL curve
plot(x = ALseason$yearID, y = ALRunScore.LO.5.predict,
     type = "l", lty = "solid", col = "red", lwd = 2,
     ylim = ylim,
     main = "Runs per team per game, 1901-2012",
     xlab = "year", ylab = "runs per game")
# overlay the NL curve
lines(x = NLseason$yearID, y = NLRunScore.LO.5.predict,
      lty = "dashed", col = "blue", lwd = 2)
# legend and grid
legend(x = 1900, y = 3.5,
       legend = c("AL (span=0.50)", "NL (span=0.50)"),
       lty = c("solid", "dashed"),
       col = c("red", "blue"),
       lwd = rep(2, 2))
grid()
#
#
#
# four loess curves on one chart: each league at span=0.50 and span=0.25
# (colour identifies the league, line type identifies the span)
ylim <- c(3, 6)
# open the chart with the AL span=0.50 curve
plot(x = ALseason$yearID, y = ALRunScore.LO.5.predict,
     type = "l", lty = "solid", col = "red", lwd = 2,
     ylim = ylim,
     main = "Runs per team per game, 1901-2012",
     xlab = "year", ylab = "runs per game")
# overlay the NL span=0.50 curve
lines(x = NLseason$yearID, y = NLRunScore.LO.5.predict,
      lty = "solid", col = "blue", lwd = 2)
# overlay the span=0.25 curves, AL first and then NL
lines(x = ALseason$yearID, y = ALRunScore.LO.25.predict,
      lty = "dashed", col = "red", lwd = 2)
lines(x = NLseason$yearID, y = NLRunScore.LO.25.predict,
      lty = "dashed", col = "blue", lwd = 2)
# legend and grid
legend(x = 1900, y = 3.5,
       legend = c("AL (span=0.50)", "NL (span=0.50)",
                  "AL (span=0.25)", "NL (span=0.25)"),
       lty = rep(c("solid", "dashed"), each = 2),
       col = rep(c("red", "blue"), times = 2),
       lwd = rep(2, 4))
grid()
#
# # # # # # # # # # # # # # # # # #
#
# Calculate the difference between the two leagues (AL minus NL).
# Both subtractions pair the AL and NL values by position, so first confirm
# that the two league tables list the same seasons in the same order;
# otherwise values from different seasons would be subtracted from each other.
stopifnot(identical(ALseason$yearID, NLseason$yearID))
# 1. season-by-season values (a signed difference: positive means the AL
#    scored more runs per game than the NL)
RunDiff <- ALseason$LG_RPG - NLseason$LG_RPG
# 2. LOESS fitted values, span=0.25
RunDiffLO <- ALRunScore.LO.25.predict - NLRunScore.LO.25.predict
#
# chart the difference between the AL and NL loess curves (RunDiffLO)
ylim <- c(-1, 1)
plot(x = ALseason$yearID, y = RunDiffLO,
     type = "l", lty = "solid", col = "red", lwd = 2,
     ylim = ylim,
     main = "Run scoring trend: AL difference from NL, 1901-2012",
     xlab = "year", ylab = "runs per game")
# horizontal reference line where the two leagues are equal
abline(h = 0, lty = "dotdash")
grid()
#
# chart the loess difference (thick red line) together with the
# season-by-season difference (thin black line)
ylim <- c(-1, 1.5)
plot(x = ALseason$yearID, y = RunDiffLO,
     type = "l", lty = "solid", col = "red", lwd = 3,
     ylim = ylim,
     main = "Run scoring trend: AL difference from NL, 1901-2012",
     xlab = "year", ylab = "runs per game")
# overlay the season-by-season difference (RunDiff)
lines(x = ALseason$yearID, y = RunDiff, lty = "solid", col = "black", lwd = 1)
# horizontal reference line where the two leagues are equal
abline(h = 0, lty = "dotdash")
grid()
#
#
# chart the season-by-season difference as vertical bars (type = "h")
# with the loess difference drawn over them as a line
ylim <- c(-1, 1.5)
plot(x = ALseason$yearID, y = RunDiff,
     type = "h", lty = "solid", col = "blue", lwd = 2,
     ylim = ylim,
     main = "Run scoring trend: AL difference from NL, 1901-2012",
     xlab = "year", ylab = "runs per game")
# overlay the loess difference (RunDiffLO) as the trend line
lines(x = ALseason$yearID, y = RunDiffLO, lty = "solid", col = "black", lwd = 2)
# horizontal reference line where the two leagues are equal
abline(h = 0, lty = "dotdash")
# grid and legend
grid()
legend(x = 1900, y = 1.5,
       legend = c("AL difference from NL: absolute",
                  "AL difference from NL, LOESS (span=0.25)"),
       lty = rep("solid", 2),
       col = c("blue", "black"),
       lwd = rep(2, 2))
#
#
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment