Skip to content

Instantly share code, notes, and snippets.

@MonkmanMH
Created February 17, 2013 06:45
Show Gist options
  • Save MonkmanMH/4970480 to your computer and use it in GitHub Desktop.
Save MonkmanMH/4970480 to your computer and use it in GitHub Desktop.
MLB runs per game - league trends
# MAJOR LEAGUE BASEBALL - RUNS PER GAME TREND
#
# discussion at
# 1. http://bayesball.blogspot.ca/2012/07/trends-in-al-run-scoring-using-r.html
# 2. http://bayesball.blogspot.ca/2012/07/trends-in-run-scoring-nl-edition-more-r.html
# 3. http://bayesball.blogspot.ca/2012/08/trends-in-run-scoring-comparing-leagues.html
#
# data source: Baseball Reference
# http://www.baseball-reference.com
# http://www.baseball-reference.com/leagues/AL/bat.shtml
# http://www.baseball-reference.com/leagues/NL/bat.shtml
#
# open with "csv" option (top right corner of the table),
# copy and paste into text editor or Excel, save as CSV file
#
# set working directory
# setwd("K:/data/R_the software/datatrials/baseball/RunsPerGame")
# Read the season-level batting tables (comma-separated, header row).
# ALseason and NLseason are the data frames used by every section below.
ALseason <- read.table(file = "ALseasons.csv", sep = ",", header = TRUE)
#
# The NL table may have been saved as either .csv or .txt (the same
# comma-separated layout is assumed for both -- confirm if they differ).
# Read it once, from whichever file is present, rather than reading both
# and letting the second read overwrite the first; the unconditional
# second read also stopped the script when only the .csv file existed.
NLfile <- "NLseasons.csv"
if (!file.exists(NLfile)) {
  NLfile <- "NLseasons.txt"
}
NLseason <- read.table(file = NLfile, sep = ",", header = TRUE)
#
#
# RUNS SCORED PER GAME
# ====================
#
# start with American League
#
# quick look at the raw AL data: scatterplot from separate x and y vectors
plot(x = ALseason$Year, y = ALseason$R)
# the same scatterplot, written as a formula (y explained by x)
plot(ALseason$R ~ ALseason$Year)
#
# Fit a loess smoother (default span) of AL runs per game against year.
# The data frame is supplied through `data =` instead of writing
# ALseason$... inside the formula: the model terms are then plain "R" and
# "Year", so the fit can also be used with predict(..., newdata = ).
ALRunScore.LO <- loess(R ~ Year, data = ALseason)
# fitted values at the observed seasons, used below to draw the curve
ALRunScore.LO.predict <- predict(ALRunScore.LO)
#
# scatterplot of AL runs per game with the default loess fit drawn on top
ylim <- c(3, 6)
plot(x = ALseason$Year, y = ALseason$R,
     xlab = "year", ylab = "runs per game",
     main = "American League: runs per team per game, 1901-2012",
     ylim = ylim)
# background grid goes on first, so the smoothed line is drawn over it
grid()
# loess fitted values joined up as a solid red line
lines(x = ALseason$Year, y = ALRunScore.LO.predict,
      col = "red", lwd = 2, lty = "solid")
#
#
# VERSION 2 -- add "span" control to adjust smoothing
#
# references:
# http://princeofslides.blogspot.ca/2011/05/sab-r-metrics-basics-of-loess.html
# http://research.stowers-institute.org/efg/R/Statistics/loess.htm
#
# Refit the AL loess model with explicit "span" values (0.25 and 0.50) to
# compare different amounts of smoothing.
# As with the default fit, the data frame is passed through `data =` so the
# model terms are plain "R" and "Year" and predict(..., newdata = ) works.
ALRunScore.LO.25 <- loess(R ~ Year, data = ALseason, span = 0.25)
ALRunScore.LO.25.predict <- predict(ALRunScore.LO.25)
#
ALRunScore.LO.5 <- loess(R ~ Year, data = ALseason, span = 0.5)
ALRunScore.LO.5.predict <- predict(ALRunScore.LO.5)
#
# plot the AL data, then overlay the default, span=0.25 and span=0.50 fits
ylim <- c(3, 6)
plot(ALseason$R ~ ALseason$Year,
     ylim = ylim,
     main = "American League: runs per team per game, 1901-2012",
     xlab = "year", ylab = "runs per game")
# chart tidying: the grid goes on straight after the points (as in the
# default-span chart) so that it sits behind the curves and the legend box
# instead of being ruled across them
grid()
# loess predicted value lines, one per span setting
lines(ALseason$Year, ALRunScore.LO.predict,
      lty = "solid", col = "red", lwd = 2)
lines(ALseason$Year, ALRunScore.LO.25.predict,
      lty = "dashed", col = "blue", lwd = 2)
lines(ALseason$Year, ALRunScore.LO.5.predict,
      lty = "dotdash", col = "black", lwd = 2)
# legend entries are listed in the same order as the lines() calls above
legend(1980, 3.5,
       c("default", "span=0.25", "span=0.50"),
       lty = c("solid", "dashed", "dotdash"),
       col = c("red", "blue", "black"),
       lwd = c(2, 2, 2))
#
#
#
#-N-N-N-N-N-N-N-N-N-N-N-N-N-N
#
# NATIONAL LEAGUE
#
# Fit a loess smoother (default span) of NL runs per game against year.
# The data frame is supplied through `data =` instead of writing
# NLseason$... inside the formula: the model terms are then plain "R" and
# "Year", so the fit can also be used with predict(..., newdata = ).
NLRunScore.LO <- loess(R ~ Year, data = NLseason)
# fitted values at the observed seasons, used below to draw the curve
NLRunScore.LO.predict <- predict(NLRunScore.LO)
#
# scatterplot of NL runs per game (triangles) with the default loess fit
ylim <- c(3, 6)
plot(x = NLseason$Year, y = NLseason$R,
     col = "black", pch = 2,
     xlab = "year", ylab = "runs per game",
     main = "National League: runs per team per game, 1901-2012",
     ylim = ylim)
# loess fitted values joined up as a solid blue line
lines(x = NLseason$Year, y = NLRunScore.LO.predict,
      col = "blue", lwd = 2, lty = "solid")
# finish with the reference grid
grid()
#
#
# VERSION 2 -- add "span" control to adjust smoothing
#
# reference: http://research.stowers-institute.org/efg/R/Statistics/loess.htm
#
# Refit the NL loess model with explicit "span" values (0.25 and 0.50) to
# compare different amounts of smoothing.
# The data frame is passed through `data =` rather than NLseason$... in the
# formula, so the model terms are plain "R" and "Year" and
# predict(..., newdata = ) works on these fits.
NLRunScore.LO.25 <- loess(R ~ Year, data = NLseason, span = 0.25)
NLRunScore.LO.25.predict <- predict(NLRunScore.LO.25)
NLRunScore.LO.5 <- loess(R ~ Year, data = NLseason, span = 0.5)
NLRunScore.LO.5.predict <- predict(NLRunScore.LO.5)
#
# plot the NL data, then overlay the default, span=0.25 and span=0.50 fits
ylim <- c(3, 6)
plot(NLseason$R ~ NLseason$Year,
     pch = 2, col = "black",
     ylim = ylim,
     main = "National League: runs per team per game, 1901-2012",
     xlab = "year", ylab = "runs per game")
# chart tidying: draw the grid straight after the points so that it sits
# behind the curves and the legend box instead of being ruled across them
grid()
# loess predicted value lines, one per span setting
lines(NLseason$Year, NLRunScore.LO.predict,
      lty = "solid", col = "blue", lwd = 2)
lines(NLseason$Year, NLRunScore.LO.25.predict,
      lty = "dashed", col = "red", lwd = 2)
lines(NLseason$Year, NLRunScore.LO.5.predict,
      lty = "dotdash", col = "black", lwd = 2)
# legend entries are listed in the same order as the lines() calls above
legend(1980, 3.5,
       c("default", "span=0.25", "span=0.50"),
       lty = c("solid", "dashed", "dotdash"),
       col = c("blue", "red", "black"),
       lwd = c(2, 2, 2))
#
#
#
#
# MULTI-PLOT -- MERGING AL AND NL RESULTS
#
# both leagues on one chart, each season drawn as a point
ylim <- c(3, 6)
# the AL seasons set up the axes (plotting symbol 1, black)
plot(x = ALseason$Year, y = ALseason$R,
     type = "p", pch = 1, col = "black",
     xlab = "year", ylab = "runs per game",
     ylim = ylim,
     main = "Runs per team per game, 1901-2012")
# NL seasons are added as points (plotting symbol 2, blue)
points(x = NLseason$Year, y = NLseason$R, col = "blue", pch = 2)
# chart additions: grid, then a legend in the top-left corner
grid()
legend(x = 1900, y = 6, legend = c("AL", "NL"),
       col = c("black", "blue"), pch = c(1, 2))
#
# both leagues on one chart, seasons joined up as lines
ylim <- c(3, 6)
# the AL series sets up the axes (solid red line)
plot(x = ALseason$Year, y = ALseason$R,
     type = "l", col = "red", lwd = 2, lty = "solid",
     xlab = "year", ylab = "runs per game",
     ylim = ylim,
     main = "Runs per team per game, 1901-2012")
# NL series overlaid as a solid blue line
lines(x = NLseason$Year, y = NLseason$R, col = "blue", lwd = 2, lty = "solid")
# chart additions: grid, then a legend in the bottom-left corner
grid()
legend(x = 1900, y = 3.5, legend = c("AL", "NL"),
       col = c("red", "blue"), lty = c("solid", "solid"), lwd = c(2, 2))
#
#
# plot loess curves (span=0.25) for both leagues on one chart
ylim <- c(3, 6)
# start with AL line, which also sets up the axes
plot(ALRunScore.LO.25.predict ~ ALseason$Year,
     type = "l", lty = "solid", col = "red", lwd = 2,
     main = "Runs per team per game, 1901-2012",
     ylim = ylim,
     xlab = "year", ylab = "runs per game")
# grid before the remaining layers, so it is not ruled across the NL curve
# or the legend box
grid()
# add NL line
lines(NLseason$Year, NLRunScore.LO.25.predict,
      lty = "dashed", col = "blue", lwd = 2)
# chart additions
legend(1900, 3.5,
       c("AL (span=0.25)", "NL (span=0.25)"),
       lty = c("solid", "dashed"),
       col = c("red", "blue"),
       lwd = c(2, 2))
#
#
# plot loess curves (span=0.50) for both leagues on one chart
ylim <- c(3, 6)
# start with AL line, which also sets up the axes
plot(ALRunScore.LO.5.predict ~ ALseason$Year,
     type = "l", lty = "solid", col = "red", lwd = 2,
     main = "Runs per team per game, 1901-2012",
     ylim = ylim,
     xlab = "year", ylab = "runs per game")
# grid before the remaining layers, so it is not ruled across the NL curve
# or the legend box
grid()
# add NL line
lines(NLseason$Year, NLRunScore.LO.5.predict,
      lty = "dashed", col = "blue", lwd = 2)
# chart additions
legend(1900, 3.5,
       c("AL (span=0.50)", "NL (span=0.50)"),
       lty = c("solid", "dashed"),
       col = c("red", "blue"),
       lwd = c(2, 2))
#
#
#
# plot multiple loess curves (span=0.50 and 0.25) for both leagues;
# colour identifies the league, line type identifies the span
ylim <- c(3, 6)
# start with AL line (span=0.50), which also sets up the axes
plot(ALRunScore.LO.5.predict ~ ALseason$Year,
     type = "l", lty = "solid", col = "red", lwd = 2,
     main = "Runs per team per game, 1901-2012",
     ylim = ylim,
     xlab = "year", ylab = "runs per game")
# grid before the remaining layers, so it is not ruled across the other
# curves or the legend box
grid()
# add NL line (span=0.50)
lines(NLseason$Year, NLRunScore.LO.5.predict,
      lty = "solid", col = "blue", lwd = 2)
# add 0.25 lines
lines(ALseason$Year, ALRunScore.LO.25.predict,
      lty = "dashed", col = "red", lwd = 2)
lines(NLseason$Year, NLRunScore.LO.25.predict,
      lty = "dashed", col = "blue", lwd = 2)
# chart additions
legend(1900, 3.5,
       c("AL (span=0.50)", "NL (span=0.50)",
         "AL (span=0.25)", "NL (span=0.25)"),
       lty = c("solid", "solid", "dashed", "dashed"),
       col = c("red", "blue", "red", "blue"),
       lwd = c(2, 2, 2, 2))
#
# # # # # # # # # # # # # # # # # #
#
# calculate the difference between the two leagues (AL minus NL)
#
# Each AL season is paired with the NL season of the same Year, rather than
# relying on the two tables having identical length and row order; an NL
# table holding extra seasons, or sorted differently, would otherwise be
# recycled or paired with the wrong year.  An AL year that is absent from
# the NL table gives NA.  When the tables are already aligned the results
# are the same as a plain element-wise subtraction.
NLrow <- match(ALseason$Year, NLseason$Year)
# 1. absolute
RunDiff <- ALseason$R - NLseason$R[NLrow]
# 2. LOESS span=0.25
# NOTE(review): assumes the fitted-value vectors have one entry per row of
# their season table (i.e. no rows were dropped for missing values) -- confirm
RunDiffLO <- ALRunScore.LO.25.predict - NLRunScore.LO.25.predict[NLrow]
#
# smoothed (LOESS) AL-minus-NL difference drawn as a single red line
ylim <- c(-1, 1)
plot(x = ALseason$Year, y = RunDiffLO,
     type = "l", col = "red", lwd = 2, lty = "solid",
     xlab = "year", ylab = "runs per game",
     ylim = ylim,
     main = "Run scoring trend: AL difference from NL, 1901-2012")
# horizontal reference at zero, i.e. no difference between the leagues
abline(h = 0, lty = "dotdash")
grid()
#
# smoothed trend (thick red) with the season-by-season difference (thin
# black) overlaid as a second line
ylim <- c(-1, 1.5)
plot(x = ALseason$Year, y = RunDiffLO,
     type = "l", col = "red", lwd = 3, lty = "solid",
     xlab = "year", ylab = "runs per game",
     ylim = ylim,
     main = "Run scoring trend: AL difference from NL, 1901-2012")
# season-by-season difference
lines(x = ALseason$Year, y = RunDiff, col = "black", lwd = 1, lty = "solid")
# horizontal reference at zero, i.e. no difference between the leagues
abline(h = 0, lty = "dotdash")
grid()
#
#
# season-by-season difference as vertical bars (type "h"), with the
# smoothed trend drawn over them as a line
ylim <- c(-1, 1.5)
plot(x = ALseason$Year, y = RunDiff,
     type = "h", col = "blue", lwd = 2, lty = "solid",
     xlab = "year", ylab = "runs per game",
     ylim = ylim,
     main = "Run scoring trend: AL difference from NL, 1901-2012")
# LOESS (span=0.25) trend of the difference
lines(x = ALseason$Year, y = RunDiffLO, col = "black", lwd = 2, lty = "solid")
# horizontal reference at zero, i.e. no difference between the leagues
abline(h = 0, lty = "dotdash")
# chart additions: grid, then a legend in the top-left corner
grid()
legend(x = 1900, y = 1.5,
       legend = c("AL difference from NL: absolute",
                  "AL difference from NL, LOESS (span=0.25)"),
       col = c("blue", "black"),
       lty = c("solid", "solid"),
       lwd = c(2, 2))
#
#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment