Created
February 25, 2013 04:43
-
-
Save MonkmanMH/5027789 to your computer and use it in GitHub Desktop.
MLB runs per game - league average
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# THE HISTORICAL RECORD - RUNS PER GAME
#
# discussion and output can be found at
# http://bayesball.blogspot.ca/2013/02/comparing-individual-team-run-production.html
#
# data source: Lahman Database
# http://www.seanlahman.com/baseball-archive/statistics/
# 2012 version (1871-2012)
# table: "Teams"
#
# note: the original author found that R doesn't like something in line 141
# of the stadium name fields and therefore read the data in two steps
# (read.csv() followed by as.data.frame()). read.csv() already returns a
# data frame, so that second step changed nothing and is folded away here.
#
# read the file into a data frame (header = TRUE is the read.csv() default)
Teams1 <- read.csv("Teams.csv")
#
# keep only the seasons from 1947 onward (the 2012 data set ends with 2012)
# see Bill James http://www.billjamesonline.com/dividing_baseball_history_into_eras/
# subset() already returns a data frame, so no further conversion is needed
Teams <- subset(Teams1, yearID > 1946)
# | |
# per-game scoring rates for every team-season
# runs scored per game
Teams$RPG <- with(Teams, R / G)
# runs allowed per game
Teams$RAPG <- with(Teams, RA / G)
# | |
# calculate RPG and RAPG season averages for each league
#
# step 1: total runs scored (R), runs allowed (RA) and games (G) for every
# season/league combination, in a single aggregate() call. This replaces the
# earlier three separate aggregates (RunsLG, RunsALG, GamesLG) and the two
# merges that stitched them back together; those intermediate tables are no
# longer created.
# [NOTE: since the introduction of interleague play, league runs allowed is
#  not the same as league runs scored!]
LG_RPG <- aggregate(cbind(R, RA, G) ~ yearID + lgID, data = Teams, FUN = sum)
#
# step 2: give the league totals their own names. Left as R / RA / G they
# would not be found by the per-game calculation below, and merge() would
# treat them as extra join keys because Teams has columns with the same names.
names(LG_RPG)[match(c("R", "RA", "G"), names(LG_RPG))] <-
  c("Teams.R", "Teams.RA", "Teams.G")
#
# step 3: league runs and runs allowed per game
LG_RPG$LG_RPG <- LG_RPG$Teams.R / LG_RPG$Teams.G
LG_RPG$LG_RAPG <- LG_RPG$Teams.RA / LG_RPG$Teams.G
#
# step 4: use "merge" to append the league values to the correct rows.
# Teams.merge is a single data frame that contains the team runs etc. as well
# as the league run values for that season. The join keys are spelled out so
# the match never depends on which other column names the two tables share.
Teams.merge <- merge(Teams, LG_RPG, by = c("yearID", "lgID"))
# | |
# | |
# CREATE INDEX VALUES FOR EACH TEAM
#
# A. Runs per game
#
# express each team's runs per game relative to its league's average for the
# same season
# 1. index where 100 = the league average for that season
Teams.merge$R_index <- with(Teams.merge, RPG / LG_RPG * 100)
# 2. Z score: distance of the index from 100, measured in standard deviations
#    of the index across all team-seasons
R_index.sd <- sd(Teams.merge$R_index)
Teams.merge$R_Z <- with(Teams.merge, (R_index - 100) / R_index.sd)
#
# show the minimum, maximum, and standard deviation of the index
with(Teams.merge, min(R_index))
with(Teams.merge, max(R_index))
with(Teams.merge, sd(R_index))
# | |
# B. Runs allowed per game
#
# express each team's runs allowed per game relative to its league's average
# for the same season
# 1. index where 100 = the league average for that season
Teams.merge$RA_index <- Teams.merge$RAPG / Teams.merge$LG_RAPG * 100
# 2. Z score of the index. It is scaled by the standard deviation of the
#    runs-allowed index itself (RA_index.sd), not by the runs-scored one.
RA_index.sd <- sd(Teams.merge$RA_index)
Teams.merge$RA_Z <- (Teams.merge$RA_index - 100) / RA_index.sd
#
# show the minimum, maximum, and standard deviation of the index
min(Teams.merge$RA_index)
max(Teams.merge$RA_index)
sd(Teams.merge$RA_index)
# | |
# | |
# RANK AND SORT BY R_INDEX
# 1. low to high (default)
# a. rank: 1 = lowest index value
Teams.merge$R_index_rank <- rank(Teams.merge$R_index)
# b. sort: keep the identifying columns plus index and rank, ordered by index
Teams.merge.sort <- Teams.merge[
  order(Teams.merge$R_index),
  c("yearID", "lgID", "franchID", "R_index", "R_index_rank")
]
# team-seasons that scored less than 80% of their league's average
Teams.low_off <- subset(Teams.merge.sort, R_index < 80)
Teams.low_off
write.csv(Teams.low_off, file = "Teams.low_off.csv")
# | |
# 2. high to low
# a. rank (note use of "-" in front of variable name, so 1 = highest index)
Teams.merge$R_index_rank <- rank(-Teams.merge$R_index)
# b. sort (note use of "decreasing = TRUE" in "order" command)
#    The sorted table is rebuilt from Teams.merge so that it carries the
#    high-to-low ranks computed just above; re-sorting the existing
#    Teams.merge.sort would keep its older low-to-high ranks.
Teams.merge.sort <- Teams.merge[
  order(Teams.merge$R_index, decreasing = TRUE),
  c("yearID", "lgID", "franchID", "R_index", "R_index_rank")
]
# team-seasons that scored more than 120% of their league's average
Teams.hi_off <- subset(Teams.merge.sort, R_index > 120)
Teams.hi_off
write.csv(Teams.hi_off, file = "Teams.hi_off.csv")
# | |
# | |
# PLOT!
# Argument names are written out in full (xlab, probability) and TRUE is
# spelled out, so the calls do not rely on partial argument matching or on
# the reassignable shorthand T.
#
# index (basic): histogram of counts
hist(Teams.merge$R_index,
     main = "MLB teams 1947-2012: Distribution of scoring",
     xlab = "Index value (100=league average)")
#
# index with density curve: probability = TRUE puts the bars on the density
# scale so the kernel density line can be overlaid on the same axes
hist(Teams.merge$R_index,
     probability = TRUE,
     main = "MLB teams 1947-2012: Distribution & density curve of scoring",
     xlab = "Index value (100=league average)")
lines(density(Teams.merge$R_index))
#
# Z scores, with default title and axis labels
hist(Teams.merge$R_Z, probability = TRUE)
lines(density(Teams.merge$R_Z))
# | |
# | |
# finally, save the full Teams.merge data frame to a new CSV file
write.csv(x = Teams.merge, file = "Teams.merge.csv")
# |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment