MonkmanMH · February 25, 2013 04:43
diff --git a/gistfile1.r b/gistfile1.r
 # THE HISTORICAL RECORD - RUNS PER GAME
 #
 # discussion and output can be found at
 # http://bayesball.blogspot.ca/2013/02/comparing-individual-team-run-production.html
 #
 # data source: Lahman Database
 # http://www.seanlahman.com/baseball-archive/statistics/
 # 2012 version (1871-2012)
 # table: "Teams"
 #
 # note: R doesn't like something in line 141 of the stadium name fields,
 # so reading the data becomes a two-step process 
 #
 # Load the Lahman "Teams" table. read.csv() already returns a data frame;
 # the as.data.frame() wrapper is retained as the explicit second step of
 # the two-step read.
 Teams1 <- as.data.frame(read.csv("Teams.csv", header = TRUE))
 #
 # Keep only the seasons after 1946 (1947 forward to 2012)
 # see Bill James http://www.billjamesonline.com/dividing_baseball_history_into_eras/
 Teams <- Teams1[which(Teams1$yearID > 1946), ]
 #
 # Per-team season rates
 # average runs scored per game
 Teams$RPG <- with(Teams, R / G)
 # average runs allowed per game
 Teams$RAPG <- with(Teams, RA / G)
 #
 # calculate RPG and RAPG season averages for each league
 #
 # Sum runs scored (R), runs allowed (RA) and games (G) for every
 # season/league combination in one aggregate() call. This replaces the
 # earlier approach of three separate aggregate() calls stitched together
 # with two merge() calls.
 #   [NOTE: since the introduction of interleague play, a league's runs
 #   allowed is not the same as its runs scored!]
 LG_RPG <- aggregate(cbind(R, RA, G) ~ yearID + lgID, data = Teams, FUN = sum)
 #
 # aggregate() names the totals R, RA and G. Rename them so that
 #   (a) the per-game calculations below find the columns they refer to, and
 #   (b) merge() cannot treat the league totals as join keys that would
 #       have to equal the team-level R, RA and G columns in Teams.
 names(LG_RPG)[match(c("R", "RA", "G"), names(LG_RPG))] <-
   c("Teams.R", "Teams.RA", "Teams.G")
 #
 # calculate league runs and runs allowed per game
 LG_RPG$LG_RPG <- LG_RPG$Teams.R / LG_RPG$Teams.G
 LG_RPG$LG_RAPG <- LG_RPG$Teams.RA / LG_RPG$Teams.G
 #
 # Use the "merge" command to append the league values to the correct rows,
 # matching explicitly on season and league.
 # Creates a single data frame Teams.merge that contains the team runs etc
 # as well as the league run values for that season
 Teams.merge <- merge(Teams, LG_RPG, by = c("yearID", "lgID"))
 #
 #
 # CREATE INDEX VALUES FOR EACH TEAM
 #
 # A. Runs per game
 #
 # compare each team's runs/game to the league average that season
 # 1. use an index where 100=the league average for that season
 Teams.merge$R_index <- Teams.merge$RPG / Teams.merge$LG_RPG * 100
 # 2. and Z scores of the index scores: distance from the league average
 #    (100) in units of the standard deviation of all R_index values
 R_index.sd <- sd(Teams.merge$R_index)
 Teams.merge$R_Z <- (Teams.merge$R_index - 100) / R_index.sd
 #
 # calculate minimum, maximum, and standard deviation
 min(Teams.merge$R_index)
 max(Teams.merge$R_index)
 sd(Teams.merge$R_index)
 #
 # B. Runs allowed per game
 #
 # compare each team's runs allowed/game to the league average that season
 # 1. use an index where 100=the league average for that season
 Teams.merge$RA_index <- Teams.merge$RAPG / Teams.merge$LG_RAPG * 100
 # 2. and Z scores of the index scores; these must be scaled by the
 #    standard deviation of RA_index itself, not that of R_index
 RA_index.sd <- sd(Teams.merge$RA_index)
 Teams.merge$RA_Z <- (Teams.merge$RA_index - 100) / RA_index.sd
 #
 # calculate minimum, maximum, and standard deviation
 min(Teams.merge$RA_index)
 max(Teams.merge$RA_index)
 sd(Teams.merge$RA_index)
 #
 #
 # RANK AND SORT BY R_INDEX
 # 1. low to high (default)
 #   a. rank (1 = lowest R_index)
 Teams.merge$R_index_rank <- rank(Teams.merge$R_index)
 #   b. sort, then keep the teams more than 20 index points below average
 Teams.merge.sort <- Teams.merge[c("yearID", "lgID", "franchID", "R_index", "R_index_rank")]
 Teams.merge.sort <- Teams.merge.sort[order(Teams.merge.sort$R_index), ]
 Teams.low_off <- subset(Teams.merge.sort, R_index < 80)
 Teams.low_off
 write.csv(Teams.low_off, file = "Teams.low_off.csv")
 #
 # 2. high to low
 #   a. rank (note use of "-" in front of variable name; 1 = highest R_index)
 Teams.merge$R_index_rank <- rank(-Teams.merge$R_index)
 #   b. sort (note use of "decreasing=TRUE" in "order" command)
 #      Teams.merge.sort is rebuilt from Teams.merge first; otherwise it
 #      would still carry the low-to-high ranks computed in step 1.
 Teams.merge.sort <- Teams.merge[c("yearID", "lgID", "franchID", "R_index", "R_index_rank")]
 Teams.merge.sort <- Teams.merge.sort[order(Teams.merge.sort$R_index, decreasing = TRUE), ]
 Teams.hi_off <- subset(Teams.merge.sort, R_index > 120)
 Teams.hi_off
 write.csv(Teams.hi_off, file = "Teams.hi_off.csv")
 #
 #
 # PLOT!
 # Argument names are spelled out in full (xlab, probability) and TRUE is
 # used instead of T, so the calls do not rely on partial argument
 # matching or on T not having been reassigned.
 #
 # index (basic): frequency histogram of the run index
 hist(Teams.merge$R_index,
      main = "MLB teams 1947-2012: Distribution of scoring",
      xlab = "Index value (100=league average)")
 #
 # index with density curve: histogram on the density scale so the
 # kernel density estimate can be overlaid
 hist(Teams.merge$R_index,
      probability = TRUE,
      main = "MLB teams 1947-2012: Distribution & density curve of scoring",
      xlab = "Index value (100=league average)")
 lines(density(Teams.merge$R_index))
 #
 # Z scores, again on the density scale with the density curve overlaid
 hist(Teams.merge$R_Z, probability = TRUE)
 lines(density(Teams.merge$R_Z))
 #
 #
 # and write a new file with the Teams.merge data
 write.csv(Teams.merge, file = "Teams.merge.csv")
 #
	# THE HISTORICAL RECORD - RUNS PER GAME
	#
	# discussion and output can be found at
	# http://bayesball.blogspot.ca/2013/02/comparing-individual-team-run-production.html
	#
	# data source: Lahman Database
	# http://www.seanlahman.com/baseball-archive/statistics/
	# 2012 version (1871-2012)
	# table: "Teams"
	#
	# note: R doesn't like something in line 141 of the stadium name fields,
	# so reading the data becomes a two-step process
	#
	# Load the Lahman "Teams" table. read.csv() already returns a data frame;
	# the as.data.frame() wrapper is retained as the explicit second step of
	# the two-step read.
	Teams1 <- as.data.frame(read.csv("Teams.csv", header = TRUE))
	#
	# Keep only the seasons after 1946 (1947 forward to 2012)
	# see Bill James http://www.billjamesonline.com/dividing_baseball_history_into_eras/
	Teams <- Teams1[which(Teams1$yearID > 1946), ]
	#
	# Per-team season rates
	# average runs scored per game
	Teams$RPG <- with(Teams, R / G)
	# average runs allowed per game
	Teams$RAPG <- with(Teams, RA / G)
	#
	# calculate RPG and RAPG season averages for each league
	#
	# Sum runs scored (R), runs allowed (RA) and games (G) for every
	# season/league combination in one aggregate() call. This replaces the
	# earlier approach of three separate aggregate() calls stitched together
	# with two merge() calls.
	# [NOTE: since the introduction of interleague play, a league's runs
	# allowed is not the same as its runs scored!]
	LG_RPG <- aggregate(cbind(R, RA, G) ~ yearID + lgID, data = Teams, FUN = sum)
	#
	# aggregate() names the totals R, RA and G. Rename them so that
	#   (a) the per-game calculations below find the columns they refer to, and
	#   (b) merge() cannot treat the league totals as join keys that would
	#       have to equal the team-level R, RA and G columns in Teams.
	names(LG_RPG)[match(c("R", "RA", "G"), names(LG_RPG))] <-
	  c("Teams.R", "Teams.RA", "Teams.G")
	#
	# calculate league runs and runs allowed per game
	LG_RPG$LG_RPG <- LG_RPG$Teams.R / LG_RPG$Teams.G
	LG_RPG$LG_RAPG <- LG_RPG$Teams.RA / LG_RPG$Teams.G
	#
	# Use the "merge" command to append the league values to the correct rows,
	# matching explicitly on season and league.
	# Creates a single data frame Teams.merge that contains the team runs etc
	# as well as the league run values for that season
	Teams.merge <- merge(Teams, LG_RPG, by = c("yearID", "lgID"))
	#
	#
	# CREATE INDEX VALUES FOR EACH TEAM
	#
	# A. Runs per game
	#
	# compare each team's runs/game to the league average that season
	# 1. use an index where 100=the league average for that season
	Teams.merge$R_index <- Teams.merge$RPG / Teams.merge$LG_RPG * 100
	# 2. and Z scores of the index scores: distance from the league average
	#    (100) in units of the standard deviation of all R_index values
	R_index.sd <- sd(Teams.merge$R_index)
	Teams.merge$R_Z <- (Teams.merge$R_index - 100) / R_index.sd
	#
	# calculate minimum, maximum, and standard deviation
	min(Teams.merge$R_index)
	max(Teams.merge$R_index)
	sd(Teams.merge$R_index)
	#
	# B. Runs allowed per game
	#
	# compare each team's runs allowed/game to the league average that season
	# 1. use an index where 100=the league average for that season
	Teams.merge$RA_index <- Teams.merge$RAPG / Teams.merge$LG_RAPG * 100
	# 2. and Z scores of the index scores; these must be scaled by the
	#    standard deviation of RA_index itself, not that of R_index
	RA_index.sd <- sd(Teams.merge$RA_index)
	Teams.merge$RA_Z <- (Teams.merge$RA_index - 100) / RA_index.sd
	#
	# calculate minimum, maximum, and standard deviation
	min(Teams.merge$RA_index)
	max(Teams.merge$RA_index)
	sd(Teams.merge$RA_index)
	#
	#
	# RANK AND SORT BY R_INDEX
	# 1. low to high (default)
	# a. rank (1 = lowest R_index)
	Teams.merge$R_index_rank <- rank(Teams.merge$R_index)
	# b. sort, then keep the teams more than 20 index points below average
	Teams.merge.sort <- Teams.merge[c("yearID", "lgID", "franchID", "R_index", "R_index_rank")]
	Teams.merge.sort <- Teams.merge.sort[order(Teams.merge.sort$R_index), ]
	Teams.low_off <- subset(Teams.merge.sort, R_index < 80)
	Teams.low_off
	write.csv(Teams.low_off, file = "Teams.low_off.csv")
	#
	# 2. high to low
	# a. rank (note use of "-" in front of variable name; 1 = highest R_index)
	Teams.merge$R_index_rank <- rank(-Teams.merge$R_index)
	# b. sort (note use of "decreasing=TRUE" in "order" command)
	#    Teams.merge.sort is rebuilt from Teams.merge first; otherwise it
	#    would still carry the low-to-high ranks computed in step 1.
	Teams.merge.sort <- Teams.merge[c("yearID", "lgID", "franchID", "R_index", "R_index_rank")]
	Teams.merge.sort <- Teams.merge.sort[order(Teams.merge.sort$R_index, decreasing = TRUE), ]
	Teams.hi_off <- subset(Teams.merge.sort, R_index > 120)
	Teams.hi_off
	write.csv(Teams.hi_off, file = "Teams.hi_off.csv")
	#
	#
	# PLOT!
	# Argument names are spelled out in full (xlab, probability) and TRUE is
	# used instead of T, so the calls do not rely on partial argument
	# matching or on T not having been reassigned.
	#
	# index (basic): frequency histogram of the run index
	hist(Teams.merge$R_index,
	     main = "MLB teams 1947-2012: Distribution of scoring",
	     xlab = "Index value (100=league average)")
	#
	# index with density curve: histogram on the density scale so the
	# kernel density estimate can be overlaid
	hist(Teams.merge$R_index,
	     probability = TRUE,
	     main = "MLB teams 1947-2012: Distribution & density curve of scoring",
	     xlab = "Index value (100=league average)")
	lines(density(Teams.merge$R_index))
	#
	# Z scores, again on the density scale with the density curve overlaid
	hist(Teams.merge$R_Z, probability = TRUE)
	lines(density(Teams.merge$R_Z))
	#
	#
	# and write a new file with the Teams.merge data
	write.csv(Teams.merge, file = "Teams.merge.csv")
	#