Skip to content

Instantly share code, notes, and snippets.

@glamp
Created May 27, 2016 15:53
Show Gist options
  • Save glamp/68810d9ca13c30e0d0ecd163808172d5 to your computer and use it in GitHub Desktop.
Save glamp/68810d9ca13c30e0d0ecd163808172d5 to your computer and use it in GitHub Desktop.
import feather
import pandas as pd
from ggplot import *
# Load the two tables from Feather files in the current working directory.
# `standings` is loaded here but not used further in this section.
standings = feather.read_dataframe('./standings.feather')
attendance = feather.read_dataframe('./attendance.feather')

# Group the payroll column by season once; the per-year mean, min and max
# below are all derived from this same grouping.
payroll_by_year = attendance[['year', 'est_payroll']].groupby('year')

# Per-year mean payroll scaled down by 1000, shown for 1970 and 2010 only.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
payrolls = payroll_by_year.mean() / 1000
print(payrolls[(payrolls.index == 1970) | (payrolls.index == 2010)])

# Normalisation 1: each team's payroll as a multiple of that season's
# league-wide mean payroll.
mean_payrolls = payroll_by_year.mean().reset_index()
mean_payrolls.columns = ['year', 'league_mean_payroll']
attendance = pd.merge(attendance, mean_payrolls, on='year')
attendance['norm_payroll'] = attendance.est_payroll / attendance.league_mean_payroll
print(ggplot(attendance, aes(x='norm_payroll')) + geom_histogram())
print(ggplot(attendance, aes(x='norm_payroll', color='factor(year)')) + geom_density())

# Normalisation 2: min-max scaling within each season, so the lowest payroll
# of a year maps to 0 and the highest to 1.
min_payrolls = payroll_by_year.min().reset_index()
min_payrolls.columns = ['year', 'league_min_payroll']
max_payrolls = payroll_by_year.max().reset_index()
max_payrolls.columns = ['year', 'league_max_payroll']
attendance = pd.merge(attendance, min_payrolls, on='year')
attendance = pd.merge(attendance, max_payrolls, on='year')
payroll_range = attendance.league_max_payroll - attendance.league_min_payroll
attendance['norm_payroll_0_1'] = (attendance.est_payroll - attendance.league_min_payroll) / payroll_range
print(ggplot(attendance, aes(x='norm_payroll_0_1')) + geom_histogram())
print(ggplot(attendance, aes(x='norm_payroll_0_1', color='factor(year)')) + geom_density())
library(XML)
library(stringr)
library(plyr)
library(ggplot2)
#' Convert formatted number strings to numeric values.
#'
#' Removes every comma and every dollar sign from `x`, then casts the
#' result with `as.numeric()`. Anything that still is not a valid number
#' after stripping becomes `NA`, as `as.numeric()` normally does.
#'
#' @param x A vector of strings such as "1,234" or "$5,600,000".
#' @return A numeric vector the same length as `x`.
make_numeric <- function(x) {
  # Patterns are applied in order: commas first, then dollar signs.
  unwanted <- c(",", "[$]")
  cleaned <- x
  for (pattern in unwanted) {
    cleaned <- str_replace_all(cleaned, pattern, "")
  }
  as.numeric(cleaned)
}
# Seasons to scrape, inclusive on both ends.
years <- seq(1950, 2015)

# Download one "misc" page per season, keep the first HTML table found on
# it, tag every row with the season, and stack the yearly tables into one
# data frame.
print("Scraping attendance data...")
attendance <- ldply(years, function(season) {
  misc_url <- paste0("http://www.baseball-reference.com/leagues/MLB/", season, "-misc.shtml")
  page_tables <- readHTMLTable(misc_url, stringsAsFactors = FALSE)
  season_table <- page_tables[[1]]
  season_table$year <- season
  season_table
}, .progress = "text")

# Replace the scraped headers with short snake_case names (positional).
colnames(attendance) <- c(
  "tm", "attendance", "attend_per_game", "batage", "page",
  "bpf", "ppf", "n_hof", "n_aallstars", "n_a_ta_s", "est_payroll", "time",
  "managers", "year"
)

# These columns arrive as formatted text; strip the "," and "$" characters
# and cast them to numbers.
attendance[["attendance"]] <- make_numeric(attendance[["attendance"]])
attendance[["attend_per_game"]] <- make_numeric(attendance[["attend_per_game"]])
attendance[["est_payroll"]] <- make_numeric(attendance[["est_payroll"]])
# Download one standings page per season, keep the LAST HTML table found on
# it, tag every row with the season, and drop the "Avg" summary row before
# stacking the yearly tables into one data frame.
print("Scraping standings data...")
standings <- ldply(years, function(season) {
  standings_url <- paste0("http://www.baseball-reference.com/leagues/MLB/", season, "-standings.shtml")
  page_tables <- readHTMLTable(standings_url, stringsAsFactors = FALSE)
  season_table <- page_tables[[length(page_tables)]]
  season_table$year <- season
  subset(season_table, Tm != "Avg")
}, .progress = "text")

# Replace the scraped headers with short names (positional).
# NOTE(review): "year" sits before "inter" here; presumably the interleague
# column only exists for later seasons and is therefore appended after
# "year" when the yearly tables are combined -- confirm against the data.
colnames(standings) <- c(
  "rk", "tm", "lg", "g", "w", "l", "wins_losses", "r", "ra",
  "rdiff", "sos", "srs", "pythwl", "luck", "home", "road", "exinn", "1run",
  "vrhp", "vlhp", "vs_teams_above_500", "vs_teams_below_500", "year", "inter"
)

# Cast the text columns used downstream to numbers.
standings[["g"]] <- make_numeric(standings[["g"]])
standings[["w"]] <- make_numeric(standings[["w"]])
standings[["l"]] <- make_numeric(standings[["l"]])
standings[["r"]] <- make_numeric(standings[["r"]])
standings[["wins_losses"]] <- make_numeric(standings[["wins_losses"]])
# Combine each team-season's standings row with its attendance/payroll row.
df <- merge(standings, attendance, by = c("tm", "year"))

# Season preceding each standings row. Kept on `standings` because the
# table is exported with this column further down in the script.
standings$last_year <- standings$year - 1

# Attach the PREVIOUS season's win total to every team-season.
# A standings row for season Y supplies "last year's wins" to season Y + 1,
# so the lookup is keyed on year + 1. (Joining on year - 1 instead would
# attach the following season's wins under the name w_last_year.)
# Naming the column up front also means the current-season "w" column is
# never suffixed by merge(), so no positional renaming is needed afterwards.
prior_wins <- data.frame(
  tm = standings$tm,
  year = standings$year + 1,
  w_last_year = standings$w,
  stringsAsFactors = FALSE
)

# Inner join: team-seasons with no prior season in `standings` are dropped.
df <- merge(df, prior_wins, by = c("tm", "year"))
head(df)
library(feather)
# Export both scraped tables as Feather files in the current working
# directory. The Python script in this gist loads files with these same names
# ('./standings.feather' and './attendance.feather') via
# feather.read_dataframe().
# `standings` is written as it stands at this point, i.e. including the
# `last_year` column added before the merge step.
write_feather(standings, "standings.feather")
write_feather(attendance, "attendance.feather")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment