Created
April 1, 2011 23:06
-
-
Save Btibert3/899024 to your computer and use it in GitHub Desktop.
Player Streakiness in the NHL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## basics
# R 2.12.2
# windows xp; Yes, I know

## libraries
library(XML)        # readHTMLTable()
library(plyr)       # rbind.fill(), ddply()
library(lubridate)  # date parsing and manipulation
library(ggplot2)    # plotting

# Set the working directory; Horton.Rdata is saved relative to it.
# NOTE(review): this path is specific to the original author's machine --
# adjust it (or run the script from the project directory) before use.
setwd("~/My Dropbox/Eclipse/Projects/R/NHL/Blog Posts/Player Streakiness")

# Set the constants
# BASE:   URL prefix of the game-log pages; the season and a trailing "/" are
#         appended to it for each request.
# SEASON: seasons to download (2005 is not in the list).
BASE <- "http://www.hockey-reference.com/players/h/hortona01/gamelog/"
SEASON <- c(2004, 2006:2011)
# Loop over the seasons and grab the data.
# Each season's cleaned table is stored in a list and everything is bound
# together once after the loop, rather than growing `ds` on every iteration.
season_logs <- vector("list", length(SEASON))
for (k in seq_along(SEASON)) {
  S <- SEASON[k]
  URL <- paste0(BASE, S, "/")
  tables <- readHTMLTable(URL)$stats

  # BE KIND when scraping: pause after every request, successful or not
  Sys.sleep(10)

  # The page may not contain a table called "stats"; skip that season
  # instead of failing further down with an obscure error.
  if (is.null(tables)) {
    warning("No 'stats' table found at ", URL, "; skipping season ", S,
            call. = FALSE)
    next
  }

  # The numeric conversion below addresses columns by position up to 19
  stopifnot(ncol(tables) >= 19)

  # fix factors and names: every column to character, names to lower case
  tables[] <- lapply(tables, as.character)
  names(tables) <- tolower(colnames(tables))

  # give columns 6, 8 and 9 explicit names
  names(tables)[c(6, 8, 9)] <- c("AwayHome", "WinLoss", "goals")

  # fix the columns - NAs forced by coercion warnings are expected here:
  # any value that is not a number becomes NA
  numeric_cols <- c(1:2, 9:19)
  tables[numeric_cols] <- lapply(tables[numeric_cols], as.numeric)
  str(tables)

  tables$year <- S
  season_logs[[k]] <- tables
}

# Drop the seasons that were skipped, then bind everything in one call
season_logs <- Filter(Negate(is.null), season_logs)
if (length(season_logs) == 0) {
  stop("No game logs could be downloaded", call. = FALSE)
}
ds <- do.call(rbind.fill, season_logs)
# Quick look at the combined game log
with(ds, table(year))
head(ds, n = 30)
dim(ds)

# Keep only the rows whose rank column was converted to a number
ds <- ds[!is.na(ds$rk), ]
dim(ds)
head(ds, n = 30)
save(ds, file = "Horton.Rdata")

# Need to change the date to an actual date in R.
# ymd() parses "year-month-day" strings; values it cannot parse become NA.
str(ds)
ds$date <- ymd(ds$date)
str(ds)

# Format to the month year = do so by setting all with the same arbitrary year.
# Set the last months of the season to the year plus 1 so the dates are in
# "order" when plotted.
# The pair 2011/2012 is used so that the January-September part falls in a
# leap year: a game played on Feb 29 would otherwise become an invalid date
# (NA). which() keeps NA dates out of the subscripts, because NA is not allowed
# in a subscripted assignment.
fall_idx <- which(month(ds$date) >= 10)
spring_idx <- which(month(ds$date) < 10)
if (length(fall_idx) > 0) {
  ds$date[fall_idx] <- update(ds$date[fall_idx], year = 2011)
}
if (length(spring_idx) > 0) {
  ds$date[spring_idx] <- update(ds$date[spring_idx], year = 2012)
}
head(ds, n = 40)

# Help received from
# http://stackoverflow.com/questions/5494216/extract-date-in-r

# add cumulative goals by season and make a new dataframe
# (cumsum follows the row order within each year)
gamelog <- ddply(ds, .(year), transform, cumegoals = cumsum(goals))

# plot the data: one panel per season, cumulative goals against date
ggplot(gamelog, aes(x = date, y = cumegoals)) +
  geom_point() +
  geom_line() +
  facet_wrap(~year, ncol = 1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment