Created
April 2, 2011 18:39
-
-
Save Btibert3/899745 to your computer and use it in GitHub Desktop.
Function to grab NHL Skater Stats for a season
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Source the file with the Grab Skaters function | |
setwd("~/My Dropbox/Eclipse/Projects/R/NHL/Blog Posts/Scripts") | |
source("SkaterStats.R") | |
## set the working directory | |
setwd("~/My Dropbox/Eclipse/Projects/R/NHL/Blog Posts/30 Goals 100 PIM") | |
## set the seasons | |
SEASON <- as.character(c(1960:2004, 2006:2011)) | |
#----------------------------------------------------------------------- | |
# Use the function to loop over the seasons and piece together | |
#----------------------------------------------------------------------- | |
## define the seasons -- 2005 dataset doesnt exist | |
## if I was a good coder I would trap the error, but this works | |
SEASON <- as.character(c(1960:2004, 2006:2011)) | |
## create an empy dataset that we will append to | |
dataset <- data.frame() | |
## loop over the seasons, use the function to grab the data | |
## and build the dataset | |
for (S in SEASON) { | |
require(plyr) | |
temp <- GrabSkaters(S) | |
dataset <- rbind.fill(dataset, temp) | |
print(paste("Completed Season ", S, sep="")) | |
## pause the script so we don't kill their servers | |
Sys.sleep(10) | |
} | |
head(dataset); tail(dataset); | |
dataset <- dataset[dataset$tm != 'TOT', ] | |
save(dataset, file="hr skater db.Rdata") | |
#----------------------------------------------------------------------- | |
# Lets get a sense for how many players finish 30/100 | |
#----------------------------------------------------------------------- | |
# Let's look at a scatterplot of the data | |
png("Scatter Plot.png") | |
plot(x=dataset$pim, y=dataset$g, type="p", pch=20, cex=.9, | |
xlab="Penalty Mins", ylab="Goals Scored", bty="n", | |
main="Correlate Goals Scored and Penalty Minutes") | |
abline(h=30, lty=2, col="red"); abline(v=100, lty=2, col="red"); | |
dev.off() | |
# raw output | |
dstemp <- transform(dataset, ind30100 = as.numeric(g >=30 & pim >=100)) | |
str(dstemp) | |
with(dstemp, addmargins(table(season, ind30100))) | |
with(dstemp, addmargins(table(season, ind30100))) | |
# % of players that meet criteria per season | |
library(sqldf) | |
(pct <- sqldf("SELECT season, AVG(ind30100) as pct FROM dstemp GROUP BY season")) | |
# basic plot of the data | |
barplot(pct$pct, names.arg=pct$season) | |
# trying to learn ggplot | |
library(ggplot2) | |
ds <- dataset[dstemp$g >=30 & dstemp$pim >=100, c("season", "player")] | |
min(as.numeric(ds$season)); as.numeric(max(ds$season)); | |
p <- ggplot(ds, aes(as.numeric(season))) + geom_bar(fill="black", colour="white", binwidth=1) | |
p <- p + labs(x="Season", y="# Players") | |
ggsave(p, file="Distribution.png") | |
# look at a listing of the data again | |
View(pct) | |
# Who are the players in the 2011 season that meet this criteria | |
dataset[dataset$season == "2011" & dataset$g >= 30 & dataset$pim >=100, c("player", "g", "pim")] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
####################################################################################### | |
# Function to scrape season skater statistics from Hockey-reference.com | |
####################################################################################### | |
GrabSkaters <- function(S) { | |
# The function takes parameter S which is a string and represents the Season | |
# Returns: data frame | |
require(XML) | |
## create the URL | |
URL <- paste("http://www.hockey-reference.com/leagues/NHL_", | |
S, "_skaters.html", sep="") | |
## grab the page -- the table is parsed nicely | |
tables <- readHTMLTable(URL) | |
ds.skaters <- tables$stats | |
## determine if the HTML table was well formed (column names are the first record) | |
## can either read in directly or need to force column names | |
## and | |
## I don't like dealing with factors if I don't have to | |
## and I prefer lower case | |
for(i in 1:ncol(ds.skaters)) { | |
ds.skaters[,i] <- as.character(ds.skaters[,i]) | |
names(ds.skaters) <- tolower(colnames(ds.skaters)) | |
} | |
## fix a couple of the column names | |
colnames(ds.skaters) | |
## names(ds.skaters)[10] <- "plusmin" | |
names(ds.skaters)[11] <- "plusmin" | |
names(ds.skaters)[18] <- "spct" | |
## finally fix the columns - NAs forced by coercion warnings | |
for(i in c(1, 3, 6:18)) { | |
ds.skaters[,i] <- as.numeric(ds.skaters[, i]) | |
} | |
## convert toi to seconds, and seconds/game | |
## ds.skaters$seconds <- (ds.skaters$toi*60)/ds.skaters$gp | |
## remove the header and totals row | |
ds.skaters <- ds.skaters[!is.na(ds.skaters$rk), ] | |
## ds.skaters <- ds.skaters[ds.skaters$tm != "TOT", ] | |
## add the year | |
ds.skaters$season <- S | |
## return the dataframe | |
return(ds.skaters) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment