Created
January 18, 2013 22:32
-
-
Save tcash21/4569252 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(XML)
library(RCurl)
library(ggplot2)

## Scrape the paginated skater summary table from nhl.com (the table is
## labelled "2011-2012 - Regular Season - Skater - Summary - Points"),
## clean it into a data frame with one row per player, and plot one team's
## players (points vs. +/-) into a PDF.
##
## NOTE(review): this endpoint is reported to be no longer active on the
## NHL site, so getURL() will likely fail or return a page without the
## expected table -- confirm before relying on this script.

base.url <- paste0(
  "http://www.nhl.com/ice/playerstats.htm",
  "?fetchKey=20122ALLSASAll&viewName=summary&sort=points&pg="
)
n.pages <- 30
team <- "BOS"

## Collect each page in a preallocated list and bind once after the loop.
## This avoids growing a data frame with rbind() on every iteration and
## guarantees that every page (including the first) is stored exactly once.
pages <- vector("list", n.pages)
column.names <- NULL

## Loop through the 30 pages of player data
for (i in seq_len(n.pages)) {
  theURL <- paste0(base.url, i)
  webpage <- getURL(theURL)
  h <- htmlParse(webpage)
  ## One data-frame row per table row; the first column is dropped
  stats <- xmlToDataFrame(nodes = getNodeSet(h, "//tbody//tr"))[, -1]

  ## Grab the column names only on the first iteration
  if (i == 1) {
    nodes <- getNodeSet(h, paste0(
      "//table[@summary='2011-2012 - Regular Season - Skater - Summary - Points']",
      "//thead//tr//th//a[@title]"
    ))
    cols <- as.character(xmlToDataFrame(nodes)[, 1])
    column.names <- gsub("\\n", "", cols)

    ## Append columns since any sorted column and Team column do not appear
    ## in a structured format in the HTML
    column.names <- append(column.names, "Team", after = 1)
    column.names <- append(column.names, "P", after = 6)

    ## Clean up column names so they are R-friendly
    column.names[8] <- "Plus.Minus"
    column.names[15] <- "Shooting.Percentage"
    column.names[16] <- "Time.On.Ice.Per.Game"
    column.names[17] <- "Avg.Shifts.Per.Game"
    column.names[18] <- "Faceoff.Win.Percentage"
  }

  colnames(stats) <- column.names
  pages[[i]] <- stats
}

results <- do.call(rbind, pages)

## Remove plus signs from +/- so we can treat it as a number
results$Plus.Minus <- as.numeric(gsub("\\+", "", results$Plus.Minus))

## Format factors as numeric data types. Column 16 (Time.On.Ice.Per.Game)
## is deliberately left out of the conversion. lapply() converts column by
## column instead of coercing the whole data frame through a character
## matrix, which is what apply() would do.
numeric.cols <- c(4:15, 17:18)
results[numeric.cols] <- lapply(
  results[numeric.cols],
  function(x) as.numeric(as.character(x))
)

## Keep only the first row seen for each player name
results <- results[!duplicated(results$Player), ]

## We only care about the first Team listed and not if that player was on
## multiple teams in 2011-12
results$Team <- gsub("\\,\\s+\\w+", "", as.character(results$Team))

## Pull out a team to visualize
t.results <- subset(results, Team == team)

## Plot the data and save in a PDF
pdf(file = "Bruins.pdf", width = 11, height = 8)
team.plot <- ggplot(
  t.results,
  aes(
    x = Plus.Minus, y = P, size = Avg.Shifts.Per.Game,
    colour = Pos, label = Player
  )
) +
  geom_text() +
  labs(x = "+/-", y = "Points", title = team)
## ggplot objects are only drawn when printed; the explicit print() makes
## sure the plot reaches the PDF even when this script is run via source().
print(team.plot)
dev.off()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi all, this URL is no longer active on NHL's site, but if you want a much easier way to get hockey data you should check out www.stattleship.com. There is an R wrapper and it is currently in free beta mode.