tcash21 · January 18, 2013 22:32 · paulmotz · May 5, 2014 · DPotter555 · Nov 20, 2015
diff --git a/NHL data scrape R b/NHL data scrape R
 library(XML)
 library(RCurl)
 library(ggplot2)

 ## Scrape the NHL.com skater summary pages (sorted by points), combine them
 ## into one data frame `results`, clean the columns, and plot one team's
 ## players to "Bruins.pdf".
 ## Uses getURL (RCurl), htmlParse / getNodeSet / xmlToDataFrame (XML) and
 ## ggplot2, which are attached by the library() calls at the top of the file.

 base.url <- "http://www.nhl.com/ice/playerstats.htm?fetchKey=20122ALLSASAll&viewName=summary&sort=points&pg="
 page.count <- 30

 ## Collect each page in a preallocated list and bind once after the loop,
 ## instead of growing a data frame with rbind() on every iteration.
 pages <- vector("list", page.count)

 ## Loop through the 30 pages of player data
 for (i in seq_len(page.count)) {
   theURL <- paste0(base.url, i)
   webpage <- getURL(theURL)
   h <- htmlParse(webpage)
   ## One data-frame row per table row; the first column is dropped
   stats <- xmlToDataFrame(nodes = getNodeSet(h, "//tbody//tr"))[, -1]

   ## Grab the column names only on the first iteration
   if (i == 1) {
     nodes <- getNodeSet(h, "//table [@summary='2011-2012 - Regular Season - Skater - Summary - Points']
                  //thead//tr//th//a[@title]")
     cols <- as.character(xmlToDataFrame(nodes)[, 1])
     column.names <- gsub("\\n", "", cols)

     ## Append columns since any sorted column and Team column do not appear
     ## in a structured format in the HTML
     column.names <- append(column.names, "Team", after = 1)
     column.names <- append(column.names, "P", after = 6)

     ## Clean up column names so they are R-friendly
     column.names[8] <- "Plus.Minus"
     column.names[15] <- "Shooting.Percentage"
     column.names[16] <- "Time.On.Ice.Per.Game"
     column.names[17] <- "Avg.Shifts.Per.Game"
     column.names[18] <- "Faceoff.Win.Percentage"
   }

   ## Every page gets the same names so the pages can be row-bound.
   ## (Page 1 is stored exactly once; it used to be appended twice.)
   colnames(stats) <- column.names
   pages[[i]] <- stats
 }
 results <- do.call(rbind, pages)

 ## Remove plus signs from +/- so we can treat it as a number
 results$Plus.Minus <- as.numeric(gsub("\\+", "", results$Plus.Minus))

 ## Format factors as numeric data types. Column 16 (Time.On.Ice.Per.Game) is
 ## deliberately left untouched. lapply() converts column by column, avoiding
 ## the character-matrix round trip that apply() performs on a data frame.
 numeric.cols <- c(4:15, 17:18)
 results[numeric.cols] <- lapply(
   results[numeric.cols],
   function(x) as.numeric(as.character(x))
 )

 ## Keep only the first row seen for each player name
 results <- results[!duplicated(results$Player), ]

 ## We only care about the first Team listed and not if that player was on
 ## multiple teams in 2011-12
 results$Team <- gsub("\\,\\s+\\w+", "", as.character(results$Team))

 ## Pull out a team to visualize (which() drops rows whose Team is NA)
 team <- "BOS"
 t.results <- results[which(results$Team == team), ]

 ## Plot the data and save in a PDF.
 ## The plot is built before the device is opened, printed explicitly so the
 ## PDF is also filled when the script is run through source(), and the device
 ## is closed even if rendering fails.
 team.plot <- ggplot(
   t.results,
   aes(x = Plus.Minus, y = P, size = Avg.Shifts.Per.Game, colour = Pos,
       label = Player)
 ) +
   geom_text() +
   labs(x = "+/-", y = "Points", title = team)

 pdf(file = "Bruins.pdf", width = 11, height = 8)
 tryCatch(print(team.plot), finally = dev.off())
	library(XML)
	library(RCurl)
	library(ggplot2)

	## Scrape the NHL.com skater summary pages (sorted by points), combine them
	## into one data frame `results`, clean the columns, and plot one team's
	## players to "Bruins.pdf".
	## Uses getURL (RCurl), htmlParse / getNodeSet / xmlToDataFrame (XML) and
	## ggplot2, which are attached by the library() calls at the top of the file.

	base.url <- "http://www.nhl.com/ice/playerstats.htm?fetchKey=20122ALLSASAll&viewName=summary&sort=points&pg="
	page.count <- 30

	## Collect each page in a preallocated list and bind once after the loop,
	## instead of growing a data frame with rbind() on every iteration.
	pages <- vector("list", page.count)

	## Loop through the 30 pages of player data
	for (i in seq_len(page.count)) {
	  theURL <- paste0(base.url, i)
	  webpage <- getURL(theURL)
	  h <- htmlParse(webpage)
	  ## One data-frame row per table row; the first column is dropped
	  stats <- xmlToDataFrame(nodes = getNodeSet(h, "//tbody//tr"))[, -1]

	  ## Grab the column names only on the first iteration
	  if (i == 1) {
	    nodes <- getNodeSet(h, "//table [@summary='2011-2012 - Regular Season - Skater - Summary - Points']
	//thead//tr//th//a[@title]")
	    cols <- as.character(xmlToDataFrame(nodes)[, 1])
	    column.names <- gsub("\\n", "", cols)

	    ## Append columns since any sorted column and Team column do not appear
	    ## in a structured format in the HTML
	    column.names <- append(column.names, "Team", after = 1)
	    column.names <- append(column.names, "P", after = 6)

	    ## Clean up column names so they are R-friendly
	    column.names[8] <- "Plus.Minus"
	    column.names[15] <- "Shooting.Percentage"
	    column.names[16] <- "Time.On.Ice.Per.Game"
	    column.names[17] <- "Avg.Shifts.Per.Game"
	    column.names[18] <- "Faceoff.Win.Percentage"
	  }

	  ## Every page gets the same names so the pages can be row-bound.
	  ## (Page 1 is stored exactly once; it used to be appended twice.)
	  colnames(stats) <- column.names
	  pages[[i]] <- stats
	}
	results <- do.call(rbind, pages)

	## Remove plus signs from +/- so we can treat it as a number
	results$Plus.Minus <- as.numeric(gsub("\\+", "", results$Plus.Minus))

	## Format factors as numeric data types. Column 16 (Time.On.Ice.Per.Game) is
	## deliberately left untouched. lapply() converts column by column, avoiding
	## the character-matrix round trip that apply() performs on a data frame.
	numeric.cols <- c(4:15, 17:18)
	results[numeric.cols] <- lapply(
	  results[numeric.cols],
	  function(x) as.numeric(as.character(x))
	)

	## Keep only the first row seen for each player name
	results <- results[!duplicated(results$Player), ]

	## We only care about the first Team listed and not if that player was on
	## multiple teams in 2011-12
	results$Team <- gsub("\\,\\s+\\w+", "", as.character(results$Team))

	## Pull out a team to visualize (which() drops rows whose Team is NA)
	team <- "BOS"
	t.results <- results[which(results$Team == team), ]

	## Plot the data and save in a PDF.
	## The plot is built before the device is opened, printed explicitly so the
	## PDF is also filled when the script is run through source(), and the device
	## is closed even if rendering fails.
	team.plot <- ggplot(
	  t.results,
	  aes(x = Plus.Minus, y = P, size = Avg.Shifts.Per.Game, colour = Pos,
	      label = Player)
	) +
	  geom_text() +
	  labs(x = "+/-", y = "Points", title = team)

	pdf(file = "Bruins.pdf", width = 11, height = 8)
	tryCatch(print(team.plot), finally = dev.off())