Basic Web Mining with R
###############################################################################
# Author: @BrockTibert
#
# Used: R Version 2.12.1, Windows 7 Pro, StatET Plugin for Eclipse
#
#
###############################################################################
#-----------------------------------------------------------------------
# set up script level basics
#-----------------------------------------------------------------------
## I code on a wide screen, so widen the console output
options(width=180)
## libraries
library(XML)
library(plyr)
library(gdata)
library(sqldf)
## directory for the project
DIR <- "C:/Users/Brock/Documents/My Dropbox/Projects/NHL"
setwd(DIR)
#-----------------------------------------------------------------------
# Grab the skaters data from the season page
#-----------------------------------------------------------------------
## set the season
SEASON <- "2011"
## create the URL
URL <- paste("http://www.hockey-reference.com/leagues/NHL_", SEASON, "_skaters.html", sep="")
## grab the page -- the table is parsed nicely
tables <- readHTMLTable(URL)
ds.skaters <- tables$stats
## I don't like dealing with factors if I don't have to
## and I prefer lower case
names(ds.skaters) <- tolower(colnames(ds.skaters))
for(i in 1:ncol(ds.skaters)) {
  ds.skaters[,i] <- as.character(ds.skaters[,i])
}
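## (the same conversion as a one-liner, if you prefer; lapply keeps the
## data frame shape)
# ds.skaters[] <- lapply(ds.skaters, as.character)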
## fix a couple of the column names
colnames(ds.skaters)
names(ds.skaters)[10] <- "plusmin"
names(ds.skaters)[17] <- "spct"
## finally fix the columns - NAs forced by coercion warnings
for(i in c(1, 3, 6:18)) {
  ds.skaters[,i] <- as.numeric(ds.skaters[, i])
}
## change the avg time on ice to a time format
ds.skaters$atoi <- strptime(ds.skaters$atoi, "%M:%S")
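## a sketch of an alternative to the strptime() line above (run instead of,
## not after, it): strptime() returns POSIXlt, which is awkward inside a data
## frame, so split the "MM:SS" string and compute numeric seconds directly.
## atoi.sec is a made-up column name.
# mmss <- strsplit(ds.skaters$atoi, ":")
# ds.skaters$atoi.sec <- sapply(mmss, function(x) as.numeric(x[1])*60 + as.numeric(x[2]))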
## remove the header and totals row
ds.skaters <- ds.skaters[!is.na(ds.skaters$rk), ]
ds.skaters <- ds.skaters[ds.skaters$tm != "TOT", ]
## save the datafile
write.table(ds.skaters, "Skaters.csv", sep=",", row.names=F)
## convert toi to seconds, and seconds/game
ds.skaters$seconds <- (ds.skaters$toi*60)/ds.skaters$gp
#-----------------------------------------------------------------------
# Quick overview of the 2011 skater data
#-----------------------------------------------------------------------
## quick look of the dataset
summary(ds.skaters)
## Best and worst +/-
head(ds.skaters[ds.skaters$plusmin == max(ds.skaters$plusmin), ])
head(ds.skaters[ds.skaters$plusmin == min(ds.skaters$plusmin), ]) # how is that deal working out for NJ?
## select the columns to analyze
play <- ds.skaters[, c(7:17, 20)]
## fix the NA's to zeros
for(i in 1:ncol(play)) {
  ## which(is.na(play[,i])) gives the rows where column i is NA
  play[which(is.na(play[,i])), i] <- 0
}
## look at the correlations
summary(play)
round(cor(play), 2)
#-----------------------------------------------------------------------
# Use PCA and clustering techniques to group the players
#-----------------------------------------------------------------------
## PCA analysis
## http://www.statmethods.net/advstats/factor.html
## with cor=F (covariance matrix) the 1st component alone explained ~99% of
## the variance, so use the correlation matrix (cor=T) instead
play.pca <- princomp(play, cor=T)
## what did we get for data?
summary(play.pca)
plot(play.pca)
png("PCA biplot.png")
biplot(play.pca)
dev.off()
## let's just use the first 5 components, ~86% of the variance
play.pca.comp <- play.pca$scores[, 1:5]
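## sanity check on the variance claim above: princomp() stores the component
## standard deviations in $sdev, so the cumulative proportion of variance is
cumsum(play.pca$sdev^2) / sum(play.pca$sdev^2)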
## use hierarchical clustering to get a sense of how many clusters
## to feed to the kmeans procedure - need to feed distance matrix
## pull 40 random cases into the procedure
dist.mat <- dist(play.pca.comp[sample(1:nrow(play.pca.comp), 40), ], method="euclidean")
play.hclust <- hclust(dist.mat, method="ward")
png("Dendrogram.png")
plot(play.hclust)
dev.off()
## I am going to try 4 clusters
kmean4 <- kmeans(play.pca.comp, centers=4, iter.max=12000)
kmean4$withinss
kmean4$size
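## a quick elbow check in case 4 isn't obvious from the dendrogram: total
## within-cluster SS for k = 2..8 (a sketch; the seed is only for repeatability)
set.seed(1234)
wss <- sapply(2:8, function(k) sum(kmeans(play.pca.comp, centers=k, iter.max=12000)$withinss))
plot(2:8, wss, type="b", xlab="k", ylab="Total within-cluster SS")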
#-----------------------------------------------------------------------
# Analyze the clusters
#-----------------------------------------------------------------------
## add the cluster membership to the player data
skaters <- cbind(ds.skaters, kmean4$cluster)
names(skaters)[ncol(skaters)] <- "cluster"
## make them a factor
skaters$cluster2 <- as.factor(skaters$cluster)
skaters$cluster2 <- reorder(skaters$cluster2, new.order=c(4,2,1,3))
## profile clusters by a few variables
table(skaters$cluster2)
ddply(skaters, .(cluster), summarise, mean.pts = mean(pts, na.rm=TRUE))
## boxplot these clusters
png("4 Cluster by Points Boxplot.png")
boxplot(pts ~ cluster2, data=skaters, main="Boxplot of Clusters by Points \nAs of Feb 7, 2011")
dev.off()
## cluster membership by team
addmargins(table(skaters$tm, skaters$cluster2))
(team.dist <- round(prop.table(table(skaters$cluster2, skaters$tm), 2), 3))
## plot of the table
## Rotate labels = http://stackoverflow.com/questions/1828742/rotating-axis-labels-in-r
## http://research.stowers-institute.org/efg/R/Color/Chart/
## type colors() to see the full list of named colors for the chart above
mycol <- c("red4", "orangered", "burlywood1", "beige")
png("Team Distribution.png")
barplot(team.dist, horiz=T, cex.names=.75, las=1,
col=mycol, main="Distribution of Clusters by Team", xlab="% of Team")
abline(v=.25, lty=2)
abline(v=.5, lty=2)
abline(v=.75, lty=2)
dev.off()
#######################################################################################
# Function to scrape season skater statistics from Hockey-reference.com
#######################################################################################
GrabSkaters <- function(S) {
  # Takes S, a string representing the season; returns a data frame
  require(XML)
  ## create the URL
  URL <- paste("http://www.hockey-reference.com/leagues/NHL_",
      S, "_skaters.html", sep="")
  ## grab the page -- the table is parsed nicely
  tables <- readHTMLTable(URL)
  ds.skaters <- tables$stats
  ## determine if the HTML table was well formed (column names are the first record)
  ## can either read in directly or need to force column names
  ## I don't like dealing with factors if I don't have to, and I prefer lower case
  names(ds.skaters) <- tolower(colnames(ds.skaters))
  for(i in 1:ncol(ds.skaters)) {
    ds.skaters[,i] <- as.character(ds.skaters[,i])
  }
  ## fix a couple of the column names (note: the indices differ from the
  ## standalone script above)
  names(ds.skaters)[11] <- "plusmin"
  names(ds.skaters)[18] <- "spct"
  ## finally fix the columns - NAs forced by coercion warnings
  for(i in c(1, 3, 6:18)) {
    ds.skaters[,i] <- as.numeric(ds.skaters[, i])
  }
  ## convert toi to seconds, and seconds/game
  ## ds.skaters$seconds <- (ds.skaters$toi*60)/ds.skaters$gp
  ## remove the header rows (keep the TOT rows here; they are filtered later)
  ds.skaters <- ds.skaters[!is.na(ds.skaters$rk), ]
  ## ds.skaters <- ds.skaters[ds.skaters$tm != "TOT", ]
  ## add the year
  ds.skaters$season <- S
  ## return the dataframe
  return(ds.skaters)
}
# Source the file with the Grab Skaters function
setwd("~/My Dropbox/Eclipse/Projects/R/NHL/Blog Posts/Scripts")
source("SkaterStats.R")
## set the working directory
setwd("~/My Dropbox/Eclipse/Projects/R/NHL/Blog Posts/30 Goals 100 PIM")
#-----------------------------------------------------------------------
# Use the function to loop over the seasons and piece together
#-----------------------------------------------------------------------
## define the seasons -- the 2005 dataset doesn't exist (lockout)
## if I were a good coder I would trap the error, but this works for now
## (a tryCatch() sketch follows the loop below)
SEASON <- as.character(c(1960:2004, 2006:2011))
## create an empty dataset that we will append to
dataset <- data.frame()
## loop over the seasons, use the function to grab the data
## and build the dataset
require(plyr)
for (S in SEASON) {
  temp <- GrabSkaters(S)
  dataset <- rbind.fill(dataset, temp)
  print(paste("Completed Season ", S, sep=""))
  ## pause the script so we don't kill their servers
  Sys.sleep(10)
}
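## the error trapping mentioned above, sketched with tryCatch(): a bad season
## (e.g. the missing 2005 page) would be skipped instead of killing the loop
# for (S in as.character(1960:2011)) {
#   temp <- tryCatch(GrabSkaters(S), error = function(e) NULL)
#   if (!is.null(temp)) dataset <- rbind.fill(dataset, temp)
#   Sys.sleep(10)
# }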
head(dataset); tail(dataset);
dataset <- dataset[dataset$tm != 'TOT', ]
save(dataset, file="hr skater db.Rdata")
#-----------------------------------------------------------------------
# Let's get a sense for how many players finish 30/100
#-----------------------------------------------------------------------
# Let's look at a scatterplot of the data
png("Scatter Plot.png")
plot(x=dataset$pim, y=dataset$g, type="p", pch=20, cex=.9,
xlab="Penalty Mins", ylab="Goals Scored", bty="n",
main="Correlate Goals Scored and Penalty Minutes")
abline(h=30, lty=2, col="red"); abline(v=100, lty=2, col="red");
dev.off()
# raw output
dstemp <- transform(dataset, ind30100 = as.numeric(g >=30 & pim >=100))
str(dstemp)
with(dstemp, addmargins(table(season, ind30100)))
# % of players that meet criteria per season
library(sqldf)
(pct <- sqldf("SELECT season, AVG(ind30100) as pct FROM dstemp GROUP BY season"))
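## the same summary in base R, as a cross-check on the sqldf result
aggregate(ind30100 ~ season, data=dstemp, FUN=mean)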
# basic plot of the data
barplot(pct$pct, names.arg=pct$season)
# trying to learn ggplot
library(ggplot2)
ds <- dstemp[dstemp$g >= 30 & dstemp$pim >= 100, c("season", "player")]
min(as.numeric(ds$season)); max(as.numeric(ds$season));
p <- ggplot(ds, aes(as.numeric(season))) + geom_histogram(fill="black", colour="white", binwidth=1)
p <- p + labs(x="Season", y="# Players")
ggsave(p, file="Distribution.png")
# look at a listing of the data again
View(pct)
# Who are the players in the 2011 season that meet this criteria
dataset[dataset$season == "2011" & dataset$g >= 30 & dataset$pim >=100, c("player", "g", "pim")]
#### install the packages used for the discussion
install.packages("twitteR")
install.packages("sqldf")
install.packages("XML")
install.packages("RCurl") # R interface to libcurl
#==============================================================================
# Read in already existing data from the web
#==============================================================================
##### read in a CSV dataset from yahoo finance
yahoo <- read.csv("http://ichart.finance.yahoo.com/table.csv?s=AAPL&a=08&b=7&c=1984&d=10&e=28&f=2011&g=w&ignore=.csv",
header=T, stringsAsFactors=F)
dim(yahoo) # check the rows/columns
head(yahoo, n=5) # look at the first 5 rows
str(yahoo) # have R tell us the structure of the data frame
plot(yahoo[,7], type="l") # data are in reverse chronological order
### fix the data to make a plot
yahoo$Date <- as.Date(yahoo$Date) # tell R that this column is a Date
yahoo.2 <- yahoo[order(yahoo$Date),] # sort the data Ascending order
plot(x=yahoo.2$Date, y=yahoo.2[,7], type="l",
xlab="Date", ylab="Adjusted Close",
main="Weekly Adjusted Closing Price for Apple (AAPL)") # plot the data
#==============================================================================
# Read in HTML tables
# http://stackoverflow.com/questions/4393780/scraping-a-wiki-page-for-the-periodic-table-and-all-the-links
# http://stackoverflow.com/questions/1395528/scraping-html-tables-into-r-data-frames-using-the-xml-package
#==============================================================================
# let's parse NHL data
library(XML)
URL <- paste("http://www.hockey-reference.com/leagues/NHL_",
2011, "_skaters.html", sep="")
tables <- readHTMLTable(URL) # parses all of the tables for us
ds.skaters <- tables$stats # if the table is named (View Source Code), can reference by name
dim(ds.skaters)
#==============================================================================
# Collect data from twitter
#==============================================================================
library(twitteR)
rstats <- searchTwitter("#rstats", n=1500) #searches twitter for the last 1500 tweets
length(rstats) # but only searches back a week or so
mode(rstats) # not a data frame, so we need to convert it to rows/columns
rstats.df <- twListToDF(rstats)
dim(rstats.df)
colnames(rstats.df) # print the column names
length(unique(rstats.df$screenName)) # how many different twitter users
library(sqldf)
temp <- rstats.df
twts <- sqldf("SELECT screenName as name, count(*) as tweets FROM temp
GROUP BY screenName ORDER BY count(*) desc")
head(twts, n=10) # top 10 tweeters about rstats
hist(twts[,2], xlab="# Rstats Tweets", main="")
#==============================================================================
# Social Network Graph of rstats users
# http://www.drewconway.com/zia/?p=1471
#==============================================================================
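## not Drew Conway's code -- just a minimal sketch of the idea: build an
## edge list from screenName -> @mentions in the tweet text, then plot it
## with igraph (assumes rstats.df from above is still in the workspace)
# library(igraph)
# mentions <- regmatches(rstats.df$text, gregexpr("@\\w+", rstats.df$text))
# edges <- do.call(rbind, lapply(seq_along(mentions), function(i) {
#   if (length(mentions[[i]]) == 0) return(NULL)
#   cbind(rstats.df$screenName[i], sub("@", "", mentions[[i]]))
# }))
# g <- graph.edgelist(edges)
# plot(g, vertex.size=3, vertex.label=NA, edge.arrow.size=.3)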
#==============================================================================
# Streaming twitter using RCURL
#==============================================================================
library(RCurl)
library(rjson)
# set the directory
setwd("C:\\Documents and Settings\\user\\Desktop\\Web Mining Pres")
#### redirects output to a file, but getting the data back into R is still
#### fiddly (see the parsing sketch at the end of this script)
WRITE_TO_FILE <- function(x) {
  if (nchar(x) > 0) {
    write.table(x, file="Twitter Stream Capture.txt", append=T, row.names=F, col.names=F)
  }
}
### windows users will need to get this certificate to authenticate
download.file(url="http://curl.haxx.se/ca/cacert.pem", destfile="cacert.pem")
### write the raw JSON from the Twitter streaming API (sample stream) to a text file
getURL("https://stream.twitter.com/1/statuses/sample.json",
    userpwd="USER:PASSWORD", # replace with your twitter username:password
    cainfo = "cacert.pem",
    write=WRITE_TO_FILE)
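## a hedged sketch for getting the capture back into R: read the file line by
## line and parse each record with rjson::fromJSON, skipping anything that
## doesn't parse (keep-alives, records split across chunks). This assumes
## roughly one JSON object per line, which the callback above doesn't guarantee.
# raw <- readLines("Twitter Stream Capture.txt")
# tweets <- lapply(raw, function(x) tryCatch(fromJSON(x), error = function(e) NULL))
# tweets <- tweets[!sapply(tweets, is.null)]
# length(tweets)  # number of parsed status records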