-
-
Save georggr/04c51d9d510777052428c26365bf2675 to your computer and use it in GitHub Desktop.
Basic Web Mining with R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################################################### | |
# Author: @BrockTibert | |
# | |
# Used: R Version 2.12.1, Windows 7 Pro, StatET Plugin for Eclipse | |
# | |
# | |
############################################################################### | |
#----------------------------------------------------------------------- | |
# set up script level basics | |
#----------------------------------------------------------------------- | |
## I code on the screen, so I set it wider for the console output | |
options(widht=180) | |
## libraries | |
library(XML) | |
library(plyr) | |
library(gdata) | |
library(sqldf) | |
## directory for the project | |
DIR <- "C:/Users/Brock/Documents/My Dropbox/Projects/NHL" | |
setwd(DIR) | |
#----------------------------------------------------------------------- | |
# Grab the skaters data from the season page | |
#----------------------------------------------------------------------- | |
## set the season | |
SEASON <- "2011" | |
## create the URL | |
URL <- paste("http://www.hockey-reference.com/leagues/NHL_", SEASON, "_skaters.html", sep="") | |
## grab the page -- the table is parsed nicely | |
tables <- readHTMLTable(URL) | |
ds.skaters <- tables$stats | |
## I don't like dealing with factors if I don't have to | |
## and I prefer lower case | |
for(i in 1:ncol(ds.skaters)) { | |
ds.skaters[,i] <- as.character(ds.skaters[,i]) | |
names(ds.skaters) <- tolower(colnames(ds.skaters)) | |
} | |
## fix a couple of the column names | |
colnames(ds.skaters) | |
names(ds.skaters)[10] <- "plusmin" | |
names(ds.skaters)[17] <- "spct" | |
## finally fix the columns - NAs forced by coercion warnings | |
for(i in c(1, 3, 6:18)) { | |
ds.skaters[,i] <- as.numeric(ds.skaters[, i]) | |
} | |
## change the avg time on ice to a time format | |
ds.skaters$atoi <- strptime(ds.skaters$atoi, "%M:%S") | |
## remove the header and totals row | |
ds.skaters <- ds.skaters[!is.na(ds.skaters$rk), ] | |
ds.skaters <- ds.skaters[ds.skaters$tm != "TOT", ] | |
## save the datafile | |
write.table(ds.skaters, "Skaters.csv", sep=",", row.names=F) | |
## convert toi to seconds, and seconds/game | |
ds.skaters$seconds <- (ds.skaters$toi*60)/ds.skaters$gp | |
#----------------------------------------------------------------------- | |
# Quick overview of the 2011 skater data | |
#----------------------------------------------------------------------- | |
## quick look of the dataset | |
summary(ds.skaters) | |
## Best and worst +/- | |
head(ds.skaters[ds.skaters$plusmin == max(ds.skaters$plusmin), ]) | |
head(ds.skaters[ds.skaters$plusmin == min(ds.skaters$plusmin), ]) # how is that deal working out for NJ? | |
## select the columns to analyze | |
play <- ds.skaters[, c(7:17, 20)] | |
## fix the NA's to zeros | |
for(i in 1:ncol(play)) { | |
# which(is.na(play[,i])) <- rows that meet the criteria of NA | |
play[which(is.na(play[,i])), i] <- 0 | |
} | |
## look at the correlations | |
summary(play) | |
round(cor(play), 2) | |
#----------------------------------------------------------------------- | |
# Use PCA and clustering techniques to group the players | |
#----------------------------------------------------------------------- | |
## PCA analysis | |
## http://www.statmethods.net/advstats/factor.html | |
## the 1st component of princomp(play, cor=F) explained 99% variance | |
play.pca <- princomp(play, cor=T) | |
## what did we get for data? | |
summary(play.pca) | |
plot(play.pca) | |
png("PCA biplot.png") | |
biplot(play.pca) | |
dev.off() | |
## lets just use the first 5 = ~= 86% of the variance | |
play.pca.comp <- play.pca$scores[, 1:5] | |
## use hierarchical clustering to get a sense of how many clusters | |
## to feed to the kmeans procedure - need to feed distance matrix | |
## pull 40 random cases into the procedure | |
dist.mat <- dist(play.pca.comp[sample(1:nrow(play.pca.comp), 40), ], method="euclidian") | |
play.hclust <- hclust(dist.mat, method="ward") | |
png("Dendrogram.png") | |
plot(play.hclust) | |
dev.off() | |
## I am going to try to 4 clusters | |
kmean4 <- kmeans(play.pca.comp, centers=4, iter.max=12000) | |
kmean4$withinss | |
kmean4$size | |
#----------------------------------------------------------------------- | |
# Analyze the clusters | |
#----------------------------------------------------------------------- | |
## add the cluster membership to the player data | |
skaters <- cbind(ds.skaters, kmean4$cluster) | |
names(skaters)[ncol(skaters)] <- "cluster" | |
## make them a factor | |
skaters$cluster2 <- as.factor(skaters$cluster) | |
skaters$cluster2 <- reorder(skaters$cluster2, new.order=c(4,2,1,3)) | |
## profile clusters by a few variables | |
table(skaters$cluster2) | |
ddply(skaters[c("cluster", "pts")], .(cluster), mean, na.rm=T) | |
## boxplot these clusters | |
png("4 Cluster by Points Boxplot.png") | |
boxplot(pts ~ cluster2, data=skaters, main="Boxplot of Clusters by Points \nAs of Feb 7, 2011") | |
dev.off() | |
## cluster membership by team | |
addmargins(table(skaters$tm, skaters$cluster2)) | |
(team.dist <- round(prop.table(table(skaters$cluster2, skaters$tm), 2), 3)) | |
## plot of the table | |
## Rotate labels = http://stackoverflow.com/questions/1828742/rotating-axis-labels-in-r | |
## http://research.stowers-institute.org/efg/R/Color/Chart/ | |
## type colors() to see names for the graph on link above | |
mycol <- colors() | |
mycol <- c("red4", "orangered", "burlywood1", "beige") | |
png("Team Distribution.png") | |
barplot(team.dist, horiz=T, cex.names=.75, las=1, | |
col=mycol, main="Distribution of Clusters by Team", xlab="% of Team") | |
abline(v=.25, lty=2) | |
abline(v=.5, lty=2) | |
abline(v=.75, lty=2) | |
dev.off() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
####################################################################################### | |
# Function to scrape season skater statistics from Hockey-reference.com | |
####################################################################################### | |
GrabSkaters <- function(S) { | |
# The function takes parameter S which is a string and represents the Season | |
# Returns: data frame | |
require(XML) | |
## create the URL | |
URL <- paste("http://www.hockey-reference.com/leagues/NHL_", | |
S, "_skaters.html", sep="") | |
## grab the page -- the table is parsed nicely | |
tables <- readHTMLTable(URL) | |
ds.skaters <- tables$stats | |
## determine if the HTML table was well formed (column names are the first record) | |
## can either read in directly or need to force column names | |
## and | |
## I don't like dealing with factors if I don't have to | |
## and I prefer lower case | |
for(i in 1:ncol(ds.skaters)) { | |
ds.skaters[,i] <- as.character(ds.skaters[,i]) | |
names(ds.skaters) <- tolower(colnames(ds.skaters)) | |
} | |
## fix a couple of the column names | |
colnames(ds.skaters) | |
## names(ds.skaters)[10] <- "plusmin" | |
names(ds.skaters)[11] <- "plusmin" | |
names(ds.skaters)[18] <- "spct" | |
## finally fix the columns - NAs forced by coercion warnings | |
for(i in c(1, 3, 6:18)) { | |
ds.skaters[,i] <- as.numeric(ds.skaters[, i]) | |
} | |
## convert toi to seconds, and seconds/game | |
## ds.skaters$seconds <- (ds.skaters$toi*60)/ds.skaters$gp | |
## remove the header and totals row | |
ds.skaters <- ds.skaters[!is.na(ds.skaters$rk), ] | |
## ds.skaters <- ds.skaters[ds.skaters$tm != "TOT", ] | |
## add the year | |
ds.skaters$season <- S | |
## return the dataframe | |
return(ds.skaters) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Source the file with the Grab Skaters function | |
setwd("~/My Dropbox/Eclipse/Projects/R/NHL/Blog Posts/Scripts") | |
source("SkaterStats.R") | |
## set the working directory | |
setwd("~/My Dropbox/Eclipse/Projects/R/NHL/Blog Posts/30 Goals 100 PIM") | |
## set the seasons | |
SEASON <- as.character(c(1960:2004, 2006:2011)) | |
#----------------------------------------------------------------------- | |
# Use the function to loop over the seasons and piece together | |
#----------------------------------------------------------------------- | |
## define the seasons -- 2005 dataset doesnt exist | |
## if I was a good coder I would trap the error, but this works | |
SEASON <- as.character(c(1960:2004, 2006:2011)) | |
## create an empy dataset that we will append to | |
dataset <- data.frame() | |
## loop over the seasons, use the function to grab the data | |
## and build the dataset | |
for (S in SEASON) { | |
require(plyr) | |
temp <- GrabSkaters(S) | |
dataset <- rbind.fill(dataset, temp) | |
print(paste("Completed Season ", S, sep="")) | |
## pause the script so we don't kill their servers | |
Sys.sleep(10) | |
} | |
head(dataset); tail(dataset); | |
dataset <- dataset[dataset$tm != 'TOT', ] | |
save(dataset, file="hr skater db.Rdata") | |
#----------------------------------------------------------------------- | |
# Lets get a sense for how many players finish 30/100 | |
#----------------------------------------------------------------------- | |
# Let's look at a scatterplot of the data | |
png("Scatter Plot.png") | |
plot(x=dataset$pim, y=dataset$g, type="p", pch=20, cex=.9, | |
xlab="Penalty Mins", ylab="Goals Scored", bty="n", | |
main="Correlate Goals Scored and Penalty Minutes") | |
abline(h=30, lty=2, col="red"); abline(v=100, lty=2, col="red"); | |
dev.off() | |
# raw output | |
dstemp <- transform(dataset, ind30100 = as.numeric(g >=30 & pim >=100)) | |
str(dstemp) | |
with(dstemp, addmargins(table(season, ind30100))) | |
with(dstemp, addmargins(table(season, ind30100))) | |
# % of players that meet criteria per season | |
library(sqldf) | |
(pct <- sqldf("SELECT season, AVG(ind30100) as pct FROM dstemp GROUP BY season")) | |
# basic plot of the data | |
barplot(pct$pct, names.arg=pct$season) | |
# trying to learn ggplot | |
library(ggplot2) | |
ds <- dataset[dstemp$g >=30 & dstemp$pim >=100, c("season", "player")] | |
min(as.numeric(ds$season)); as.numeric(max(ds$season)); | |
p <- ggplot(ds, aes(as.numeric(season))) + geom_bar(fill="black", colour="white", binwidth=1) | |
p <- p + labs(x="Season", y="# Players") | |
ggsave(p, file="Distribution.png") | |
# look at a listing of the data again | |
View(pct) | |
# Who are the players in the 2011 season that meet this criteria | |
dataset[dataset$season == "2011" & dataset$g >= 30 & dataset$pim >=100, c("player", "g", "pim")] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#### load the libraries used for the discussion | |
install.packages("twitteR") | |
install.packages("sqldf") | |
install.packages("XML") | |
install.packages("RCurl") # cURL interface adapted in R | |
#============================================================================== | |
# Read in already existing data from the web | |
#============================================================================== | |
##### read in a CSV dataset from yahoo finance | |
yahoo <- read.csv("http://ichart.finance.yahoo.com/table.csv?s=AAPL&a=08&b=7&c=1984&d=10&e=28&f=2011&g=w&ignore=.csv", | |
header=T, stringsAsFactors=F) | |
dim(yahoo) # check the rows/columns | |
head(yahoo, n=5) # look at the first 5 rows | |
str(yahoo) # have R tell us the details of the | |
plot(yahoo[,7], type="l") # data are reverse order | |
### fix the data to make a plot | |
yahoo$Date <- as.Date(yahoo$Date) # tell R that this column is a Date | |
yahoo.2 <- yahoo[order(yahoo$Date),] # sort the data Ascending order | |
plot(x=yahoo.2$Date, y=yahoo.2[,7], type="l", | |
xlab="Date", ylab="Adjusted Close", | |
main="Weekly Adjusted Closing Price for Apple (AAPL)") # plot the data | |
#============================================================================== | |
# Read in HTML tables | |
# http://stackoverflow.com/questions/4393780/scraping-a-wiki-page-for-the-periodic-table-and-all-the-links | |
# http://stackoverflow.com/questions/1395528/scraping-html-tables-into-r-data-frames-using-the-xml-package | |
#============================================================================== | |
# lets parse NHL data | |
library(XML) | |
URL <- paste("http://www.hockey-reference.com/leagues/NHL_", | |
2011, "_skaters.html", sep="") | |
tables <- readHTMLTable(URL) # parses all of the tables for us | |
ds.skaters <- tables$stats # if the table is named (View Source Code), can reference by name | |
dim(ds.skaters) | |
#============================================================================== | |
# Collect data from twitter | |
#============================================================================== | |
library(twitteR) | |
rstats <- searchTwitter("#rstats", n=1500) #searches twitter for the last 1500 tweets | |
length(rstats) # but only searches back a week or so | |
mode(rstats) # not a data frame, so we need to convert it to rows/columns | |
rstats.df <- twListToDF(rstats) | |
dim(rstats.df) | |
colnames(rstats.df) # print the column names | |
length(unique(rstats.df$screenName)) # how many different twitter users | |
install.packages("sqldf") # install a library to use SQL-like commands | |
library(sqldf) | |
temp <- rstats.df | |
twts <- sqldf("SELECT screenName as name, count(*) as tweets FROM temp | |
GROUP BY screenName ORDER BY count(*) desc") | |
head(twts, n=10) # top 10 tweeters about rstats | |
hist(twts[,2], xlab="# Rstats Tweets", main="") | |
#============================================================================== | |
# Social Network Graph of rstats users | |
# http://www.drewconway.com/zia/?p=1471 | |
#============================================================================== | |
#============================================================================== | |
# Streaming twitter using RCURL | |
#============================================================================== | |
library(RCurl) | |
library(rjson) | |
# set the directory | |
setwd("C:\\Documents and Settings\\user\\Desktop\\Web Mining Pres") | |
#### redirects output to a file, but still has problems with getting data back into R! | |
WRITE_TO_FILE <- function(x) { | |
if (nchar(x) >0 ) { | |
write.table(x, file="Twitter Stream Capture.txt", append=T, row.names=F, col.names=F) | |
} | |
} | |
### windows users will need to get this certificate to authenticate | |
download.file(url="http://curl.haxx.se/ca/cacert.pem", destfile="cacert.pem") | |
### write the raw JSON data from the Twitter Firehouse to a text file | |
getURL("https://stream.twitter.com/1/statuses/sample.json", | |
userpwd=USER:PASSWORD, | |
cainfo = "cacert.pem", | |
write=WRITE_TO_FILE) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment