Basic Web Mining with R
###############################################################################
# Author: @BrockTibert
#
# Used: R Version 2.12.1, Windows 7 Pro, StatET Plugin for Eclipse
#
#
###############################################################################
#-----------------------------------------------------------------------
# set up script level basics
#-----------------------------------------------------------------------
## I code on a wide screen, so widen the console output
options(width=180)
## libraries
library(XML)
library(plyr)
library(gdata)
library(sqldf)
## directory for the project
DIR <- "C:/Users/Brock/Documents/My Dropbox/Projects/NHL"
setwd(DIR)
#-----------------------------------------------------------------------
# Grab the skaters data from the season page
#-----------------------------------------------------------------------
## set the season
SEASON <- "2011"
## create the URL
URL <- paste("http://www.hockey-reference.com/leagues/NHL_", SEASON, "_skaters.html", sep="")
## grab the page -- the table is parsed nicely
tables <- readHTMLTable(URL)
ds.skaters <- tables$stats
## I don't like dealing with factors if I don't have to
## and I prefer lower case
names(ds.skaters) <- tolower(colnames(ds.skaters))
for(i in 1:ncol(ds.skaters)) {
  ds.skaters[,i] <- as.character(ds.skaters[,i])
}
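## (the same conversion as a one-liner, if you prefer; lapply keeps the
## data frame shape)
# ds.skaters[] <- lapply(ds.skaters, as.character)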
## fix a couple of the column names
colnames(ds.skaters)
names(ds.skaters)[10] <- "plusmin"
names(ds.skaters)[17] <- "spct"
## finally fix the columns - NAs forced by coercion warnings
for(i in c(1, 3, 6:18)) {
  ds.skaters[,i] <- as.numeric(ds.skaters[, i])
}
## change the avg time on ice to a time format
ds.skaters$atoi <- strptime(ds.skaters$atoi, "%M:%S")
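## a sketch of an alternative to the strptime() line above (run instead of,
## not after, it): strptime() returns POSIXlt, which is awkward inside a data
## frame, so split the "MM:SS" string and compute numeric seconds directly.
## atoi.sec is a made-up column name.
# mmss <- strsplit(ds.skaters$atoi, ":")
# ds.skaters$atoi.sec <- sapply(mmss, function(x) as.numeric(x[1])*60 + as.numeric(x[2]))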
## remove the header and totals row
ds.skaters <- ds.skaters[!is.na(ds.skaters$rk), ]
ds.skaters <- ds.skaters[ds.skaters$tm != "TOT", ]
## save the datafile
write.table(ds.skaters, "Skaters.csv", sep=",", row.names=F)
## convert toi to seconds, and seconds/game
ds.skaters$seconds <- (ds.skaters$toi*60)/ds.skaters$gp
#-----------------------------------------------------------------------
# Quick overview of the 2011 skater data
#-----------------------------------------------------------------------
## quick look of the dataset
summary(ds.skaters)
## Best and worst +/-
head(ds.skaters[ds.skaters$plusmin == max(ds.skaters$plusmin), ])
head(ds.skaters[ds.skaters$plusmin == min(ds.skaters$plusmin), ]) # how is that deal working out for NJ?
## select the columns to analyze
play <- ds.skaters[, c(7:17, 20)]
## fix the NA's to zeros
for(i in 1:ncol(play)) {
  ## which(is.na(play[,i])) gives the rows where column i is NA
  play[which(is.na(play[,i])), i] <- 0
}
## look at the correlations
summary(play)
round(cor(play), 2)
#-----------------------------------------------------------------------
# Use PCA and clustering techniques to group the players
#-----------------------------------------------------------------------
## PCA analysis
## http://www.statmethods.net/advstats/factor.html
## with cor=F (covariance matrix) the 1st component alone explained ~99% of
## the variance, so use the correlation matrix (cor=T) instead
play.pca <- princomp(play, cor=T)
## what did we get for data?
summary(play.pca)
plot(play.pca)
png("PCA biplot.png")
biplot(play.pca)
dev.off()
## let's just use the first 5 components, ~86% of the variance
play.pca.comp <- play.pca$scores[, 1:5]
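## sanity check on the variance claim above: princomp() stores the component
## standard deviations in $sdev, so the cumulative proportion of variance is
cumsum(play.pca$sdev^2) / sum(play.pca$sdev^2)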
## use hierarchical clustering to get a sense of how many clusters
## to feed to the kmeans procedure - need to feed distance matrix
## pull 40 random cases into the procedure
dist.mat <- dist(play.pca.comp[sample(1:nrow(play.pca.comp), 40), ], method="euclidean")
play.hclust <- hclust(dist.mat, method="ward")
png("Dendrogram.png")
plot(play.hclust)
dev.off()
## I am going to try 4 clusters
kmean4 <- kmeans(play.pca.comp, centers=4, iter.max=12000)
kmean4$withinss
kmean4$size
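## a quick elbow check in case 4 isn't obvious from the dendrogram: total
## within-cluster SS for k = 2..8 (a sketch; the seed is only for repeatability)
set.seed(1234)
wss <- sapply(2:8, function(k) sum(kmeans(play.pca.comp, centers=k, iter.max=12000)$withinss))
plot(2:8, wss, type="b", xlab="k", ylab="Total within-cluster SS")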
#-----------------------------------------------------------------------
# Analyze the clusters
#-----------------------------------------------------------------------
## add the cluster membership to the player data
skaters <- cbind(ds.skaters, kmean4$cluster)
names(skaters)[ncol(skaters)] <- "cluster"
## make them a factor
skaters$cluster2 <- as.factor(skaters$cluster)
skaters$cluster2 <- reorder(skaters$cluster2, new.order=c(4,2,1,3))
## profile clusters by a few variables
table(skaters$cluster2)
ddply(skaters, .(cluster), summarise, mean.pts = mean(pts, na.rm=TRUE))
## boxplot these clusters
png("4 Cluster by Points Boxplot.png")
boxplot(pts ~ cluster2, data=skaters, main="Boxplot of Clusters by Points \nAs of Feb 7, 2011")
dev.off()
## cluster membership by team
addmargins(table(skaters$tm, skaters$cluster2))
(team.dist <- round(prop.table(table(skaters$cluster2, skaters$tm), 2), 3))
## plot of the table
## Rotate labels = http://stackoverflow.com/questions/1828742/rotating-axis-labels-in-r
## http://research.stowers-institute.org/efg/R/Color/Chart/
## type colors() to see the full list of named colors for the chart above
mycol <- c("red4", "orangered", "burlywood1", "beige")
png("Team Distribution.png")
barplot(team.dist, horiz=T, cex.names=.75, las=1,
col=mycol, main="Distribution of Clusters by Team", xlab="% of Team")
abline(v=.25, lty=2)
abline(v=.5, lty=2)
abline(v=.75, lty=2)
dev.off()
#######################################################################################
# Function to scrape season skater statistics from Hockey-reference.com
#######################################################################################
GrabSkaters <- function(S) {
  # Takes S, a string representing the season; returns a data frame
  require(XML)
  ## create the URL
  URL <- paste("http://www.hockey-reference.com/leagues/NHL_",
      S, "_skaters.html", sep="")
  ## grab the page -- the table is parsed nicely
  tables <- readHTMLTable(URL)
  ds.skaters <- tables$stats
  ## determine if the HTML table was well formed (column names are the first record)
  ## can either read in directly or need to force column names
  ## I don't like dealing with factors if I don't have to, and I prefer lower case
  names(ds.skaters) <- tolower(colnames(ds.skaters))
  for(i in 1:ncol(ds.skaters)) {
    ds.skaters[,i] <- as.character(ds.skaters[,i])
  }
  ## fix a couple of the column names (note: the indices differ from the
  ## standalone script above)
  names(ds.skaters)[11] <- "plusmin"
  names(ds.skaters)[18] <- "spct"
  ## finally fix the columns - NAs forced by coercion warnings
  for(i in c(1, 3, 6:18)) {
    ds.skaters[,i] <- as.numeric(ds.skaters[, i])
  }
  ## convert toi to seconds, and seconds/game
  ## ds.skaters$seconds <- (ds.skaters$toi*60)/ds.skaters$gp
  ## remove the header rows (keep the TOT rows here; they are filtered later)
  ds.skaters <- ds.skaters[!is.na(ds.skaters$rk), ]
  ## ds.skaters <- ds.skaters[ds.skaters$tm != "TOT", ]
  ## add the year
  ds.skaters$season <- S
  ## return the dataframe
  return(ds.skaters)
}
# Source the file with the Grab Skaters function
setwd("~/My Dropbox/Eclipse/Projects/R/NHL/Blog Posts/Scripts")
source("SkaterStats.R")
## set the working directory
setwd("~/My Dropbox/Eclipse/Projects/R/NHL/Blog Posts/30 Goals 100 PIM")
#-----------------------------------------------------------------------
# Use the function to loop over the seasons and piece together
#-----------------------------------------------------------------------
## define the seasons -- the 2005 dataset doesn't exist (lockout)
## if I were a good coder I would trap the error, but this works for now
## (a tryCatch() sketch follows the loop below)
SEASON <- as.character(c(1960:2004, 2006:2011))
## create an empty dataset that we will append to
dataset <- data.frame()
## loop over the seasons, use the function to grab the data
## and build the dataset
require(plyr)
for (S in SEASON) {
  temp <- GrabSkaters(S)
  dataset <- rbind.fill(dataset, temp)
  print(paste("Completed Season ", S, sep=""))
  ## pause the script so we don't kill their servers
  Sys.sleep(10)
}
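## the error trapping mentioned above, sketched with tryCatch(): a bad season
## (e.g. the missing 2005 page) would be skipped instead of killing the loop
# for (S in as.character(1960:2011)) {
#   temp <- tryCatch(GrabSkaters(S), error = function(e) NULL)
#   if (!is.null(temp)) dataset <- rbind.fill(dataset, temp)
#   Sys.sleep(10)
# }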
head(dataset); tail(dataset);
dataset <- dataset[dataset$tm != 'TOT', ]
save(dataset, file="hr skater db.Rdata")
#-----------------------------------------------------------------------
# Let's get a sense for how many players finish 30/100
#-----------------------------------------------------------------------
# Let's look at a scatterplot of the data
png("Scatter Plot.png")
plot(x=dataset$pim, y=dataset$g, type="p", pch=20, cex=.9,
xlab="Penalty Mins", ylab="Goals Scored", bty="n",
main="Correlate Goals Scored and Penalty Minutes")
abline(h=30, lty=2, col="red"); abline(v=100, lty=2, col="red");
dev.off()
# raw output
dstemp <- transform(dataset, ind30100 = as.numeric(g >=30 & pim >=100))
str(dstemp)
with(dstemp, addmargins(table(season, ind30100)))
# % of players that meet criteria per season
library(sqldf)
(pct <- sqldf("SELECT season, AVG(ind30100) as pct FROM dstemp GROUP BY season"))
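## the same summary in base R, as a cross-check on the sqldf result
aggregate(ind30100 ~ season, data=dstemp, FUN=mean)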
# basic plot of the data
barplot(pct$pct, names.arg=pct$season)
# trying to learn ggplot
library(ggplot2)
ds <- dstemp[dstemp$g >= 30 & dstemp$pim >= 100, c("season", "player")]
min(as.numeric(ds$season)); max(as.numeric(ds$season));
p <- ggplot(ds, aes(as.numeric(season))) + geom_histogram(fill="black", colour="white", binwidth=1)
p <- p + labs(x="Season", y="# Players")
ggsave(p, file="Distribution.png")
# look at a listing of the data again
View(pct)
# Who are the players in the 2011 season that meet this criteria
dataset[dataset$season == "2011" & dataset$g >= 30 & dataset$pim >=100, c("player", "g", "pim")]
#### install the packages used for the discussion
install.packages("twitteR")
install.packages("sqldf")
install.packages("XML")
install.packages("RCurl") # R interface to libcurl
#==============================================================================
# Read in already existing data from the web
#==============================================================================
##### read in a CSV dataset from yahoo finance
yahoo <- read.csv("http://ichart.finance.yahoo.com/table.csv?s=AAPL&a=08&b=7&c=1984&d=10&e=28&f=2011&g=w&ignore=.csv",
header=T, stringsAsFactors=F)
dim(yahoo) # check the rows/columns
head(yahoo, n=5) # look at the first 5 rows
str(yahoo) # have R tell us the structure of the data frame
plot(yahoo[,7], type="l") # data are in reverse chronological order
### fix the data to make a plot
yahoo$Date <- as.Date(yahoo$Date) # tell R that this column is a Date
yahoo.2 <- yahoo[order(yahoo$Date),] # sort the data Ascending order
plot(x=yahoo.2$Date, y=yahoo.2[,7], type="l",
xlab="Date", ylab="Adjusted Close",
main="Weekly Adjusted Closing Price for Apple (AAPL)") # plot the data
#==============================================================================
# Read in HTML tables
# http://stackoverflow.com/questions/4393780/scraping-a-wiki-page-for-the-periodic-table-and-all-the-links
# http://stackoverflow.com/questions/1395528/scraping-html-tables-into-r-data-frames-using-the-xml-package
#==============================================================================
# let's parse NHL data
library(XML)
URL <- paste("http://www.hockey-reference.com/leagues/NHL_",
2011, "_skaters.html", sep="")
tables <- readHTMLTable(URL) # parses all of the tables for us
ds.skaters <- tables$stats # if the table is named (View Source Code), can reference by name
dim(ds.skaters)
#==============================================================================
# Collect data from twitter
#==============================================================================
library(twitteR)
rstats <- searchTwitter("#rstats", n=1500) #searches twitter for the last 1500 tweets
length(rstats) # but only searches back a week or so
mode(rstats) # not a data frame, so we need to convert it to rows/columns
rstats.df <- twListToDF(rstats)
dim(rstats.df)
colnames(rstats.df) # print the column names
length(unique(rstats.df$screenName)) # how many different twitter users
library(sqldf)
temp <- rstats.df
twts <- sqldf("SELECT screenName as name, count(*) as tweets FROM temp
GROUP BY screenName ORDER BY count(*) desc")
head(twts, n=10) # top 10 tweeters about rstats
hist(twts[,2], xlab="# Rstats Tweets", main="")
#==============================================================================
# Social Network Graph of rstats users
# http://www.drewconway.com/zia/?p=1471
#==============================================================================
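## not Drew Conway's code -- just a minimal sketch of the idea: build an
## edge list from screenName -> @mentions in the tweet text, then plot it
## with igraph (assumes rstats.df from above is still in the workspace)
# library(igraph)
# mentions <- regmatches(rstats.df$text, gregexpr("@\\w+", rstats.df$text))
# edges <- do.call(rbind, lapply(seq_along(mentions), function(i) {
#   if (length(mentions[[i]]) == 0) return(NULL)
#   cbind(rstats.df$screenName[i], sub("@", "", mentions[[i]]))
# }))
# g <- graph.edgelist(edges)
# plot(g, vertex.size=3, vertex.label=NA, edge.arrow.size=.3)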
#==============================================================================
# Streaming twitter using RCURL
#==============================================================================
library(RCurl)
library(rjson)
# set the directory
setwd("C:\\Documents and Settings\\user\\Desktop\\Web Mining Pres")
#### redirects output to a file, but getting the data back into R is still
#### fiddly (see the parsing sketch at the end of this script)
WRITE_TO_FILE <- function(x) {
  if (nchar(x) > 0) {
    write.table(x, file="Twitter Stream Capture.txt", append=T, row.names=F, col.names=F)
  }
}
### windows users will need to get this certificate to authenticate
download.file(url="http://curl.haxx.se/ca/cacert.pem", destfile="cacert.pem")
### write the raw JSON from the Twitter streaming API (sample stream) to a text file
getURL("https://stream.twitter.com/1/statuses/sample.json",
    userpwd="USER:PASSWORD", # replace with your twitter username:password
    cainfo = "cacert.pem",
    write=WRITE_TO_FILE)
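## a hedged sketch for getting the capture back into R: read the file line by
## line and parse each record with rjson::fromJSON, skipping anything that
## doesn't parse (keep-alives, records split across chunks). This assumes
## roughly one JSON object per line, which the callback above doesn't guarantee.
# raw <- readLines("Twitter Stream Capture.txt")
# tweets <- lapply(raw, function(x) tryCatch(fromJSON(x), error = function(e) NULL))
# tweets <- tweets[!sapply(tweets, is.null)]
# length(tweets)  # number of parsed status records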