jblumenau · May 29, 2016 02:24
diff --git a/scraper.R b/scraper.R
 #############################################
 ### A very messy example of scraping in R ###
 #############################################

 rm(list=ls()) # Remove everything from your current workspace

 ######################
 ### Load libraries ###
 ######################

 library(ggplot2)
 library(data.table)
 library(XML)
 library(RCurl)
 library(reshape2)
 library(scales)

 trim <- function(s) gsub("^[[:space:]]+|[[:space:]]+$","",s) # This function trims all whitespace from a character vector

 ########################
 ### DATA COLLECTION  ###
 ########################

 ## Relevant URLs from Ipsos MORI
 urls <- list(
 	"https://www.ipsos-mori.com/researchpublications/researcharchive/poll.aspx?oItemId=2437",
 	"https://www.ipsos-mori.com/researchpublications/researcharchive/poll.aspx?oItemId=2438&view=wide",
 	"https://www.ipsos-mori.com/researchpublications/researcharchive/poll.aspx?oItemId=88&view=wide")


 ## Loop over the URLs

 out.list <- list()

 for(i in 1:length(urls)){ # Do the following for each url

 lines <- getURL(urls[i]) # Load the html code from the relevant page

 data <- htmlTreeParse(lines,useInternal=TRUE) # Parse it so that you have something you can navigate through

 tables <- xpathApply(data,"//*[@class='datatable']") # Grab all the data tables from that page

 years <- lapply(tables,function(x)xmlValue(x[["thead"]][["tr"]][["th"]])) # Grab all the years from that page

 leaders.list <- list()
 for(t in 1:length(tables)) leaders.list[[t]] <- unlist(lapply(tables[[t]][["thead"]][["tr"]]["th"][-1],xmlValue)) # Find each leader in each table

 approval.data.list <- list()

 for(t in 1:length(tables)) { ## Loop over extracted tables to actually get the data

 tmp.approval.data <- readHTMLTable(tables[[t]],header=T) 

 # Clean it all up a bit

 names(tmp.approval.data)[which(1:dim(tmp.approval.data)[2]%%2==0)] <- paste(leaders.list[[t]],"Sat",sep=" ")
 names(tmp.approval.data)[which(1:dim(tmp.approval.data)[2]%%2==1)[-1]] <- paste(leaders.list[[t]],"Dis",sep=" ")
 names(tmp.approval.data)[1] <- "Date"

 tmp.approval.data <- melt(tmp.approval.data,id.vars="Date")
 tmp.approval.data$Year <- years[[t]]
 tmp.approval.data$Sat <- "Sat"
 tmp.approval.data$Sat[grep("Dis",tmp.approval.data$variable)] <- "Dis"
 tmp.approval.data$variable <- gsub(" Sat","",tmp.approval.data$variable)
 tmp.approval.data$variable <- gsub(" Dis","",tmp.approval.data$variable)
 approval.data.list[[t]] <- tmp.approval.data

 }

 # Bind everything together

 approval.data <- do.call("rbind", approval.data.list)

 approval.data <- approval.data[!is.na(approval.data$value),]
 approval.data <- approval.data[approval.data$value!="",]
 approval.data <- approval.data[approval.data$value!="-",]
 approval.data <- approval.data[!nchar(approval.data$value)>2,]
 approval.data$Date <- gsub("[[:digit:]]","",approval.data$Date)
 approval.data$Date <- gsub("-","",approval.data$Date)
 approval.data$Date <- trim(approval.data$Date)
 approval.data$Date <- unlist(lapply(strsplit(approval.data$Date," "),function(x)x[1]))

 approval.data <- approval.data[,c(2,1,4,3,5)]
 names(approval.data) <- c("Leader","Month","Year","Rating","Satisfied")

 out.list[[i]] <- approval.data

 print(i)
 }

 ## Bind everything together

 out <- do.call("rbind", out.list)
 out$Leader[out$Leader==" "] <- "Campbell"

 out$Leader <- trim(out$Leader)

 ## Create party var

 tories <- c("Thatcher","Major","Hague","Duncan Smith","Howard","Cameron")
 labour <- c("Callaghan","Foot","Kinnock","Smith","Blair","Brown","Miliband")
 libdem <- c("Steel","Ashdown","Kennedy","Campbell","Clegg")
 sdp <- c("Jenkins","Owen","MacLennan")
 ukip <- "Farage"
 gov <- "Gov't"

 out$Party <- "Conservatives"
 out$Party[out$Leader%in%labour] <- "Labour"
 out$Party[out$Leader%in%libdem] <- "Liberal Democrats"
 out$Party[out$Leader%in%sdp] <- "SDP"
 out$Party[out$Leader%in%ukip] <- "UKIP"
 out$Party[out$Leader%in%gov] <- "Government"

 ## Clean up some other variables (date, rating)

 out$Rating <- as.numeric(out$Rating)
 out$chrDate <- paste(out$Month,out$Year)
 out$monthNum <- match(out$Month,month.name)
 out$chrDate <- paste("01",out$monthNum,out$Year,sep="/")
 out$Date <- as.Date(out$chrDate,"%d/%m/%Y")

 ## Create DV: satisfied - dissatisfied

 out.dt <- data.table(out[out$Satisfied=="Sat",],DisRating=out[out$Satisfied=="Dis",c("Rating")]) # (note switch to data.table from data.frame)

 out.dt[,diffRating := Rating - DisRating] # Create diff var
 out.dt[,meanDiffRating:=mean(diffRating),by=list(Party,Date)] # calculate mean diff var by party, by date

 out.dt[,duplicates:=duplicated(Party),by=list(Date)] # drop duplicate observations
 out.dt <- out.dt[duplicates==F] 

 out.dt$diffRating <- out.dt$meanDiffRating #replalce var

 ## Only keep Lab, Con, Lib Dem

 out.dt <- out.dt[Party%in%c("Labour","Conservatives","Liberal Democrats")]
 out.dt$Party <- as.factor(out.dt$Party)

 #################
 ### PLOTTING  ###
 #################

 party.colours <- c("#0087DC","#D50000","#FDBB30","#FFFF00","#008142","#99CC33","#70147A","#DDDDDD")
 elections <- as.Date(c("01-05-1979","01-05-1983","01-05-1987","01-05-1992","01-05-1997","01-05-2001","01-05-2005","01-05-2010","01-05-2015"),"%d-%m-%Y")

 plot(x=out.dt$Date,
 		y=out.dt$diffRating,
 		col=alpha(party.colours[as.numeric(out.dt$Party)],0.2),
 		pch=19,cex=0.5,
 		bty="n",
 		ylab="Net satisfaction",
 		xlab="Election",
 		xaxt="n",
 		main="Party leader net satisfaction")
 for(p in 1:length(levels(out.dt$Party))) { # Loop over parties to plot loess line
 	out.tmp <- out.dt[out.dt$Party==levels(out.dt$Party)[p]]
 lines(out.tmp$Date,	predict(loess(out.tmp$diffRating ~ as.numeric(out.tmp$Date),span=0.1)),col=party.colours[1:3][p],lwd=4)
 }

 abline(v=elections,lty=2,col="gray") # Add gray lines for the elections
 axis(1,at=elections,labels=format(elections,"%Y")) # Add an axis
 legend("topright",c("Conservatives", "Labour","Liberal Democrats"), pch = 19, col=party.colours[1:3],box.col="white",bg="white") # Add a legend
	#############################################
	### A very messy example of scraping in R ###
	#############################################

	rm(list=ls()) # Remove everything from your current workspace

	######################
	### Load libraries ###
	######################

	library(ggplot2)
	library(data.table)
	library(XML)
	library(RCurl)
	library(reshape2)
	library(scales)

	trim <- function(s) gsub("^[[:space:]]+\|[[:space:]]+$","",s) # This function trims all whitespace from a character vector

	########################
	### DATA COLLECTION ###
	########################

	## Relevant URLs from Ipsos MORI
	urls <- list(
	"https://www.ipsos-mori.com/researchpublications/researcharchive/poll.aspx?oItemId=2437",
	"https://www.ipsos-mori.com/researchpublications/researcharchive/poll.aspx?oItemId=2438&view=wide",
	"https://www.ipsos-mori.com/researchpublications/researcharchive/poll.aspx?oItemId=88&view=wide")


	## Loop over the URLs

	out.list <- list()

	for(i in 1:length(urls)){ # Do the following for each url

	lines <- getURL(urls[i]) # Load the html code from the relevant page

	data <- htmlTreeParse(lines,useInternal=TRUE) # Parse it so that you have something you can navigate through

	tables <- xpathApply(data,"//*[@class='datatable']") # Grab all the data tables from that page

	years <- lapply(tables,function(x)xmlValue(x[["thead"]][["tr"]][["th"]])) # Grab all the years from that page

	leaders.list <- list()
	for(t in 1:length(tables)) leaders.list[[t]] <- unlist(lapply(tables[[t]][["thead"]][["tr"]]["th"][-1],xmlValue)) # Find each leader in each table

	approval.data.list <- list()

	for(t in 1:length(tables)) { ## Loop over extracted tables to actually get the data

	tmp.approval.data <- readHTMLTable(tables[[t]],header=T)

	# Clean it all up a bit

	names(tmp.approval.data)[which(1:dim(tmp.approval.data)[2]%%2==0)] <- paste(leaders.list[[t]],"Sat",sep=" ")
	names(tmp.approval.data)[which(1:dim(tmp.approval.data)[2]%%2==1)[-1]] <- paste(leaders.list[[t]],"Dis",sep=" ")
	names(tmp.approval.data)[1] <- "Date"

	tmp.approval.data <- melt(tmp.approval.data,id.vars="Date")
	tmp.approval.data$Year <- years[[t]]
	tmp.approval.data$Sat <- "Sat"
	tmp.approval.data$Sat[grep("Dis",tmp.approval.data$variable)] <- "Dis"
	tmp.approval.data$variable <- gsub(" Sat","",tmp.approval.data$variable)
	tmp.approval.data$variable <- gsub(" Dis","",tmp.approval.data$variable)
	approval.data.list[[t]] <- tmp.approval.data

	}

	# Bind everything together

	approval.data <- do.call("rbind", approval.data.list)

	approval.data <- approval.data[!is.na(approval.data$value),]
	approval.data <- approval.data[approval.data$value!="",]
	approval.data <- approval.data[approval.data$value!="-",]
	approval.data <- approval.data[!nchar(approval.data$value)>2,]
	approval.data$Date <- gsub("[[:digit:]]","",approval.data$Date)
	approval.data$Date <- gsub("-","",approval.data$Date)
	approval.data$Date <- trim(approval.data$Date)
	approval.data$Date <- unlist(lapply(strsplit(approval.data$Date," "),function(x)x[1]))

	approval.data <- approval.data[,c(2,1,4,3,5)]
	names(approval.data) <- c("Leader","Month","Year","Rating","Satisfied")

	out.list[[i]] <- approval.data

	print(i)
	}

	## Bind everything together

	out <- do.call("rbind", out.list)
	out$Leader[out$Leader==" "] <- "Campbell"

	out$Leader <- trim(out$Leader)

	## Create party var

	tories <- c("Thatcher","Major","Hague","Duncan Smith","Howard","Cameron")
	labour <- c("Callaghan","Foot","Kinnock","Smith","Blair","Brown","Miliband")
	libdem <- c("Steel","Ashdown","Kennedy","Campbell","Clegg")
	sdp <- c("Jenkins","Owen","MacLennan")
	ukip <- "Farage"
	gov <- "Gov't"

	out$Party <- "Conservatives"
	out$Party[out$Leader%in%labour] <- "Labour"
	out$Party[out$Leader%in%libdem] <- "Liberal Democrats"
	out$Party[out$Leader%in%sdp] <- "SDP"
	out$Party[out$Leader%in%ukip] <- "UKIP"
	out$Party[out$Leader%in%gov] <- "Government"

	## Clean up some other variables (date, rating)

	out$Rating <- as.numeric(out$Rating)
	out$chrDate <- paste(out$Month,out$Year)
	out$monthNum <- match(out$Month,month.name)
	out$chrDate <- paste("01",out$monthNum,out$Year,sep="/")
	out$Date <- as.Date(out$chrDate,"%d/%m/%Y")

	## Create DV: satisfied - dissatisfied

	out.dt <- data.table(out[out$Satisfied=="Sat",],DisRating=out[out$Satisfied=="Dis",c("Rating")]) # (note switch to data.table from data.frame)

	out.dt[,diffRating := Rating - DisRating] # Create diff var
	out.dt[,meanDiffRating:=mean(diffRating),by=list(Party,Date)] # calculate mean diff var by party, by date

	out.dt[,duplicates:=duplicated(Party),by=list(Date)] # drop duplicate observations
	out.dt <- out.dt[duplicates==F]

	out.dt$diffRating <- out.dt$meanDiffRating #replalce var

	## Only keep Lab, Con, Lib Dem

	out.dt <- out.dt[Party%in%c("Labour","Conservatives","Liberal Democrats")]
	out.dt$Party <- as.factor(out.dt$Party)

	#################
	### PLOTTING ###
	#################

	party.colours <- c("#0087DC","#D50000","#FDBB30","#FFFF00","#008142","#99CC33","#70147A","#DDDDDD")
	elections <- as.Date(c("01-05-1979","01-05-1983","01-05-1987","01-05-1992","01-05-1997","01-05-2001","01-05-2005","01-05-2010","01-05-2015"),"%d-%m-%Y")

	plot(x=out.dt$Date,
	y=out.dt$diffRating,
	col=alpha(party.colours[as.numeric(out.dt$Party)],0.2),
	pch=19,cex=0.5,
	bty="n",
	ylab="Net satisfaction",
	xlab="Election",
	xaxt="n",
	main="Party leader net satisfaction")
	for(p in 1:length(levels(out.dt$Party))) { # Loop over parties to plot loess line
	out.tmp <- out.dt[out.dt$Party==levels(out.dt$Party)[p]]
	lines(out.tmp$Date, predict(loess(out.tmp$diffRating ~ as.numeric(out.tmp$Date),span=0.1)),col=party.colours[1:3][p],lwd=4)
	}

	abline(v=elections,lty=2,col="gray") # Add gray lines for the elections
	axis(1,at=elections,labels=format(elections,"%Y")) # Add an axis
	legend("topright",c("Conservatives", "Labour","Liberal Democrats"), pch = 19, col=party.colours[1:3],box.col="white",bg="white") # Add a legend