Skip to content

Instantly share code, notes, and snippets.

@jblumenau
Created May 29, 2016 02:24
Show Gist options
  • Save jblumenau/a2db9f5481b2119854d0e44f29269b69 to your computer and use it in GitHub Desktop.
Save jblumenau/a2db9f5481b2119854d0e44f29269b69 to your computer and use it in GitHub Desktop.
Script to demonstrate scraping potential of R (created as a teaching example)
#############################################
### A very messy example of scraping in R ###
#############################################
rm(list=ls()) # Remove everything from your current workspace
######################
### Load libraries ###
######################
library(ggplot2)
library(data.table)
library(XML)
library(RCurl)
library(reshape2)
library(scales)
trim <- function(s) gsub("^[[:space:]]+|[[:space:]]+$","",s) # This function trims all whitespace from a character vector
########################
### DATA COLLECTION ###
########################
## Relevant URLs from Ipsos MORI
urls <- list(
"https://www.ipsos-mori.com/researchpublications/researcharchive/poll.aspx?oItemId=2437",
"https://www.ipsos-mori.com/researchpublications/researcharchive/poll.aspx?oItemId=2438&view=wide",
"https://www.ipsos-mori.com/researchpublications/researcharchive/poll.aspx?oItemId=88&view=wide")
## Loop over the URLs
out.list <- list()
for(i in 1:length(urls)){ # Do the following for each url
lines <- getURL(urls[i]) # Load the html code from the relevant page
data <- htmlTreeParse(lines,useInternal=TRUE) # Parse it so that you have something you can navigate through
tables <- xpathApply(data,"//*[@class='datatable']") # Grab all the data tables from that page
years <- lapply(tables,function(x)xmlValue(x[["thead"]][["tr"]][["th"]])) # Grab all the years from that page
leaders.list <- list()
for(t in 1:length(tables)) leaders.list[[t]] <- unlist(lapply(tables[[t]][["thead"]][["tr"]]["th"][-1],xmlValue)) # Find each leader in each table
approval.data.list <- list()
for(t in 1:length(tables)) { ## Loop over extracted tables to actually get the data
tmp.approval.data <- readHTMLTable(tables[[t]],header=T)
# Clean it all up a bit
names(tmp.approval.data)[which(1:dim(tmp.approval.data)[2]%%2==0)] <- paste(leaders.list[[t]],"Sat",sep=" ")
names(tmp.approval.data)[which(1:dim(tmp.approval.data)[2]%%2==1)[-1]] <- paste(leaders.list[[t]],"Dis",sep=" ")
names(tmp.approval.data)[1] <- "Date"
tmp.approval.data <- melt(tmp.approval.data,id.vars="Date")
tmp.approval.data$Year <- years[[t]]
tmp.approval.data$Sat <- "Sat"
tmp.approval.data$Sat[grep("Dis",tmp.approval.data$variable)] <- "Dis"
tmp.approval.data$variable <- gsub(" Sat","",tmp.approval.data$variable)
tmp.approval.data$variable <- gsub(" Dis","",tmp.approval.data$variable)
approval.data.list[[t]] <- tmp.approval.data
}
# Bind everything together
approval.data <- do.call("rbind", approval.data.list)
approval.data <- approval.data[!is.na(approval.data$value),]
approval.data <- approval.data[approval.data$value!="",]
approval.data <- approval.data[approval.data$value!="-",]
approval.data <- approval.data[!nchar(approval.data$value)>2,]
approval.data$Date <- gsub("[[:digit:]]","",approval.data$Date)
approval.data$Date <- gsub("-","",approval.data$Date)
approval.data$Date <- trim(approval.data$Date)
approval.data$Date <- unlist(lapply(strsplit(approval.data$Date," "),function(x)x[1]))
approval.data <- approval.data[,c(2,1,4,3,5)]
names(approval.data) <- c("Leader","Month","Year","Rating","Satisfied")
out.list[[i]] <- approval.data
print(i)
}
## Bind everything together
out <- do.call("rbind", out.list)
out$Leader[out$Leader==" "] <- "Campbell"
out$Leader <- trim(out$Leader)
## Create party var
tories <- c("Thatcher","Major","Hague","Duncan Smith","Howard","Cameron")
labour <- c("Callaghan","Foot","Kinnock","Smith","Blair","Brown","Miliband")
libdem <- c("Steel","Ashdown","Kennedy","Campbell","Clegg")
sdp <- c("Jenkins","Owen","MacLennan")
ukip <- "Farage"
gov <- "Gov't"
out$Party <- "Conservatives"
out$Party[out$Leader%in%labour] <- "Labour"
out$Party[out$Leader%in%libdem] <- "Liberal Democrats"
out$Party[out$Leader%in%sdp] <- "SDP"
out$Party[out$Leader%in%ukip] <- "UKIP"
out$Party[out$Leader%in%gov] <- "Government"
## Clean up some other variables (date, rating)
out$Rating <- as.numeric(out$Rating)
out$chrDate <- paste(out$Month,out$Year)
out$monthNum <- match(out$Month,month.name)
out$chrDate <- paste("01",out$monthNum,out$Year,sep="/")
out$Date <- as.Date(out$chrDate,"%d/%m/%Y")
## Create DV: satisfied - dissatisfied
out.dt <- data.table(out[out$Satisfied=="Sat",],DisRating=out[out$Satisfied=="Dis",c("Rating")]) # (note switch to data.table from data.frame)
out.dt[,diffRating := Rating - DisRating] # Create diff var
out.dt[,meanDiffRating:=mean(diffRating),by=list(Party,Date)] # calculate mean diff var by party, by date
out.dt[,duplicates:=duplicated(Party),by=list(Date)] # drop duplicate observations
out.dt <- out.dt[duplicates==F]
out.dt$diffRating <- out.dt$meanDiffRating #replalce var
## Only keep Lab, Con, Lib Dem
out.dt <- out.dt[Party%in%c("Labour","Conservatives","Liberal Democrats")]
out.dt$Party <- as.factor(out.dt$Party)
#################
### PLOTTING ###
#################
party.colours <- c("#0087DC","#D50000","#FDBB30","#FFFF00","#008142","#99CC33","#70147A","#DDDDDD")
elections <- as.Date(c("01-05-1979","01-05-1983","01-05-1987","01-05-1992","01-05-1997","01-05-2001","01-05-2005","01-05-2010","01-05-2015"),"%d-%m-%Y")
plot(x=out.dt$Date,
y=out.dt$diffRating,
col=alpha(party.colours[as.numeric(out.dt$Party)],0.2),
pch=19,cex=0.5,
bty="n",
ylab="Net satisfaction",
xlab="Election",
xaxt="n",
main="Party leader net satisfaction")
for(p in 1:length(levels(out.dt$Party))) { # Loop over parties to plot loess line
out.tmp <- out.dt[out.dt$Party==levels(out.dt$Party)[p]]
lines(out.tmp$Date, predict(loess(out.tmp$diffRating ~ as.numeric(out.tmp$Date),span=0.1)),col=party.colours[1:3][p],lwd=4)
}
abline(v=elections,lty=2,col="gray") # Add gray lines for the elections
axis(1,at=elections,labels=format(elections,"%Y")) # Add an axis
legend("topright",c("Conservatives", "Labour","Liberal Democrats"), pch = 19, col=party.colours[1:3],box.col="white",bg="white") # Add a legend
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment