Created
May 29, 2016 02:24
-
-
Save jblumenau/a2db9f5481b2119854d0e44f29269b69 to your computer and use it in GitHub Desktop.
Script to demonstrate scraping potential of R (created as a teaching example)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################# | |
### A very messy example of scraping in R ### | |
############################################# | |
rm(list=ls()) # Remove everything from your current workspace | |
###################### | |
### Load libraries ### | |
###################### | |
library(ggplot2) | |
library(data.table) | |
library(XML) | |
library(RCurl) | |
library(reshape2) | |
library(scales) | |
trim <- function(s) gsub("^[[:space:]]+|[[:space:]]+$","",s) # This function trims all whitespace from a character vector | |
######################## | |
### DATA COLLECTION ### | |
######################## | |
## Relevant URLs from Ipsos MORI | |
urls <- list( | |
"https://www.ipsos-mori.com/researchpublications/researcharchive/poll.aspx?oItemId=2437", | |
"https://www.ipsos-mori.com/researchpublications/researcharchive/poll.aspx?oItemId=2438&view=wide", | |
"https://www.ipsos-mori.com/researchpublications/researcharchive/poll.aspx?oItemId=88&view=wide") | |
## Loop over the URLs | |
out.list <- list() | |
for(i in 1:length(urls)){ # Do the following for each url | |
lines <- getURL(urls[i]) # Load the html code from the relevant page | |
data <- htmlTreeParse(lines,useInternal=TRUE) # Parse it so that you have something you can navigate through | |
tables <- xpathApply(data,"//*[@class='datatable']") # Grab all the data tables from that page | |
years <- lapply(tables,function(x)xmlValue(x[["thead"]][["tr"]][["th"]])) # Grab all the years from that page | |
leaders.list <- list() | |
for(t in 1:length(tables)) leaders.list[[t]] <- unlist(lapply(tables[[t]][["thead"]][["tr"]]["th"][-1],xmlValue)) # Find each leader in each table | |
approval.data.list <- list() | |
for(t in 1:length(tables)) { ## Loop over extracted tables to actually get the data | |
tmp.approval.data <- readHTMLTable(tables[[t]],header=T) | |
# Clean it all up a bit | |
names(tmp.approval.data)[which(1:dim(tmp.approval.data)[2]%%2==0)] <- paste(leaders.list[[t]],"Sat",sep=" ") | |
names(tmp.approval.data)[which(1:dim(tmp.approval.data)[2]%%2==1)[-1]] <- paste(leaders.list[[t]],"Dis",sep=" ") | |
names(tmp.approval.data)[1] <- "Date" | |
tmp.approval.data <- melt(tmp.approval.data,id.vars="Date") | |
tmp.approval.data$Year <- years[[t]] | |
tmp.approval.data$Sat <- "Sat" | |
tmp.approval.data$Sat[grep("Dis",tmp.approval.data$variable)] <- "Dis" | |
tmp.approval.data$variable <- gsub(" Sat","",tmp.approval.data$variable) | |
tmp.approval.data$variable <- gsub(" Dis","",tmp.approval.data$variable) | |
approval.data.list[[t]] <- tmp.approval.data | |
} | |
# Bind everything together | |
approval.data <- do.call("rbind", approval.data.list) | |
approval.data <- approval.data[!is.na(approval.data$value),] | |
approval.data <- approval.data[approval.data$value!="",] | |
approval.data <- approval.data[approval.data$value!="-",] | |
approval.data <- approval.data[!nchar(approval.data$value)>2,] | |
approval.data$Date <- gsub("[[:digit:]]","",approval.data$Date) | |
approval.data$Date <- gsub("-","",approval.data$Date) | |
approval.data$Date <- trim(approval.data$Date) | |
approval.data$Date <- unlist(lapply(strsplit(approval.data$Date," "),function(x)x[1])) | |
approval.data <- approval.data[,c(2,1,4,3,5)] | |
names(approval.data) <- c("Leader","Month","Year","Rating","Satisfied") | |
out.list[[i]] <- approval.data | |
print(i) | |
} | |
## Bind everything together | |
out <- do.call("rbind", out.list) | |
out$Leader[out$Leader==" "] <- "Campbell" | |
out$Leader <- trim(out$Leader) | |
## Create party var | |
tories <- c("Thatcher","Major","Hague","Duncan Smith","Howard","Cameron") | |
labour <- c("Callaghan","Foot","Kinnock","Smith","Blair","Brown","Miliband") | |
libdem <- c("Steel","Ashdown","Kennedy","Campbell","Clegg") | |
sdp <- c("Jenkins","Owen","MacLennan") | |
ukip <- "Farage" | |
gov <- "Gov't" | |
out$Party <- "Conservatives" | |
out$Party[out$Leader%in%labour] <- "Labour" | |
out$Party[out$Leader%in%libdem] <- "Liberal Democrats" | |
out$Party[out$Leader%in%sdp] <- "SDP" | |
out$Party[out$Leader%in%ukip] <- "UKIP" | |
out$Party[out$Leader%in%gov] <- "Government" | |
## Clean up some other variables (date, rating) | |
out$Rating <- as.numeric(out$Rating) | |
out$chrDate <- paste(out$Month,out$Year) | |
out$monthNum <- match(out$Month,month.name) | |
out$chrDate <- paste("01",out$monthNum,out$Year,sep="/") | |
out$Date <- as.Date(out$chrDate,"%d/%m/%Y") | |
## Create DV: satisfied - dissatisfied | |
out.dt <- data.table(out[out$Satisfied=="Sat",],DisRating=out[out$Satisfied=="Dis",c("Rating")]) # (note switch to data.table from data.frame) | |
out.dt[,diffRating := Rating - DisRating] # Create diff var | |
out.dt[,meanDiffRating:=mean(diffRating),by=list(Party,Date)] # calculate mean diff var by party, by date | |
out.dt[,duplicates:=duplicated(Party),by=list(Date)] # drop duplicate observations | |
out.dt <- out.dt[duplicates==F] | |
out.dt$diffRating <- out.dt$meanDiffRating #replalce var | |
## Only keep Lab, Con, Lib Dem | |
out.dt <- out.dt[Party%in%c("Labour","Conservatives","Liberal Democrats")] | |
out.dt$Party <- as.factor(out.dt$Party) | |
################# | |
### PLOTTING ### | |
################# | |
party.colours <- c("#0087DC","#D50000","#FDBB30","#FFFF00","#008142","#99CC33","#70147A","#DDDDDD") | |
elections <- as.Date(c("01-05-1979","01-05-1983","01-05-1987","01-05-1992","01-05-1997","01-05-2001","01-05-2005","01-05-2010","01-05-2015"),"%d-%m-%Y") | |
plot(x=out.dt$Date, | |
y=out.dt$diffRating, | |
col=alpha(party.colours[as.numeric(out.dt$Party)],0.2), | |
pch=19,cex=0.5, | |
bty="n", | |
ylab="Net satisfaction", | |
xlab="Election", | |
xaxt="n", | |
main="Party leader net satisfaction") | |
for(p in 1:length(levels(out.dt$Party))) { # Loop over parties to plot loess line | |
out.tmp <- out.dt[out.dt$Party==levels(out.dt$Party)[p]] | |
lines(out.tmp$Date, predict(loess(out.tmp$diffRating ~ as.numeric(out.tmp$Date),span=0.1)),col=party.colours[1:3][p],lwd=4) | |
} | |
abline(v=elections,lty=2,col="gray") # Add gray lines for the elections | |
axis(1,at=elections,labels=format(elections,"%Y")) # Add an axis | |
legend("topright",c("Conservatives", "Labour","Liberal Democrats"), pch = 19, col=party.colours[1:3],box.col="white",bg="white") # Add a legend | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment