Skip to content

Instantly share code, notes, and snippets.

@rpietro
Created August 9, 2013 14:57
Show Gist options
  • Save rpietro/6194309 to your computer and use it in GitHub Desktop.
Save rpietro/6194309 to your computer and use it in GitHub Desktop.
Collection of web scraping scripts from multiple sources. Sources are acknowledged at the top of each section.
# source: http://goo.gl/bgiOgy
# Both packages must be installed before they can be loaded; uncomment the
# matching line if necessary.
#install.packages("RCurl", dependencies = TRUE)
#install.packages("XML", dependencies = TRUE)
library(RCurl)
library(XML)

# Download the January 2009 R-help archive page (date.html) as one string.
# Only this single month is fetched here.
# NOTE(review): ssl.verifypeer = FALSE switches off TLS certificate
# verification for this request; it is kept exactly as in the original script,
# but confirm it is still required before reusing this code.
jan09 <- getURL(
  url = "https://stat.ethz.ch/pipermail/r-help/2009-January/date.html",
  ssl.verifypeer = FALSE
)

# Parse the downloaded HTML text into a tree and display the result.
jan09_parsed <- htmlTreeParse(file = jan09)
jan09_parsed
#------------------------------------------------------------------------------
# source: http://goo.gl/5zM2xu
# Download the Armagh station record from the UK Met Office, tidy the
# space-separated text into a numeric data frame (`armagh`) and plot the
# December minimum temperature against the year.
#
# NOTE: the original script began with rm(list = ls()), which silently deletes
# every object in the user's workspace. That call has been removed so this
# section no longer destroys the results of anything run before it.
library(ggplot2)

# web url
site <- "http://www.metoffice.gov.uk/climate/uk/stationdata/armaghdata.txt"

# Read the file with sep = "\t" so that (assuming the file contains no tabs)
# every line arrives as a single string in the first column. The download is
# retried a bounded number of times: the original retried forever, which hangs
# the session when the URL is permanently unreachable.
max_attempts <- 5
aa <- NULL
for (attempt in seq_len(max_attempts)) {
  aa <- try(read.table(site, sep = "\t"), silent = TRUE)
  if (!inherits(aa, "try-error")) {
    break
  }
  if (attempt < max_attempts) {
    Sys.sleep(1)
  }
}
if (inherits(aa, "try-error")) {
  stop(
    "Could not read ", site, " after ", max_attempts, " attempts: ",
    conditionMessage(attr(aa, "condition")),
    call. = FALSE
  )
}

# Discard the first five rows (treated as preamble by the original script) and
# keep the remaining lines as plain character strings.
aa <- as.character(aa[-seq_len(5), 1])

# Split every line on single spaces, drop the empty strings produced by runs of
# spaces and keep exactly seven fields: short lines are padded with NA, any
# extra trailing fields are dropped.
split_fields <- function(line) {
  fields <- unlist(strsplit(line, " "))
  fields[nchar(fields) > 0][1:7]
}
dd <- do.call(rbind, lapply(aa, split_fields))

# The original copied the first column (the year) into the matrix row names.
# Those values repeat and may be NA, while data frames need unique, non-missing
# row names, so the row names are left automatic here.

# Column names are assembled from the two header rows: the first two columns
# use row 1 only, the remaining five are "<row 1 entry> <row 2 entry>".
colnm <- c(dd[1, 1:2], paste(dd[1, 3:7], dd[2, 1:5], sep = " "))
colnames(dd) <- colnm

# Drop the two header rows; drop = FALSE keeps a matrix even if only one data
# row is left. data.frame() turns the names into syntactic ones
# (e.g. "tmin degC" becomes "tmin.degC").
armagh <- data.frame(dd[-c(1, 2), , drop = FALSE])

# Convert every column to numeric. Entries that are not plain numbers become
# NA (R emits its usual coercion warning when that happens).
armagh[] <- lapply(armagh, function(column) as.numeric(as.character(column)))

# December values selected by column position (column 2 = month,
# column 4 = minimum temperature, column 1 = year).
decmin <- armagh[armagh[, 2] == 12, 4]
year <- armagh[armagh[, 2] == 12, 1]

# Same selection by column name, as a two-column data frame for plotting:
# X1 = December minimum temperature, X2 = year. Incomplete rows are removed.
is_december <- armagh$mm == 12
wh1 <- data.frame(
  X1 = armagh$tmin.degC[is_december],
  X2 = armagh$yyyy[is_december]
)
wh1 <- na.omit(wh1)

# nice plot
# ggtitle() replaces the former opts(title = ...), which current ggplot2
# releases no longer provide.
ggplot(wh1, aes(X2, X1)) +
  geom_line(colour = "red") +
  theme_bw() +
  scale_x_continuous("Year") +
  scale_y_continuous("Minimum Temperature - Degree Celsius") +
  ggtitle("December Average Daily Minimum Temperature - Armagh 1865-2011")
# In the script above, I read in these data, tidy them up and then draw a pretty graph with ggplot2.
#------------------------------------------------------------------------------
# source http://goo.gl/K8zrd
# Scrape HTML tables from three pages with XML::readHTMLTable() and keep the
# table(s) of interest from each.
library(XML)

# --- Brazil national football team (Wikipedia): keep the largest table ------
theurl <- "http://en.wikipedia.org/wiki/Brazil_national_football_team"
tables <- readHTMLTable(theurl)
# Row count per table. The list returned by readHTMLTable() may contain NULL
# entries; NROW(NULL) is 0, so `n.rows` always has exactly one entry per element
# of `tables`. The earlier unlist(lapply(..., dim)) form dropped NULL entries,
# which shifted the positions and could make which.max() point at the wrong
# table.
n.rows <- vapply(tables, NROW, integer(1))
brazilfootball <- tables[[which.max(n.rows)]]
head(brazilfootball)

# --- NIH grants glossary: one table per letter ------------------------------
# Author's note: the largest table on this page is the one for the letter C.
theurl <- "http://grants.nih.gov/grants/glossary.htm"
tables <- readHTMLTable(theurl)
n.rows <- vapply(tables, NROW, integer(1))
# NOTE(review): position 10 is hard-coded for the letter A table; confirm it
# still matches the page layout.
nihglossaryA <- tables[[10]]
nihglossaryC <- tables[[which.max(n.rows)]]

# --- NYT 2010 senate results: several tables on one page --------------------
theurl <- "http://elections.nytimes.com/2010/results/senate/big-board"
tables <- readHTMLTable(theurl)
n.rows <- vapply(tables, NROW, integer(1))
# This combines all but the first table into a single data frame.
Elections <- do.call(rbind, tables[-1])
# Keep the first column as is; in every other column strip all characters that
# are not digits and convert what is left to numeric. lapply() keeps one result
# per column regardless of the number of rows (sapply() collapses to a plain
# vector when there is a single row).
cleanElections <- Elections
cleanElections[-1] <- lapply(
  Elections[-1],
  function(column) as.numeric(gsub("[^0-9]", "", column))
)
# Replace the row names produced by rbind() with a plain 1..n sequence;
# seq_len() is also safe when there are no rows.
rownames(cleanElections) <- seq_len(nrow(cleanElections))
# Scrape the crime statistics page and turn its third table into an
# all-numeric data frame (`crimetable`).
tables <- readHTMLTable("http://www.disastercenter.com/crime/iacrime.htm")
## the 3rd element is what we want
x <- tables[[3]]
## names are in the first 2 rows: paste the two cells of each column together.
## vapply() walks the columns directly instead of coercing the data frame to a
## matrix via apply(); drop = FALSE keeps a data frame even with one column.
nms <- vapply(
  x[1:2, , drop = FALSE],
  function(column) paste(column, collapse = ""),
  character(1),
  USE.NAMES = FALSE
)
## remove the first 2 rows because they are not data
x <- x[-(1:2), , drop = FALSE]
## assign the names to data
names(x) <- nms
## then remove any characters which are not numbers (i.e. 0-9) and convert to
## numeric; cbind() over the per-column results always yields a matrix with one
## column per variable, whatever the number of rows (sapply() does not for 0 or
## 1 rows)
x <- do.call(
  cbind,
  lapply(x, function(column) as.numeric(gsub("[^0-9]", "", column)))
)
## x is a matrix, so put into dataframe
crimetable <- data.frame(x)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment