Created
August 9, 2013 14:57
-
-
Save rpietro/6194309 to your computer and use it in GitHub Desktop.
Collection of web-scraping scripts from multiple sources. Each source is acknowledged at the top of its section.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# source: http://goo.gl/bgiOgy
# Download the R-help mailing-list index page for January 2009 and parse it
# into an HTML tree.

# Install the RCurl package if necessary
# install.packages("RCurl", dependencies = TRUE)
library("RCurl")
# Install the XML package if necessary
# install.packages("XML", dependencies = TRUE)
library("XML")

# Get first quarter archives (only the January index is fetched here).
# The original call passed ssl.verifypeer = FALSE, which turns off TLS
# certificate verification. Verification is left enabled here.
# NOTE(review): if an installation has no CA bundle, supply one through the
# `cainfo` curl option rather than disabling verification.
jan09 <- getURL("https://stat.ethz.ch/pipermail/r-help/2009-January/date.html")

# asText = TRUE states explicitly that `jan09` holds the page content,
# not a file name.
jan09_parsed <- htmlTreeParse(jan09, asText = TRUE)
jan09_parsed
#------------------------------------------------------------------------------
# source: http://goo.gl/5zM2xu
# Download the Met Office station file for Armagh, reshape the whitespace-
# separated text into a numeric data frame, and plot the December minimum
# temperature by year.
#
# The original section began with rm(list = ls()); that call is dropped so
# running this section no longer wipes objects created elsewhere.

# web url
site <- "http://www.metoffice.gov.uk/climate/uk/stationdata/armaghdata.txt"

# Read the file, retrying a bounded number of times. The original looped
# forever whenever the download kept failing.
max_tries <- 5
for (attempt in seq_len(max_tries)) {
  aa <- try(read.table(site, sep = "\t"))
  if (!inherits(aa, "try-error")) {
    break
  }
}
if (inherits(aa, "try-error")) {
  stop("Could not read ", site, " after ", max_tries, " attempts",
       call. = FALSE)
}

# Drop the first five rows. With a single-column table this also collapses
# `aa` to a plain vector holding one element per remaining line.
aa <- aa[6:nrow(aa), ]

# Each line is one string of space-padded fields: split on spaces, discard the
# empty pieces, and keep the first seven fields (shorter lines are padded with
# NA). All rows are bound at once instead of growing a matrix inside a loop.
fields <- strsplit(as.character(aa), " ")
dd <- do.call(rbind, lapply(fields, function(tok) tok[nchar(tok) > 0][1:7]))
row.names(dd) <- dd[, 1]

# Rows 1 and 2 of `dd` together form the header: the first two names come from
# row 1 alone, the other five are "<row 1 name> <row 2 entry>".
colnm <- c(dd[1, 1:2], paste(dd[1, 3:7], dd[2, 1:5], sep = " "))
colnames(dd) <- colnm

# data.frame() makes the column names syntactic (spaces become dots), which is
# why the columns are addressed as e.g. `tmin.degC` below.
armagh <- data.frame(dd[-c(1, 2), ])

# Convert every column to numeric; entries that are not plain numbers become NA.
armagh[] <- lapply(armagh, function(col) as.numeric(as.character(col)))

is_december <- armagh[, 2] == 12
decmin <- armagh[is_december, 4]
year <- armagh[is_december, 1]

# X1 = December minimum temperature, X2 = year (names kept as in the original).
wh1 <- data.frame(
  X1 = armagh$tmin.degC[armagh$mm == 12],
  X2 = armagh$yyyy[armagh$mm == 12]
)
wh1 <- na.omit(wh1)

# nice plot
library(ggplot2)
# ggtitle() replaces opts(title = ...), which no longer exists in ggplot2.
ggplot(wh1, aes(X2, X1)) +
  geom_line(colour = "red") +
  theme_bw() +
  scale_x_continuous("Year") +
  scale_y_continuous("Minimum Temperature - Degree Celsius") +
  ggtitle("December Average Daily Minimum Temperature - Armagh 1865-2011")
# In the script above, I call in these data, tidy them up, and then draw a graph with ggplot2.
#------------------------------------------------------------------------------
# source http://goo.gl/K8zrd
# Read every HTML table on the page and keep the one with the most rows.
library(XML)
theurl <- "http://en.wikipedia.org/wiki/Brazil_national_football_team"
tables <- readHTMLTable(theurl)

# Row count per table, aligned one-to-one with `tables`. NULL entries count as
# 0 rows. The original unlist(lapply(...)) dropped NULL entries, so the
# position returned by which.max() could point at the wrong table.
n.rows <- vapply(
  tables,
  function(tbl) if (is.null(tbl)) 0L else nrow(tbl),
  integer(1)
)
brazilfootball <- tables[[which.max(n.rows)]]
head(brazilfootball)
# NIH glossary page: keep the 10th table and the table with the most rows.
theurl <- "http://grants.nih.gov/grants/glossary.htm"
tables <- readHTMLTable(theurl)

# Row count per table, aligned one-to-one with `tables` (NULL entries count as
# 0 rows so that which.max() indexes the right element).
n.rows <- vapply(
  tables,
  function(tbl) if (is.null(tbl)) 0L else nrow(tbl),
  integer(1)
)
nihglossaryA <- tables[[10]]
# Takes the largest, which is C
nihglossaryC <- tables[[which.max(n.rows)]]
# Here are several tables on one page
theurl <- "http://elections.nytimes.com/2010/results/senate/big-board"
tables <- readHTMLTable(theurl)

# This combines all but the first table
Elections <- do.call(rbind, tables[-1])

# Keep the first column as is; in every other column strip all non-digit
# characters and convert to numeric. Columns are replaced one at a time so the
# result stays a data frame with the original column names even when there is
# only a single row (sapply() would collapse that case to a vector).
cleanElections <- Elections
for (j in seq_along(cleanElections)[-1]) {
  cleanElections[[j]] <- as.numeric(gsub("[^0-9]", "", Elections[[j]]))
}
rownames(cleanElections) <- seq_len(nrow(cleanElections))
# Crime statistics page: clean the third table into a numeric data frame.
tables <- readHTMLTable("http://www.disastercenter.com/crime/iacrime.htm")
## the 3rd element is what we want
x <- tables[[3]]
## names are spread over the first 2 rows: paste them together column by column
nms <- vapply(
  x[1:2, ],
  function(col) paste(col, collapse = ""),
  character(1),
  USE.NAMES = FALSE
)
## remove the first 2 rows because they are not data
x <- x[-(1:2), ]
## assign the names to data
names(x) <- nms
## then remove any characters which are not numbers (i.e. 0-9)
numeric_cols <- lapply(x, function(xx) as.numeric(gsub("[^0-9]", "", xx)))
## build the numeric matrix explicitly so a one-row table still yields a
## matrix (sapply() would return a plain vector in that case)
x <- matrix(
  as.numeric(unlist(numeric_cols, use.names = FALSE)),
  nrow = nrow(x),
  ncol = length(numeric_cols),
  dimnames = list(NULL, names(numeric_cols))
)
## x is a matrix, so put into dataframe
crimetable <- data.frame(x)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment