Skip to content

Instantly share code, notes, and snippets.

@joshbode
Last active December 17, 2015 01:08
Show Gist options
  • Select an option

  • Save joshbode/5525698 to your computer and use it in GitHub Desktop.

Select an option

Save joshbode/5525698 to your computer and use it in GitHub Desktop.
# What did Barbara get up to in Spain?
# TL;DR: 11 Gelato, 68 Drinks and 36 Tapas
library(XML)
library(ggplot2)
library(reshape2)
# parse page into XML DOM document
parse_doc = function (page) {
# get html text so that bad tags can be stripped
u = url(page)
html = paste(readLines(u, warn=FALSE), collapse='\n')
close(u)
# remove spurious tag with no declared namespace (b:*)
html = gsub('b:[^ ]+ ', '', html)
# remove script tags with unparsable CDATA
html = gsub('<script[^>]*>.*?</script>', '', html)
return(xmlRoot(htmlTreeParse(html, useInt=TRUE, asText=TRUE)))
}
# get the next page
get_next_page = function (doc) {
return(xpathSApply(doc, '//a[@class="blog-pager-newer-link"]/@href'))
}
# get consumption summary
get_consumption = function (doc) {
# find a line in the post-body starting with Gelati
# trim leading spaces
return(gsub('^ +', '',
xpathSApply(doc, '//div[@class="post-body entry-content"]/text()[contains(., "Gelati")]', xmlValue, trim=TRUE)
))
}
# parse consumption statistics
parse_consumption = function (consumption, synonyms=NULL) {
# regular expressions to identify:
# - keys (like Gelati:)
# - values (integers)
key_regex = '\\w+(?=: )' # word characters (one or more) followed by a colon and a space
value_regex = '\\d+' # integers of one or more digits
# use perl regexes to enable zero-width lookahead
key_matches = gregexpr(key_regex, consumption, perl=TRUE)
# process values as the inversion of the key matches
values = sapply(unlist(regmatches(consumption, key_matches, invert=TRUE))[-1], function(x) {
sum(as.integer(unlist(regmatches(x, gregexpr(value_regex, x)))))
})
# extract keys and rename synonyms
keys = unlist(regmatches(consumption, key_matches))
keys = ifelse(keys %in% names(synonyms), synonyms[keys], keys)
names(values) = keys
return(values)
}
# set up synonyms to map to standardised labels
synonyms = c('Beer'='Drink', 'Wine'='Drink')
# start page
page = 'http://barbara-foodwinepilgrimage.blogspot.com.au/2013/03/day-1-palma-de-mallorca-gelati-1-beer-2.html'
# loop until there are no more pages
l = list()
while (!is.null(page)) {
doc = parse_doc(page)
stats = parse_consumption(get_consumption(doc), synonyms)
l = append(l, list(stats))
page = get_next_page(doc)
}
# summarise the results
consumption = as.data.frame(do.call(rbind, l))
consumption$Day = 1:nrow(consumption)
d = melt(consumption, id='Day', variable.name='Type', value.name='Amount')
p = ggplot(data=d, aes(x=Day, y=Amount)) +
geom_bar(aes(colour=Type, fill=Type), stat='identity') +
facet_grid(Type ~ .)
plot(p)
p = ggplot(data=d, aes(x=Amount)) +
geom_bar(aes(colour=Type, fill=Type)) +
facet_grid(Type ~ .)
plot(p)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment