Last active
December 17, 2015 01:08
-
-
Save joshbode/5525698 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # What did Barbara get up to in Spain? | |
| # TL;DR: 11 Gelato, 68 Drinks and 36 Tapas | |
| library(XML) | |
| library(ggplot2) | |
| library(reshape2) | |
| # parse page into XML DOM document | |
| parse_doc = function (page) { | |
| # get html text so that bad tags can be stripped | |
| u = url(page) | |
| html = paste(readLines(u, warn=FALSE), collapse='\n') | |
| close(u) | |
| # remove spurious tag with no declared namespace (b:*) | |
| html = gsub('b:[^ ]+ ', '', html) | |
| # remove script tags with unparsable CDATA | |
| html = gsub('<script[^>]*>.*?</script>', '', html) | |
| return(xmlRoot(htmlTreeParse(html, useInt=TRUE, asText=TRUE))) | |
| } | |
| # get the next page | |
| get_next_page = function (doc) { | |
| return(xpathSApply(doc, '//a[@class="blog-pager-newer-link"]/@href')) | |
| } | |
| # get consumption summary | |
| get_consumption = function (doc) { | |
| # find a line in the post-body starting with Gelati | |
| # trim leading spaces | |
| return(gsub('^ +', '', | |
| xpathSApply(doc, '//div[@class="post-body entry-content"]/text()[contains(., "Gelati")]', xmlValue, trim=TRUE) | |
| )) | |
| } | |
| # parse consumption statistics | |
| parse_consumption = function (consumption, synonyms=NULL) { | |
| # regular expressions to identify: | |
| # - keys (like Gelati:) | |
| # - values (integers) | |
| key_regex = '\\w+(?=: )' # word characters (one or more) followed by a colon and a space | |
| value_regex = '\\d+' # integers of one or more digits | |
| # use perl regexes to enable zero-width lookahead | |
| key_matches = gregexpr(key_regex, consumption, perl=TRUE) | |
| # process values as the inversion of the key matches | |
| values = sapply(unlist(regmatches(consumption, key_matches, invert=TRUE))[-1], function(x) { | |
| sum(as.integer(unlist(regmatches(x, gregexpr(value_regex, x))))) | |
| }) | |
| # extract keys and rename synonyms | |
| keys = unlist(regmatches(consumption, key_matches)) | |
| keys = ifelse(keys %in% names(synonyms), synonyms[keys], keys) | |
| names(values) = keys | |
| return(values) | |
| } | |
| # set up synonyms to map to standardised labels | |
| synonyms = c('Beer'='Drink', 'Wine'='Drink') | |
| # start page | |
| page = 'http://barbara-foodwinepilgrimage.blogspot.com.au/2013/03/day-1-palma-de-mallorca-gelati-1-beer-2.html' | |
| # loop until there are no more pages | |
| l = list() | |
| while (!is.null(page)) { | |
| doc = parse_doc(page) | |
| stats = parse_consumption(get_consumption(doc), synonyms) | |
| l = append(l, list(stats)) | |
| page = get_next_page(doc) | |
| } | |
| # summarise the results | |
| consumption = as.data.frame(do.call(rbind, l)) | |
| consumption$Day = 1:nrow(consumption) | |
| d = melt(consumption, id='Day', variable.name='Type', value.name='Amount') | |
| p = ggplot(data=d, aes(x=Day, y=Amount)) + | |
| geom_bar(aes(colour=Type, fill=Type), stat='identity') + | |
| facet_grid(Type ~ .) | |
| plot(p) | |
| p = ggplot(data=d, aes(x=Amount)) + | |
| geom_bar(aes(colour=Type, fill=Type)) + | |
| facet_grid(Type ~ .) | |
| plot(p) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment