joshbode · December 17, 2015 01:08
diff --git a/tapas.r b/tapas.r
 # What did Barbara get up to in Spain?
 #   TL;DR: 11 Gelato, 68 Drinks and 36 Tapas

 library(XML)
 library(ggplot2)
 library(reshape2)

 # parse page into XML DOM document
 parse_doc = function (page) {

  # get html text so that bad tags can be stripped
  u = url(page)
  html = paste(readLines(u, warn=FALSE), collapse='\n')
  close(u)

  # remove spurious tag with no declared namespace (b:*)
  html = gsub('b:[^ ]+ ', '', html)

  # remove script tags with unparsable CDATA
  html = gsub('<script[^>]*>.*?</script>', '', html)

  return(xmlRoot(htmlTreeParse(html, useInt=TRUE, asText=TRUE)))

 }

 # get the next page
 get_next_page = function (doc) {

  return(xpathSApply(doc, '//a[@class="blog-pager-newer-link"]/@href'))

 }

 # get consumption summary
 get_consumption = function (doc) {

  # find a line in the post-body starting with Gelati
  # trim leading spaces
  return(gsub('^ +', '',
    xpathSApply(doc, '//div[@class="post-body entry-content"]/text()[contains(., "Gelati")]', xmlValue, trim=TRUE)
  ))

 }

 # parse consumption statistics
 parse_consumption = function (consumption, synonyms=NULL) {

  # regular expressions to identify:
  #  - keys (like Gelati:)
  #  - values (integers)
  key_regex = '\\w+(?=: )'  # word characters (one or more) followed by a colon and a space
  value_regex = '\\d+'  # integers of one or more digits

  # use perl regexes to enable zero-width lookahead
  key_matches = gregexpr(key_regex, consumption, perl=TRUE)

  # process values as the inversion of the key matches
  values = sapply(unlist(regmatches(consumption, key_matches, invert=TRUE))[-1], function(x) {
    sum(as.integer(unlist(regmatches(x, gregexpr(value_regex, x)))))
  })

  # extract keys and rename synonyms
  keys = unlist(regmatches(consumption, key_matches))
  keys = ifelse(keys %in% names(synonyms), synonyms[keys], keys)

  names(values) = keys

  return(values)

 }

 # set up synonyms to map to standardised labels
 synonyms = c('Beer'='Drink', 'Wine'='Drink')

 # start page
 page = 'http://barbara-foodwinepilgrimage.blogspot.com.au/2013/03/day-1-palma-de-mallorca-gelati-1-beer-2.html'

 # loop until there are no more pages
 l = list()

 while (!is.null(page)) {

  doc = parse_doc(page)

  stats = parse_consumption(get_consumption(doc), synonyms)

  l = append(l, list(stats))

  page = get_next_page(doc)

 }

 # summarise the results
 consumption = as.data.frame(do.call(rbind, l))
 consumption$Day = 1:nrow(consumption)
 d = melt(consumption, id='Day', variable.name='Type', value.name='Amount')

 p = ggplot(data=d, aes(x=Day, y=Amount)) +
    geom_bar(aes(colour=Type, fill=Type), stat='identity') +
    facet_grid(Type ~ .)
 plot(p)

 p = ggplot(data=d, aes(x=Amount)) +
    geom_bar(aes(colour=Type, fill=Type)) +
    facet_grid(Type ~ .)
 plot(p)
	# What did Barbara get up to in Spain?
	# TL;DR: 11 Gelato, 68 Drinks and 36 Tapas

	library(XML)
	library(ggplot2)
	library(reshape2)

	# parse page into XML DOM document
	parse_doc = function (page) {

	# get html text so that bad tags can be stripped
	u = url(page)
	html = paste(readLines(u, warn=FALSE), collapse='\n')
	close(u)

	# remove spurious tag with no declared namespace (b:*)
	html = gsub('b:[^ ]+ ', '', html)

	# remove script tags with unparsable CDATA
	html = gsub('<script[^>]>.?</script>', '', html)

	return(xmlRoot(htmlTreeParse(html, useInt=TRUE, asText=TRUE)))

	}

	# get the next page
	get_next_page = function (doc) {

	return(xpathSApply(doc, '//a[@class="blog-pager-newer-link"]/@href'))

	}

	# get consumption summary
	get_consumption = function (doc) {

	# find a line in the post-body starting with Gelati
	# trim leading spaces
	return(gsub('^ +', '',
	xpathSApply(doc, '//div[@class="post-body entry-content"]/text()[contains(., "Gelati")]', xmlValue, trim=TRUE)
	))

	}

	# parse consumption statistics
	parse_consumption = function (consumption, synonyms=NULL) {

	# regular expressions to identify:
	# - keys (like Gelati:)
	# - values (integers)
	key_regex = '\\w+(?=: )' # word characters (one or more) followed by a colon and a space
	value_regex = '\\d+' # integers of one or more digits

	# use perl regexes to enable zero-width lookahead
	key_matches = gregexpr(key_regex, consumption, perl=TRUE)

	# process values as the inversion of the key matches
	values = sapply(unlist(regmatches(consumption, key_matches, invert=TRUE))[-1], function(x) {
	sum(as.integer(unlist(regmatches(x, gregexpr(value_regex, x)))))
	})

	# extract keys and rename synonyms
	keys = unlist(regmatches(consumption, key_matches))
	keys = ifelse(keys %in% names(synonyms), synonyms[keys], keys)

	names(values) = keys

	return(values)

	}

	# set up synonyms to map to standardised labels
	synonyms = c('Beer'='Drink', 'Wine'='Drink')

	# start page
	page = 'http://barbara-foodwinepilgrimage.blogspot.com.au/2013/03/day-1-palma-de-mallorca-gelati-1-beer-2.html'

	# loop until there are no more pages
	l = list()

	while (!is.null(page)) {

	doc = parse_doc(page)

	stats = parse_consumption(get_consumption(doc), synonyms)

	l = append(l, list(stats))

	page = get_next_page(doc)

	}

	# summarise the results
	consumption = as.data.frame(do.call(rbind, l))
	consumption$Day = 1:nrow(consumption)
	d = melt(consumption, id='Day', variable.name='Type', value.name='Amount')

	p = ggplot(data=d, aes(x=Day, y=Amount)) +
	geom_bar(aes(colour=Type, fill=Type), stat='identity') +
	facet_grid(Type ~ .)
	plot(p)

	p = ggplot(data=d, aes(x=Amount)) +
	geom_bar(aes(colour=Type, fill=Type)) +
	facet_grid(Type ~ .)
	plot(p)
No results found