davidski · February 22, 2016 21:42
diff --git a/sugar.R b/sugar.R
 library(tm)      # text manipulation
 library(stringi) # better string replacement

 # Source URL of the survey PDF and the local path it is cached at.
 uri <- "http://www.actiononsugar.org/News%20Centre/Surveys%20/2016/170862.pdf"
 filename <- "sugar.pdf"

 # Be kind: download only once.
 # The file is fetched into a temporary location first and copied into place
 # only after a successful transfer, so an interrupted download cannot leave a
 # truncated `filename` behind that would be mistaken for a valid cached copy.
 if (!file.exists(filename)) {
   tmp <- tempfile(fileext = ".pdf")
   # download.file() returns 0 on success and non-zero on failure.
   status <- download.file(uri, tmp, mode = "wb")
   if (status != 0 || !file.copy(tmp, filename)) {
     unlink(tmp)
     stop("could not download ", uri, " to ", filename, call. = FALSE)
   }
   unlink(tmp)
 }

 # Read the PDF text via tm's xpdf engine; this depends on having the xpdf
 # command-line tools (pdfinfo, pdftotext) available in PATH.
 # Fail right here with a clear message when a tool is missing, instead of
 # silently skipping the read and failing later with "object 'pdf' not found".
 xpdf_tools <- Sys.which(c("pdfinfo", "pdftotext"))
 missing_tools <- names(xpdf_tools)[!file.exists(xpdf_tools)]
 if (length(missing_tools) > 0) {
   stop("required xpdf tool(s) not found in PATH: ",
        paste(missing_tools, collapse = ", "), call. = FALSE)
 }

 # The engine is named explicitly because the tool check above is for xpdf;
 # "-table" is passed through to pdftotext as its text-extraction option.
 pdf <- readPDF(engine = "xpdf", control = list(text = "-table"))(
   elem = list(uri = filename),
   language = "en",
   id = "id1"
 )

 # Clean up the extracted text and make it ready for CSV conversion.
 # Only elements 13 to 274 of the PDF content are used; literal commas are
 # replaced with "-" first so they cannot be confused with the field separator
 # introduced on the next line.
 dat <- stri_replace_all_fixed(content(pdf)[13:274], ",", "-")
 # Runs of two or more whitespace characters become the CSV field separator.
 dat <- stri_replace_all_regex(dat, "[:space:]{2,}", ",")
 dat <- dat[-168] # headers are repeated in the PDF content at this position

 # read.csv(text = ) opens and closes its own text connection, so no
 # connection is left open (textConnection(dat) was never closed).
 out <- read.csv(text = dat, header = FALSE, stringsAsFactors = FALSE)
 # NOTE(review): "teaspooons" looks like a typo for "teaspoons"; the name is
 # kept as-is so any existing reference to this column keeps working.
 colnames(out) <- c("cafe", "drink", "sugars", "teaspooons")
	library(tm) # text manipulation
	library(stringi) # better string replacement

	# Source URL of the survey PDF and the local path it is cached at.
	uri <- "http://www.actiononsugar.org/News%20Centre/Surveys%20/2016/170862.pdf"
	filename <- "sugar.pdf"

	# Be kind: download only once.
	# The file is fetched into a temporary location first and copied into place
	# only after a successful transfer, so an interrupted download cannot leave a
	# truncated `filename` behind that would be mistaken for a valid cached copy.
	if (!file.exists(filename)) {
	  tmp <- tempfile(fileext = ".pdf")
	  # download.file() returns 0 on success and non-zero on failure.
	  status <- download.file(uri, tmp, mode = "wb")
	  if (status != 0 || !file.copy(tmp, filename)) {
	    unlink(tmp)
	    stop("could not download ", uri, " to ", filename, call. = FALSE)
	  }
	  unlink(tmp)
	}

	# Read the PDF text via tm's xpdf engine; this depends on having the xpdf
	# command-line tools (pdfinfo, pdftotext) available in PATH.
	# Fail right here with a clear message when a tool is missing, instead of
	# silently skipping the read and failing later with "object 'pdf' not found".
	xpdf_tools <- Sys.which(c("pdfinfo", "pdftotext"))
	missing_tools <- names(xpdf_tools)[!file.exists(xpdf_tools)]
	if (length(missing_tools) > 0) {
	  stop("required xpdf tool(s) not found in PATH: ",
	       paste(missing_tools, collapse = ", "), call. = FALSE)
	}

	# The engine is named explicitly because the tool check above is for xpdf;
	# "-table" is passed through to pdftotext as its text-extraction option.
	pdf <- readPDF(engine = "xpdf", control = list(text = "-table"))(
	  elem = list(uri = filename),
	  language = "en",
	  id = "id1"
	)

	# Clean up the extracted text and make it ready for CSV conversion.
	# Only elements 13 to 274 of the PDF content are used; literal commas are
	# replaced with "-" first so they cannot be confused with the field separator
	# introduced on the next line.
	dat <- stri_replace_all_fixed(content(pdf)[13:274], ",", "-")
	# Runs of two or more whitespace characters become the CSV field separator.
	dat <- stri_replace_all_regex(dat, "[:space:]{2,}", ",")
	dat <- dat[-168] # headers are repeated in the PDF content at this position

	# read.csv(text = ) opens and closes its own text connection, so no
	# connection is left open (textConnection(dat) was never closed).
	out <- read.csv(text = dat, header = FALSE, stringsAsFactors = FALSE)
	# NOTE(review): "teaspooons" looks like a typo for "teaspoons"; the name is
	# kept as-is so any existing reference to this column keeps working.
	colnames(out) <- c("cafe", "drink", "sugars", "teaspooons")