Created
February 22, 2016 21:42
-
-
Save davidski/16a1632c762a6e979d6e to your computer and use it in GitHub Desktop.
Sample extraction of data from a PDF into R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sample extraction of tabular data from a PDF into an R data frame.
#
# Steps: download the PDF once, convert it to text with the xpdf command-line
# tools (via tm::readPDF), turn the column-aligned text into CSV lines, and
# parse those lines into the data frame `out`.

library(tm)      # text manipulation (provides readPDF / content)
library(stringi) # better string replacement

# specify the source url and the destination location
uri <- "http://www.actiononsugar.org/News%20Centre/Surveys%20/2016/170862.pdf"
filename <- "sugar.pdf"

# be kind, download only once
if (!file.exists(filename)) {
  tryCatch(
    download.file(uri, filename, mode = "wb"),
    error = function(e) {
      # Remove any partial file; otherwise the existence check above would
      # skip the download on the next run and a truncated PDF would be parsed.
      unlink(filename)
      stop(e)
    }
  )
}

# depends on having the xpdf tools available in PATH
xpdf_tools <- Sys.which(c("pdfinfo", "pdftotext"))
xpdf_found <- file.exists(xpdf_tools)
if (!all(xpdf_found)) {
  # Fail fast: without this, `pdf` below would resolve to grDevices::pdf and
  # the script would die later with a misleading error.
  stop(
    "Required xpdf tool(s) not found in PATH: ",
    paste(names(xpdf_tools)[!xpdf_found], collapse = ", "),
    call. = FALSE
  )
}

# Pin the xpdf engine explicitly: the "-table" option is passed to pdftotext
# and the line indices used below rely on its line-per-row text output.
pdf_reader <- readPDF(engine = "xpdf", control = list(text = "-table"))
pdf <- pdf_reader(
  elem = list(uri = filename),
  language = "en",
  id = "id1"
)

# clean up the data and make it ready for CSV conversion
# Lines 13:274 are the table rows in this specific PDF's text output.
# Existing commas are replaced first so they cannot be mistaken for delimiters.
dat <- stri_replace_all_fixed(content(pdf)[13:274], ",", "-")
# Runs of two or more whitespace characters separate the table columns.
dat <- stri_replace_all_regex(dat, "[:space:]{2,}", ",")
dat <- dat[-168] # headers are repeated in the PDF content at this position

# read.csv(text = ...) manages its own text connection, so nothing is leaked.
out <- read.csv(text = dat, header = FALSE, stringsAsFactors = FALSE)

# NOTE(review): "teaspooons" looks like a typo for "teaspoons"; kept as-is
# because downstream code may reference this column name.
colnames(out) <- c("cafe", "drink", "sugars", "teaspooons")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment