Skip to content

Instantly share code, notes, and snippets.

@explodecomputer
Created May 8, 2015 09:33
Show Gist options
  • Save explodecomputer/44ab5d3d2e93e6a240fd to your computer and use it in GitHub Desktop.
Save explodecomputer/44ab5d3d2e93e6a240fd to your computer and use it in GitHub Desktop.
Read pdf files
# Placeholder path: replace it with the directory that holds your pdf files
# before running. The script below searches the working directory ("./").
setwd("/I/DON\'T/KNOW/YOUR/COMPUTER")
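# Requires the pdftotext command-line tool to be available on the PATH.
# A Mac installer package is available here:
# http://mac.softpedia.com/get/Word-Processing/pdftotext-Installer-Package.shtml
# (pdftotext also ships with Xpdf and poppler-utils on other platforms)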
# Find pdf files in a directory
#
# Arguments:
#   wd  Directory to search. Defaults to the current working directory.
# Value:
#   Character vector with the names of the entries in `wd` that end in
#   ".pdf" (case-insensitive). Names are relative to `wd`, not full paths.
get_pdf_files <- function(wd = "./") {
  # The dot is escaped so that only a real ".pdf" extension matches; an
  # unescaped "." matches any character and would accept e.g. "notapdf".
  grep("\\.pdf$", dir(wd), ignore.case = TRUE, value = TRUE)
}
# Run the pdftotext programme on a pdf file
#
# Arguments:
#   pdffile  Path of the pdf file to convert.
#   eol      End-of-line convention handed to pdftotext's -eol option.
# Value:
#   A list with two elements:
#     error    exit status returned by pdftotext (0 means success)
#     txtfile  path of the text file: `pdffile` with its ".pdf" extension
#              replaced by ".txt"
convert_to_txt <- function(pdffile, eol = "mac") {
  # Swap the extension case-insensitively and with an escaped dot. A plain
  # ".pdf$" pattern leaves "X.PDF" unchanged, so the output path would equal
  # the input path and the conversion would overwrite the source pdf.
  txtfile <- sub("\\.pdf$", ".txt", pdffile, ignore.case = TRUE)
  if (identical(txtfile, pdffile)) {
    # No ".pdf" extension to swap: append one rather than clobber the input.
    txtfile <- paste0(pdffile, ".txt")
  }

  # Paths are quoted so that spaces and shell metacharacters in file names
  # survive. Quoting disables the shell's "~" expansion, so expand it here.
  pdf_arg <- shQuote(path.expand(pdffile))
  txt_path <- path.expand(txtfile)

  # run pdftotext and save the return flag - if not 0 then there was an error
  error <- system(paste(
    "pdftotext -enc UTF-8 -eol", shQuote(eol), pdf_arg, shQuote(txt_path)
  ))

  # Only post-process when pdftotext actually produced a text file.
  if (error == 0) {
    # get rid of control characters: keep tab, LF, CR and printable ASCII.
    # The octal escapes are passed through for tr itself to interpret.
    # A unique scratch file next to the output replaces the fixed name
    # "newfile", which could collide with or overwrite an existing file.
    scratch <- tempfile(pattern = "pdftotext_", tmpdir = dirname(txt_path))
    status <- system(paste(
      "tr -cd", shQuote("\\11\\12\\15\\40-\\176"),
      "<", shQuote(txt_path), ">", shQuote(scratch)
    ))
    if (status == 0) {
      file.rename(scratch, txt_path)
    } else {
      unlink(scratch)
      warning("could not strip control characters from ", txtfile,
              call. = FALSE)
    }
  }

  list(error = error, txtfile = txtfile)
}
# Find all pdfs
pdfs <- get_pdf_files()
# Read each txt file as a vector, each element being a line in the pdf stored as string
# Store each txt file as an element in a list
# The list is preallocated and named after the pdfs, so element i always
# belongs to pdfs[i]; a pdf that could not be converted keeps a NULL entry.
l <- vector("list", length(pdfs))
names(l) <- pdfs
# seq_along() gives an empty loop when no pdfs were found, whereas
# 1:length(pdfs) would iterate over c(1, 0).
for (i in seq_along(pdfs)) {
  out <- convert_to_txt(pdfs[i], eol = "unix")
  if (out$error == 0) {
    # If there was no error then try reading the file
    l[[i]] <- readLines(out$txtfile)
  } else {
    # Report the failure instead of dropping the file silently.
    warning("pdftotext failed for ", pdfs[i], " (exit status ", out$error, ")",
            call. = FALSE)
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment