-
-
Save benmarwick/11333467 to your computer and use it in GitHub Desktop.
# Here are a few methods for getting text from PDF files. Do read through
# the instructions carefully! Note that this code is written for Windows 7;
# slight adjustments may be needed for other OSs.
# Tell R what folder contains your 1000s of PDFs
dest <- "G:/somehere/with/many/PDFs"
# Make a vector of the full paths of the PDF files. The pattern is a
# regular expression, so it is anchored to the file extension: only
# names ending in ".pdf" (in any letter case) are picked up, not every
# file that merely has "pdf" somewhere in its name (such as the
# "x.pdf.txt" files that the OCR step further down creates).
myfiles <- list.files(
  path = dest,
  pattern = "\\.pdf$",
  full.names = TRUE,
  ignore.case = TRUE
)
# now there are a few options... | |
############### PDF (image of text format) to TXT ########## | |
# This is for when your PDF is an image of text. This is the case
# if you open the PDF in a PDF viewer and you cannot select
# words or lines with your cursor.
##### Wait! ##### | |
# Before proceeding, make sure you have a copy of Tesseract | |
# on your computer! Details & download: | |
# https://code.google.com/p/tesseract-ocr/ | |
# and a copy of ImageMagick: http://www.imagemagick.org/ | |
# and a copy of pdftoppm on your computer! | |
# Download: http://www.foolabs.com/xpdf/download.html | |
# And then after installing those three, restart to | |
# ensure R can find them on your path. | |
# And note that this process can be quite slow... | |
# PDF filenames can't have spaces in them for these operations,
# so let's get rid of the spaces in the filenames. Only files whose
# name actually contains a space are touched, and a file is never
# renamed on top of an existing file (or of another renamed file):
# those are skipped with a warning instead of being overwritten.
spaced <- myfiles[grepl(" ", basename(myfiles), fixed = TRUE)]
if (length(spaced) > 0) {
  unspaced <- file.path(
    dirname(spaced),
    gsub(" ", "", basename(spaced), fixed = TRUE)
  )
  clash <- file.exists(unspaced) | duplicated(unspaced)
  renamed <- rep(FALSE, length(spaced))
  if (any(!clash)) {
    renamed[!clash] <- file.rename(from = spaced[!clash], to = unspaced[!clash])
  }
  if (!all(renamed)) {
    warning(
      "These files still have spaces in their names: ",
      paste(spaced[!renamed], collapse = ", "),
      call. = FALSE
    )
  }
}
# get the PDF file names without spaces (the pattern is anchored so
# that only names ending in ".pdf" are matched)
myfiles <- list.files(
  path = dest,
  pattern = "\\.pdf$",
  full.names = TRUE,
  ignore.case = TRUE
)
# Now we can do the OCR on the renamed PDF files. Don't worry
# if you get messages like 'Config Error: No display
# font for...', it's nothing to worry about.
# For every PDF "x.pdf" this leaves a text file "x.pdf.txt" next to it.
# The value for each PDF is the result of deleting its temporary tif.
lapply(myfiles, function(i) {
  # The page images get their own, fresh root name in R's temporary
  # folder, so pages of one PDF can never end up in the tif of another
  # PDF, and no stray ppm files are left in the working directory.
  ppm_root <- tempfile(pattern = "ocrbook")
  on.exit(unlink(paste0(ppm_root, "-*")), add = TRUE)
  tif_file <- paste0(i, ".tif")

  # convert pdf to ppm (an image format), just pages 1-10 of the PDF
  # but you can change that easily, just remove or edit the
  # -f 1 -l 10 bit in the line below. xpdf wants its options BEFORE
  # the file names; the pages are written as <ppm_root>-000001.ppm etc.
  # Each path is quoted on its own so the command itself stays intact.
  shell(paste("pdftoppm -f 1 -l 10 -r 600", shQuote(i), shQuote(ppm_root)))

  # convert ppm to tif ready for tesseract; ImageMagick expands the
  # wildcard itself and only sees the pages of the current PDF.
  # NOTE(review): on Windows `convert` may resolve to the system disk
  # utility instead of ImageMagick (error "Invalid Parameter"); check
  # your PATH, or use `magick` with ImageMagick 7.
  shell(paste("convert", shQuote(paste0(ppm_root, "-*.ppm")), shQuote(tif_file)))

  # convert tif to text file (tesseract appends ".txt" to the output name)
  shell(paste("tesseract", shQuote(tif_file), shQuote(i), "-l eng"))

  # delete tif file
  file.remove(tif_file)
})
# where are the txt files you just made? | |
dest # in this folder | |
# And now you're ready to do some text mining on the text files | |
############### PDF (text format) to TXT ################### | |
##### Wait! ##### | |
# Before proceeding, make sure you have a copy of pdftotext
# on your computer! Details: https://en.wikipedia.org/wiki/Pdftotext
# Download: http://www.foolabs.com/xpdf/download.html
# If you have a PDF with text, i.e. you can open the PDF in a
# PDF viewer and select text with your cursor, then use these
# lines to convert each PDF file that is named in the vector
# into a text file, which is created in the same directory as the PDFs.
# Note that my pdftotext.exe is probably in a different location to yours.
# Where is pdftotext? The location below is used if it exists,
# otherwise whatever pdftotext is found on the PATH.
pdftotext_exe <- "C:/Program Files/xpdf/bin64/pdftotext.exe"
if (!file.exists(pdftotext_exe)) {
  pdftotext_exe <- unname(Sys.which("pdftotext"))
}
if (!nzchar(pdftotext_exe)) {
  stop("Cannot find pdftotext, please edit 'pdftotext_exe'", call. = FALSE)
}
# Convert the PDFs one at a time and wait for each conversion to end,
# so that all the txt files exist once this line has finished (and we
# don't start 1000s of pdftotext processes at the same moment).
# The result is the exit status of pdftotext for each PDF, 0 = success.
pdftotext_status <- vapply(
  myfiles,
  function(i) as.integer(system2(pdftotext_exe, args = shQuote(i))),
  integer(1)
)
if (any(pdftotext_status != 0L)) {
  warning(
    "pdftotext failed for: ",
    paste(myfiles[pdftotext_status != 0L], collapse = ", "),
    call. = FALSE
  )
}
# where are the txt files you just made? | |
dest # in this folder | |
# And now you're ready to do some text mining on the text files | |
############### PDF to CSV (DfR format) ####################
# or if you want DFR-style csv files...
# List the txt files and read them into R as a corpus. The pattern is a
# regular expression anchored to the extension, so only files whose
# name ends in ".txt" are used (not any name that contains "txt").
mytxtfiles <- list.files(path = dest, pattern = "\\.txt$", full.names = TRUE)
library(tm)
mycorpus <- Corpus(DirSource(dest, pattern = "\\.txt$"))
# warnings may appear after you run the previous line, they
# can be ignored
# Clean up the text: drop numbers and punctuation, then collapse
# runs of whitespace
mycorpus <- tm_map(mycorpus, removeNumbers)
mycorpus <- tm_map(mycorpus, removePunctuation)
mycorpus <- tm_map(mycorpus, stripWhitespace)
# One row per document, one column per term
mydtm <- DocumentTermMatrix(mycorpus)
# remove some OCR weirdness:
# drop terms that contain the same character three or more times in a row
mydtm <- mydtm[, !grepl("(.)\\1{2,}", mydtm$dimnames$Terms)]
# the words are the same for every document, so get them only once
words <- mydtm$dimnames$Terms
# get each doc as a csv with words and counts; seq_len() makes the
# loop do nothing at all when there are no documents
for (i in seq_len(nrow(mydtm))) {
  # get word counts of document i (row i, not always the first row)
  counts <- as.vector(as.matrix(mydtm[i, ]))
  # combine into data frame
  df <- data.frame(word = words, count = counts, stringsAsFactors = FALSE)
  # exclude words with count of zero
  df <- df[df$count != 0, ]
  # write to CSV with original txt filename; the document names carry
  # no folder here, so the csv files go to the current working directory
  write.csv(df, paste0(mydtm$dimnames$Docs[i], ".csv"), row.names = FALSE)
}
# and now you're ready to work with the csv files | |
############### PDF to TXT (all text between two words) #### | |
## Below is about splitting the text files at certain characters | |
## can be skipped... | |
# If you just want the abstracts, we can use regex to extract that part of
# each txt file. Assumes that the abstract is always between the words
# 'Abstract' and 'Introduction'.
# The result is one list element per txt file, holding every (shortest)
# stretch of text found between those two words.
abstracts <- lapply(mytxtfiles, function(i) {
  # Read the file as whitespace-separated words and glue them back
  # together with single spaces. quote = "" makes scan() treat ' and "
  # as ordinary characters; by default it would read them as string
  # delimiters and garble any text containing apostrophes.
  j <- paste0(
    scan(i, what = character(), quote = "", quiet = TRUE),
    collapse = " "
  )
  regmatches(j, gregexpr("(?<=Abstract).*?(?=Introduction)", j, perl = TRUE))
})
# Write abstracts into separate txt files, one per input file, named
# <original txt file>.abstract.txt
# (or use them in the list for whatever you want to do next).
# seq_along() means nothing is written when there are no abstracts;
# 1:length() would have tried elements 1 and 0 of an empty list.
invisible(lapply(seq_along(abstracts), function(i) {
  write.table(
    abstracts[i],
    file = paste(mytxtfiles[i], "abstract", "txt", sep = "."),
    quote = FALSE,
    row.names = FALSE,
    col.names = FALSE,
    eol = " "
  )
}))
# And now you're ready to do some text mining on the txt | |
# originally on http://stackoverflow.com/a/21449040/1036500 |
Hi. The code worked great. I used the "PDF (text format) to TXT".
Issue: I am not able to see the newly created txt files in my directory (dest) or anywhere in the folders.
this doesn't work. Naming it OCRBOOK makes the file name OCRBOOK-000001.ppm and then when ImageMagick needs to do its thing, it looks for the original file name.ppm which obviously doesn't exist. How to fix?
Also, the xPDF related syntax appears to be wrong, the options need to go right after the exe is called.
shell(shQuote(paste0("pdftopng -f 1 -l 10 -r 600 -gray ", i, " whatever")))
Hi. The code worked great. I used the "PDF (text format) to TXT".
Issue: I am not able to see the newly created txt files in my directory (dest) or anywhere in the folders.
Dear koka0901;
I also have the same problem. Have you solved the issue ?
I am getting the following error when trying to convert from pdf to txt, Does anyone know why? Thanks in advance..
Syntax Error (4021433): Unknown filter 'DCTDecode'
Syntax Error (3525089): Unknown filter 'DCTDecode'
Syntax Error (2849935): Unknown filter 'DCTDecode'
Syntax Error (2187540): Unknown filter 'DCTDecode'
Syntax Error (1489650): Unknown filter 'DCTDecode'
Invalid Parameter - C:
'tesseract' is not recognized as an internal or external command,
operable program or batch file.
Hi. Thank you for your code. I am new to R and I am getting an error
'pdftoppmC:' is not recognized as an internal or external command,
operable program or batch file.
Invalid Parameter - C:
'tesseractC:' is not recognized as an internal or external command,
Hope you could help. Thanks!