benmarwick · July 18, 2022 03:48 · edselpogi · Mar 2, 2017 · koka0901 · Apr 11, 2017
diff --git a/PDF-2-text-or-CSV.r b/PDF-2-text-or-CSV.r
 # Here are a few methods for getting text from PDF files. Do read through
 # the instructions carefully! Note that this code is written for Windows 7;
 # slight adjustments may be needed for other OSs.

 # Tell R what folder contains your 1000s of PDFs
 dest <- "G:/somehere/with/many/PDFs"

 # Make a vector of PDF file names. `pattern` is a regular expression, so it
 # is anchored to the file extension: a bare "pdf" would also match names such
 # as "mypdfnotes.docx" or "report.pdf.txt". ignore.case also picks up ".PDF".
 myfiles <- list.files(
   path = dest,
   pattern = "\\.pdf$",
   full.names = TRUE,
   ignore.case = TRUE
 )

 # now there are a few options...

 ############### PDF (image of text format) to TXT ##########
 # This is for when your PDF is an image of text. This is the case
 # if you open the PDF in a PDF viewer and you cannot select
 # words or lines with your cursor.

                     ##### Wait! #####
 # Before proceeding, make sure you have a copy of Tesseract
 # on your computer! Details & download:
 # https://code.google.com/p/tesseract-ocr/
 # and a copy of ImageMagick: http://www.imagemagick.org/
 # and a copy of pdftoppm on your computer! 
 # Download: http://www.foolabs.com/xpdf/download.html
 # And then after installing those three, restart to 
 # ensure R can find them on your path. 
 # And note that this process can be quite slow...

 # PDF filenames can't have spaces in them for these operations
 # so let's get rid of the spaces in the filenames. Only the file name is
 # changed, never the folder part of the path.
 nospace <- file.path(dirname(myfiles), gsub(" ", "", basename(myfiles)))
 has_space <- grepl(" ", basename(myfiles), fixed = TRUE)

 # never rename onto a file that already exists, or onto the same target
 # twice, because that could silently replace another PDF
 clash <- has_space & (file.exists(nospace) | duplicated(nospace))
 do_rename <- has_space & !clash

 # file.rename() is vectorised and returns TRUE/FALSE for each file
 renamed <- file.rename(from = myfiles[do_rename], to = nospace[do_rename])

 # report anything that still has a space in its name, because the OCR
 # commands below will not work for those files
 failed <- c(myfiles[clash], myfiles[do_rename][!renamed])
 if (length(failed) > 0) {
   warning(
     "Could not remove spaces from: ", paste(failed, collapse = ", "),
     call. = FALSE
   )
 }

 # get the PDF file names without spaces; the pattern is a regular
 # expression, so it is anchored to the .pdf extension
 myfiles <- list.files(
   path = dest,
   pattern = "\\.pdf$",
   full.names = TRUE,
   ignore.case = TRUE
 )

 # Now we can do the OCR to the renamed PDF files. Don't worry
 # if you get messages like 'Config Error: No display
 # font for...' it's nothing to worry about

 lapply(myfiles, function(i){
   # convert pdf to ppm (an image format), just pages 1-10 of the PDF
   # but you can change that easily, just remove or edit the
   # -f 1 -l 10 bit in the line below. The page images are named after
   # the 'ocrbook' root given at the end of the command.
   shell(shQuote(paste0("pdftoppm ", i, " -f 1 -l 10 -r 600 ocrbook")))
   # convert only this run's ppm pages to tif ready for tesseract
   shell(shQuote(paste0("convert ocrbook*.ppm ", i, ".tif")))
   # delete the ppm pages straight away, otherwise pages left over from
   # this PDF would be merged into the tif of the next, shorter PDF
   file.remove(list.files(pattern = "^ocrbook.*\\.ppm$"))
   # convert tif to text file
   shell(shQuote(paste0("tesseract ", i, ".tif ", i, " -l eng")))
   # delete tif file
   file.remove(paste0(i, ".tif" ))
 })


 # where are the txt files you just made?
 dest # in this folder

 # And now you're ready to do some text mining on the text files

 ############### PDF (text format) to TXT ###################

                  ##### Wait! #####
 # Before proceeding, make sure you have a copy of pdftotext
 # on your computer! Details: https://en.wikipedia.org/wiki/Pdftotext
 # Download: http://www.foolabs.com/xpdf/download.html

 # If you have a PDF with text, ie you can open the PDF in a
 # PDF viewer and select text with your cursor, then use these
 # lines to convert each PDF file that is named in the vector
 # into a text file, created in the same directory as the PDFs.
 # Note that your pdftotext.exe may be in a different location to mine
 pdftotext_exe <- '"C:/Program Files/xpdf/bin64/pdftotext.exe"'

 # Each conversion is run to completion before the next one starts (the
 # default wait = TRUE). With wait = FALSE one background process is started
 # per PDF, and the sections below could read the txt files before they
 # have been written.
 lapply(myfiles, function(i) {
   system(paste(pdftotext_exe, paste0('"', i, '"')))
 })

 # where are the txt files you just made?
 dest # in this folder

 # And now you're ready to do some text mining on the text files

 ############### PDF to CSV (DfR format) ####################

 # or if you want DFR-style csv files...
 # read txt files into R. The pattern is a regular expression, so it is
 # anchored to the extension; a bare "txt" would also match the
 # "<name>.txt.csv" files written at the end of this section.
 mytxtfiles <- list.files(path = dest, pattern = "\\.txt$", full.names = TRUE)

 library(tm)
 mycorpus <- Corpus(DirSource(dest, pattern = "\\.txt$"))
 # warnings may appear after you run the previous line, they
 # can be ignored
 mycorpus <- tm_map(mycorpus,  removeNumbers)
 mycorpus <- tm_map(mycorpus,  removePunctuation)
 mycorpus <- tm_map(mycorpus,  stripWhitespace)
 mydtm <- DocumentTermMatrix(mycorpus)
 # remove some OCR weirdness:
 # drop terms that contain the same character 3 or more times in a row
 mydtm <- mydtm[,!grepl("(.)\\1{2,}", mydtm$dimnames$Terms)]

 # get words; these are the columns of the matrix and are the same for
 # every document, so look them up once
 words <- mydtm$dimnames$Terms

 # get each doc as a csv with words and counts
 # (seq_len() so that nothing runs when the matrix has no documents)
 for(i in seq_len(nrow(mydtm))){
   # get word counts for document i (row i, not always row 1)
   counts <- as.vector(as.matrix(mydtm[i,]))
   # combine into data frame
   df <- data.frame(word = words, count = counts, stringsAsFactors = FALSE)
   # exclude words with count of zero
   df <- df[df$count != 0,]
   # write to CSV with original txt filename; the path is relative, so the
   # file is created relative to R's current working directory
   write.csv(df, paste0(mydtm$dimnames$Docs[i],".csv"), row.names = FALSE)
 }

 # and now you're ready to work with the csv files

 ############### PDF to TXT (all text between two words) ####

 ## Below is about splitting the text files at certain characters
 ## can be skipped...

 # if you just want the abstracts, we can use regex to extract that part of
 # each txt file. Assumes that the abstract is always between the words
 # 'Abstract' and 'Introduction'

 abstracts <- lapply(mytxtfiles, function(i) {
   # read the file as whitespace-separated words and join them into one string
   j <- paste0(scan(i, what = character()), collapse = " ")
   # lazily match everything after 'Abstract' up to the next 'Introduction'
   regmatches(j, gregexpr("(?<=Abstract).*?(?=Introduction)", j, perl=TRUE))
 })
 # Write abstracts into separate txt files...

 # write abstracts as txt files named <txt file>.abstract.txt
 # (or use them in the list for whatever you want to do next)
 # seq_along() so that nothing is written when there are no txt files;
 # 1:length() would run for i = 1 and i = 0 on an empty list
 lapply(seq_along(abstracts), function(i) {
   write.table(
     abstracts[i],
     file = paste(mytxtfiles[i], "abstract", "txt", sep="."),
     quote = FALSE,
     row.names = FALSE,
     col.names = FALSE,
     eol = " "
   )
 })

 # And now you're ready to do some text mining on the txt 

 # originally on http://stackoverflow.com/a/21449040/1036500
	# Here are a few methods for getting text from PDF files. Do read through
	# the instructions carefully! Note that this code is written for Windows 7;
	# slight adjustments may be needed for other OSs.

	# Tell R what folder contains your 1000s of PDFs
	dest <- "G:/somehere/with/many/PDFs"

	# Make a vector of PDF file names. `pattern` is a regular expression, so it
	# is anchored to the file extension: a bare "pdf" would also match names such
	# as "mypdfnotes.docx" or "report.pdf.txt". ignore.case also picks up ".PDF".
	myfiles <- list.files(
	  path = dest,
	  pattern = "\\.pdf$",
	  full.names = TRUE,
	  ignore.case = TRUE
	)

	# now there are a few options...

	############### PDF (image of text format) to TXT ##########
	# This is for when your PDF is an image of text. This is the case
	# if you open the PDF in a PDF viewer and you cannot select
	# words or lines with your cursor.

	##### Wait! #####
	# Before proceeding, make sure you have a copy of Tesseract
	# on your computer! Details & download:
	# https://code.google.com/p/tesseract-ocr/
	# and a copy of ImageMagick: http://www.imagemagick.org/
	# and a copy of pdftoppm on your computer!
	# Download: http://www.foolabs.com/xpdf/download.html
	# And then after installing those three, restart to
	# ensure R can find them on your path.
	# And note that this process can be quite slow...

	# PDF filenames can't have spaces in them for these operations
	# so let's get rid of the spaces in the filenames. Only the file name is
	# changed, never the folder part of the path.
	nospace <- file.path(dirname(myfiles), gsub(" ", "", basename(myfiles)))
	has_space <- grepl(" ", basename(myfiles), fixed = TRUE)

	# never rename onto a file that already exists, or onto the same target
	# twice, because that could silently replace another PDF
	clash <- has_space & (file.exists(nospace) | duplicated(nospace))
	do_rename <- has_space & !clash

	# file.rename() is vectorised and returns TRUE/FALSE for each file
	renamed <- file.rename(from = myfiles[do_rename], to = nospace[do_rename])

	# report anything that still has a space in its name, because the OCR
	# commands below will not work for those files
	failed <- c(myfiles[clash], myfiles[do_rename][!renamed])
	if (length(failed) > 0) {
	  warning(
	    "Could not remove spaces from: ", paste(failed, collapse = ", "),
	    call. = FALSE
	  )
	}

	# get the PDF file names without spaces; the pattern is a regular
	# expression, so it is anchored to the .pdf extension
	myfiles <- list.files(
	  path = dest,
	  pattern = "\\.pdf$",
	  full.names = TRUE,
	  ignore.case = TRUE
	)

	# Now we can do the OCR to the renamed PDF files. Don't worry
	# if you get messages like 'Config Error: No display
	# font for...' it's nothing to worry about

	lapply(myfiles, function(i){
	  # convert pdf to ppm (an image format), just pages 1-10 of the PDF
	  # but you can change that easily, just remove or edit the
	  # -f 1 -l 10 bit in the line below. The page images are named after
	  # the 'ocrbook' root given at the end of the command.
	  shell(shQuote(paste0("pdftoppm ", i, " -f 1 -l 10 -r 600 ocrbook")))
	  # convert only this run's ppm pages to tif ready for tesseract
	  shell(shQuote(paste0("convert ocrbook*.ppm ", i, ".tif")))
	  # delete the ppm pages straight away, otherwise pages left over from
	  # this PDF would be merged into the tif of the next, shorter PDF
	  file.remove(list.files(pattern = "^ocrbook.*\\.ppm$"))
	  # convert tif to text file
	  shell(shQuote(paste0("tesseract ", i, ".tif ", i, " -l eng")))
	  # delete tif file
	  file.remove(paste0(i, ".tif" ))
	})


	# where are the txt files you just made?
	dest # in this folder

	# And now you're ready to do some text mining on the text files

	############### PDF (text format) to TXT ###################

	##### Wait! #####
	# Before proceeding, make sure you have a copy of pdftotext
	# on your computer! Details: https://en.wikipedia.org/wiki/Pdftotext
	# Download: http://www.foolabs.com/xpdf/download.html

	# If you have a PDF with text, ie you can open the PDF in a
	# PDF viewer and select text with your cursor, then use these
	# lines to convert each PDF file that is named in the vector
	# into a text file, created in the same directory as the PDFs.
	# Note that your pdftotext.exe may be in a different location to mine
	pdftotext_exe <- '"C:/Program Files/xpdf/bin64/pdftotext.exe"'

	# Each conversion is run to completion before the next one starts (the
	# default wait = TRUE). With wait = FALSE one background process is started
	# per PDF, and the sections below could read the txt files before they
	# have been written.
	lapply(myfiles, function(i) {
	  system(paste(pdftotext_exe, paste0('"', i, '"')))
	})

	# where are the txt files you just made?
	dest # in this folder

	# And now you're ready to do some text mining on the text files

	############### PDF to CSV (DfR format) ####################

	# or if you want DFR-style csv files...
	# read txt files into R. The pattern is a regular expression, so it is
	# anchored to the extension; a bare "txt" would also match the
	# "<name>.txt.csv" files written at the end of this section.
	mytxtfiles <- list.files(path = dest, pattern = "\\.txt$", full.names = TRUE)

	library(tm)
	mycorpus <- Corpus(DirSource(dest, pattern = "\\.txt$"))
	# warnings may appear after you run the previous line, they
	# can be ignored
	mycorpus <- tm_map(mycorpus, removeNumbers)
	mycorpus <- tm_map(mycorpus, removePunctuation)
	mycorpus <- tm_map(mycorpus, stripWhitespace)
	mydtm <- DocumentTermMatrix(mycorpus)
	# remove some OCR weirdness:
	# drop terms that contain the same character 3 or more times in a row
	mydtm <- mydtm[,!grepl("(.)\\1{2,}", mydtm$dimnames$Terms)]

	# get words; these are the columns of the matrix and are the same for
	# every document, so look them up once
	words <- mydtm$dimnames$Terms

	# get each doc as a csv with words and counts
	# (seq_len() so that nothing runs when the matrix has no documents)
	for(i in seq_len(nrow(mydtm))){
	  # get word counts for document i (row i, not always row 1)
	  counts <- as.vector(as.matrix(mydtm[i,]))
	  # combine into data frame
	  df <- data.frame(word = words, count = counts, stringsAsFactors = FALSE)
	  # exclude words with count of zero
	  df <- df[df$count != 0,]
	  # write to CSV with original txt filename; the path is relative, so the
	  # file is created relative to R's current working directory
	  write.csv(df, paste0(mydtm$dimnames$Docs[i],".csv"), row.names = FALSE)
	}

	# and now you're ready to work with the csv files

	############### PDF to TXT (all text between two words) ####

	## Below is about splitting the text files at certain characters
	## can be skipped...

	# if you just want the abstracts, we can use regex to extract that part of
	# each txt file. Assumes that the abstract is always between the words
	# 'Abstract' and 'Introduction'

	abstracts <- lapply(mytxtfiles, function(i) {
	  # read the file as whitespace-separated words and join them into one string
	  j <- paste0(scan(i, what = character()), collapse = " ")
	  # lazily match everything after 'Abstract' up to the next 'Introduction'
	  regmatches(j, gregexpr("(?<=Abstract).*?(?=Introduction)", j, perl=TRUE))
	})
	# Write abstracts into separate txt files...

	# write abstracts as txt files named <txt file>.abstract.txt
	# (or use them in the list for whatever you want to do next)
	# seq_along() so that nothing is written when there are no txt files;
	# 1:length() would run for i = 1 and i = 0 on an empty list
	lapply(seq_along(abstracts), function(i) {
	  write.table(
	    abstracts[i],
	    file = paste(mytxtfiles[i], "abstract", "txt", sep="."),
	    quote = FALSE,
	    row.names = FALSE,
	    col.names = FALSE,
	    eol = " "
	  )
	})

	# And now you're ready to do some text mining on the txt

	# originally on http://stackoverflow.com/a/21449040/1036500
No results found