dwbapst · November 15, 2017 10:26 · jpardofurness · Mar 12, 2024
diff --git a/count words in PDFs.R b/count words in PDFs.R

 library(pdftools)
 # Find the PDF files in the current working directory. `pattern` is a regular
 # expression, so it is anchored to the file extension (a bare "pdf" would also
 # match names such as "pdf_notes.txt"); case is ignored so ".PDF" files count.
 pdfs <- list.files(pattern = "\\.pdf$", full.names = TRUE, ignore.case = TRUE)

 # Count the words in one PDF file.
 #
 # pdf: path to a PDF file, handed to pdftools' pdf_text().
 # Returns a single integer: the number of runs of alphabetic characters in
 # the text extracted from the file (digits and punctuation are not words).
 readCleanCount <- function(pdf) {
   # pdf_text() output may hold several strings (one per page according to the
   # pdftools documentation); line breaks, whether "\r\n" or a bare "\n", are
   # replaced with spaces and everything is joined into a single string.
   pages <- pdf_text(pdf)
   txt <- paste(gsub("\r?\n", " ", pages), collapse = " ")
   # regex stolen from StackOverFlow, like 99% of all regex
   # https://stackoverflow.com/questions/8920145/count-the-number-of-words-in-a-string-in-r
   # gregexpr() reports -1 when nothing matches, so only positive match
   # positions are counted; an empty document therefore yields 0.
   word_starts <- gregexpr("[[:alpha:]]+", txt)[[1]]
   sum(word_starts > 0L)
 }


 # Tabulate the word count of every PDF found. A data frame keeps the counts
 # numeric (cbind() would coerce them to character next to the file names),
 # and vapply() still returns an integer vector when no PDFs were found.
 data.frame(
   pdf = pdfs,
   words = vapply(pdfs, readCleanCount, integer(1), USE.NAMES = FALSE),
   row.names = NULL,
   stringsAsFactors = FALSE
 )

	library(pdftools)
	# Find the PDF files in the current working directory. `pattern` is a regular
	# expression, so it is anchored to the file extension (a bare "pdf" would also
	# match names such as "pdf_notes.txt"); case is ignored so ".PDF" files count.
	pdfs <- list.files(pattern = "\\.pdf$", full.names = TRUE, ignore.case = TRUE)

	# Count the words in one PDF file.
	#
	# pdf: path to a PDF file, handed to pdftools' pdf_text().
	# Returns a single integer: the number of runs of alphabetic characters in
	# the text extracted from the file (digits and punctuation are not words).
	readCleanCount <- function(pdf) {
	  # pdf_text() output may hold several strings (one per page according to the
	  # pdftools documentation); line breaks, whether "\r\n" or a bare "\n", are
	  # replaced with spaces and everything is joined into a single string.
	  pages <- pdf_text(pdf)
	  txt <- paste(gsub("\r?\n", " ", pages), collapse = " ")
	  # regex stolen from StackOverFlow, like 99% of all regex
	  # https://stackoverflow.com/questions/8920145/count-the-number-of-words-in-a-string-in-r
	  # gregexpr() reports -1 when nothing matches, so only positive match
	  # positions are counted; an empty document therefore yields 0.
	  word_starts <- gregexpr("[[:alpha:]]+", txt)[[1]]
	  sum(word_starts > 0L)
	}


	# Tabulate the word count of every PDF found. A data frame keeps the counts
	# numeric (cbind() would coerce them to character next to the file names),
	# and vapply() still returns an integer vector when no PDFs were found.
	data.frame(
	  pdf = pdfs,
	  words = vapply(pdfs, readCleanCount, integer(1), USE.NAMES = FALSE),
	  row.names = NULL,
	  stringsAsFactors = FALSE
	)