jmclawson · July 12, 2016 18:53
diff --git a/nounsplitter.R b/nounsplitter.R
 library(NLP) 
 library(openNLP)

 # Credit due to http://stackoverflow.com/questions/30995232/how-to-use-opennlp-to-get-pos-tags-in-r
 #
 # For every file in "data.dir": tag each word with its part of speech, keep the
 # words whose tag matches "pos.pattern", and write them (space-separated, on a
 # single line) to "saveas.file" under the original file name prefixed with "n".

 ## SET "data.dir" to a directory containing text files. It's not designed for
 ## nested directories: subdirectories are skipped, not descended into.
 ## Keep the trailing slash; paths are built by plain concatenation.
 data.dir <- "data/texts/"

 ## SET "saveas.file" to the destination directory. Resulting files have an "n"
 ## prepended to the file name, but it may be useful to save to a separate
 ## directory. Keep the trailing slash here as well.
 saveas.file <- "data/texts-n/"

 ## SET "pos.pattern" to a regular expression that is matched against each
 ## part-of-speech tag. "NN$" selects tagged common nouns (NN); change it for
 ## other part-of-speech tags. This is the only place that needs changing.
 pos.pattern <- "NN$"

 file.list <- list.files(path = data.dir) # get all the filenames in the path

 # Make sure the destination exists before any result is written; an already
 # existing directory is silently left alone.
 dir.create(saveas.file, showWarnings = FALSE, recursive = TRUE)

 # Build the annotators once: they do not depend on the file being processed,
 # so constructing them inside the loop would repeat that setup for every file.
 sentence.annotator <- Maxent_Sent_Token_Annotator()
 word.annotator <- Maxent_Word_Token_Annotator()
 pos.annotator <- Maxent_POS_Tag_Annotator()

 for (text in file.list) { # run the loop for every file in the folder
   in.path <- paste0(data.dir, text)
   if (dir.exists(in.path)) {
     # list.files() also returns subdirectories, which readLines() cannot read.
     next
   }
   save.text <- paste0(saveas.file, "n", text)

   # as.String() joins the lines into one string that can be indexed by span.
   txt <- as.String(readLines(in.path))
   if (!grepl("[^[:space:]]", txt)) {
     # Nothing but whitespace: there is no sentence to annotate (the annotation
     # pipeline is expected to stop with an error in that case), so emit the
     # same empty line a file without any matching word produces.
     writeLines("", save.text)
     next
   }

   wordAnnotation <- annotate(txt, list(sentence.annotator, word.annotator))
   POSAnnotation <- annotate(txt, pos.annotator, wordAnnotation)
   POSwords <- subset(POSAnnotation, type == "word")

   # vapply() rather than sapply(): the result is a character vector even when
   # there are no word tokens.
   tags <- vapply(POSwords$features, `[[`, character(1), "POS")

   # Use the matching tokens as they appear in the text. Appending "/TAG" and
   # stripping it again would mangle tokens that themselves contain "/NN" and
   # would have to be kept in sync with "pos.pattern" by hand.
   matched.words <- txt[POSwords][grep(pos.pattern, tags)]

   # One line per output file: the matched words separated by single spaces.
   writeLines(paste(matched.words, collapse = " "), save.text)
 }
	library(NLP)
	library(openNLP)

	# Credit due to http://stackoverflow.com/questions/30995232/how-to-use-opennlp-to-get-pos-tags-in-r
	#
	# For every file in "data.dir": tag each word with its part of speech, keep the
	# words whose tag matches "pos.pattern", and write them (space-separated, on a
	# single line) to "saveas.file" under the original file name prefixed with "n".

	## SET "data.dir" to a directory containing text files. It's not designed for
	## nested directories: subdirectories are skipped, not descended into.
	## Keep the trailing slash; paths are built by plain concatenation.
	data.dir <- "data/texts/"

	## SET "saveas.file" to the destination directory. Resulting files have an "n"
	## prepended to the file name, but it may be useful to save to a separate
	## directory. Keep the trailing slash here as well.
	saveas.file <- "data/texts-n/"

	## SET "pos.pattern" to a regular expression that is matched against each
	## part-of-speech tag. "NN$" selects tagged common nouns (NN); change it for
	## other part-of-speech tags. This is the only place that needs changing.
	pos.pattern <- "NN$"

	file.list <- list.files(path = data.dir) # get all the filenames in the path

	# Make sure the destination exists before any result is written; an already
	# existing directory is silently left alone.
	dir.create(saveas.file, showWarnings = FALSE, recursive = TRUE)

	# Build the annotators once: they do not depend on the file being processed,
	# so constructing them inside the loop would repeat that setup for every file.
	sentence.annotator <- Maxent_Sent_Token_Annotator()
	word.annotator <- Maxent_Word_Token_Annotator()
	pos.annotator <- Maxent_POS_Tag_Annotator()

	for (text in file.list) { # run the loop for every file in the folder
		in.path <- paste0(data.dir, text)
		if (dir.exists(in.path)) {
			# list.files() also returns subdirectories, which readLines() cannot read.
			next
		}
		save.text <- paste0(saveas.file, "n", text)

		# as.String() joins the lines into one string that can be indexed by span.
		txt <- as.String(readLines(in.path))
		if (!grepl("[^[:space:]]", txt)) {
			# Nothing but whitespace: there is no sentence to annotate (the annotation
			# pipeline is expected to stop with an error in that case), so emit the
			# same empty line a file without any matching word produces.
			writeLines("", save.text)
			next
		}

		wordAnnotation <- annotate(txt, list(sentence.annotator, word.annotator))
		POSAnnotation <- annotate(txt, pos.annotator, wordAnnotation)
		POSwords <- subset(POSAnnotation, type == "word")

		# vapply() rather than sapply(): the result is a character vector even when
		# there are no word tokens.
		tags <- vapply(POSwords$features, `[[`, character(1), "POS")

		# Use the matching tokens as they appear in the text. Appending "/TAG" and
		# stripping it again would mangle tokens that themselves contain "/NN" and
		# would have to be kept in sync with "pos.pattern" by hand.
		matched.words <- txt[POSwords][grep(pos.pattern, tags)]

		# One line per output file: the matched words separated by single spaces.
		writeLines(paste(matched.words, collapse = " "), save.text)
	}