Last active
July 12, 2016 18:53
-
-
Save jmclawson/f8a3fcca635b168079ad81514d3e45aa to your computer and use it in GitHub Desktop.
Strip out all but common nouns from each text file in a directory, using part-of-speech tagging
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(NLP)
library(openNLP)

# Reduce every text file in a directory to only the words carrying a chosen
# part-of-speech tag (common nouns by default), writing one output file per
# input file.
#
# Credit due to
# http://stackoverflow.com/questions/30995232/how-to-use-opennlp-to-get-pos-tags-in-r

## SET "data.dir" to a directory containing text files. It's not designed for
## nested directories. Keep the trailing slash: file names are appended
## directly to this string.
data.dir <- "data/texts/"

## SET "saveas.file" to the destination directory (trailing slash required).
## Resulting files have an "n" prepended to the file name, but it may be
## useful to save to a separate directory.
saveas.file <- "data/texts-n/"

## SET "pos.pattern" to a regular expression matched against each word's POS
## tag. "NN$" keeps tags ending in "NN" (tagged common nouns); change it for
## other part-of-speech tags.
pos.pattern <- "NN$"

# Create the destination up front so the per-file write cannot fail on a
# missing directory.
dir.create(saveas.file, showWarnings = FALSE, recursive = TRUE)

# Build the annotators once; constructing them inside the loop would repeat
# the same work for every file.
sent_annotator <- Maxent_Sent_Token_Annotator()
word_annotator <- Maxent_Word_Token_Annotator()
pos_annotator <- Maxent_POS_Tag_Annotator()

# Get all the file names in the path, leaving out sub-directories because
# readLines() cannot read them.
file.list <- list.files(path = data.dir)
file.list <- file.list[!dir.exists(paste0(data.dir, file.list))]

for (text in file.list) { # run the loop for every file in the folder
  raw_lines <- readLines(paste0(data.dir, text))
  save.text <- paste0(saveas.file, "n", text)

  # Nothing to tag in an empty or whitespace-only file: emit an empty result
  # rather than handing the annotators a blank string.
  if (!any(nzchar(trimws(raw_lines)))) {
    writeLines("", save.text)
    next
  }

  txt <- as.String(raw_lines)

  # Sentence and word boundaries first; the POS tagger is then run on top of
  # those annotations.
  token_annotation <- annotate(txt, list(sent_annotator, word_annotator))
  pos_annotation <- annotate(txt, pos_annotator, token_annotation)
  pos_words <- subset(pos_annotation, type == "word")

  # One tag per word token; vapply() guarantees a character vector even when
  # there are no word tokens.
  tags <- vapply(pos_words$features, `[[`, character(1), "POS")

  # Select the matching words directly instead of formatting "word/TAG" and
  # stripping the tag afterwards, which would corrupt any token that itself
  # contains "/NN".
  kept_words <- txt[pos_words][grepl(pos.pattern, tags)]

  writeLines(paste(kept_words, collapse = " "), save.text)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment