Last active
April 23, 2018 10:54
-
-
Save knbknb/e589b1d480d1c94f34c223db0b530b1c to your computer and use it in GitHub Desktop.
Call coreNLP v3.9.1 from R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Optional JVM options (disabled); uncomment to set JAVA_OPTIONS for this session.
#Sys.setenv(JAVA_OPTIONS = "-Xmx8g -Xms1G -XX:+UseConcMarkSweepGC -XX:-UseGCOverheadLimit")
library(tidyverse)
library(coreNLP)
# from: https://stackoverflow.com/questions/48455079/extract-city-names-from-large-text-with-r
# using CoreNLP to extract tokens
# knb 20180422
# Directory that holds the unpacked CoreNLP distribution.
outputloc <- "/opt/smallapps/corenlp/"

# Release selector: 2018 picks the 2018-02-27 distribution, any other value
# picks the 2015-12-09 distribution with its 2016-01-10 English models jar.
nlpver <- 2018
if (nlpver != 2018) {
  ver <- "stanford-corenlp-full-2015-12-09"
  mod <- "stanford-english-corenlp-2016-01-10-models.jar"
} else {
  ver <- "stanford-corenlp-full-2018-02-27"
  mod <- "stanford-english-corenlp-2018-02-27-models.jar"
}

# Both releases use the same annotator list, so the properties file is written
# once here instead of in each branch.
# alternative annotator list:
#   'annotators = tokenize,ssplit,pos,lemma,ner,parse,dcoref, regexner\n'
writeLines(
  "annotators = tokenize, ssplit, pos, lemma, ner, regexner\n",
  "corenlp.properties"
)
# Resolve the CoreNLP install directory once. CORENLP_HOME is used when it is
# set; otherwise fall back to `outputloc`. The same path is then used for the
# existence checks and for initCoreNLP(), so the directory that is validated
# is the directory that is loaded.
corenlp_home <- Sys.getenv("CORENLP_HOME")
if (!nzchar(corenlp_home)) {
  corenlp_home <- outputloc
}
lib_loc <- file.path(corenlp_home, ver)

if (!dir.exists(lib_loc)) {
  # downloadCoreNLP(outputloc, type = c("base")) # downloads ~ 385 MB, installed 581 MB
  stop(
    "download the base package from https://nlp.stanford.edu/software/",
    call. = FALSE
  )
}
if (!file.exists(file.path(lib_loc, mod))) {
  # downloadCoreNLP(outputloc, type = c("english")) # ~ 946 MB
  stop(
    "download a language model from https://nlp.stanford.edu/software/",
    call. = FALSE
  )
}
# languages: "chinese", "english", "french", "german", "spanish"

# The annotators to use are set through the properties file.
# If this doesn't work, re-install rJava.
# `finally` removes the properties file even when initCoreNLP() errors.
tryCatch(
  initCoreNLP(
    libLoc = lib_loc,
    type = "english",
    parameterFile = "corenlp.properties",
    mem = "8g"
  ),
  finally = unlink("corenlp.properties") # clean up
)
example(getSentiment) # if this generates output, coreNLP has been loaded
#sessionInfo()
# Example input: one free-text affiliation per row in column `string`.
# Built with tibble() because data_frame() is deprecated.
places <- tibble(
  string = c(
    "Ucsd Medical Center, San Diego, California, USA",
    "Yale Cancer Center, New Haven, Connecticut, USA",
    "Massachusetts General Hospital., Boston, Massachusetts, USA",
    "Dana Farber Cancer Institute, Boston, Massachusetts, USA",
    "Washington University, Saint Louis, Missouri, USA",
    "Mount SInai Medical Center, New York, New York, USA",
    "Memorial Sloan Kettering Cancer Center, New York, New York, USA",
    "Carolinas Healthcare System, Charlotte, North Carolina, USA",
    "University Hospitals Case Medical Center; Seidman Cancer Center, Cleveland, Ohio, USA",
    "Vanderbilt University Medical Center, Nashville, Tennessee, USA",
    "Seattle Cancer Care Alliance, Seattle, Washington, USA",
    "National Cancer Center, Gyeonggi-do, Korea, Republic of",
    "Seoul National University Hospital, Seoul, Korea, Republic of",
    "Severance Hospital, Yonsei University Health System, Seoul, Korea, Republic of",
    "Korea University Guro Hospital, Seoul, Korea, Republic of",
    "Asan Medical Center., Seoul, Korea, Republic of",
    "VU MEDISCH CENTRUM; Dept. of Medical Oncology"
  )
)
# Annotate every place string with coreNLP::annotateString() and extract a
# city name per row. Columns added to `places`:
#   annotations - result of annotateString() for each string
#   tokens      - the `token` element of each annotation, grouped by a
#                 run-length id of NER so that consecutive tokens with the
#                 same NER value share one token_id
#   city        - tokens of the first CITY run pasted with spaces, or NA when
#                 the string has no CITY token
places_ner <- places %>%
  mutate(
    # alternative:
    # annotations = map(string, annotateString, format = "text", outputFile = NA),
    annotations = map(string, annotateString),
    tokens = map(annotations, "token"),
    # run-length type id
    tokens = map(tokens, group_by, token_id = data.table::rleid(NER)),
    city = map(tokens, filter, NER == "CITY"),
    city = map(city, summarise, city = paste(token, collapse = " ")),
    # summarise() always returns a tibble, so test for rows rather than class;
    # map_chr() needs exactly one string, hence only the first run is kept.
    city = map_chr(city, function(x) {
      if (nrow(x) > 0L) x$city[[1L]] else NA_character_
    })
  )
# Print the first annotation (S3 print method).
print(places_ner$annotations[1])
# Very verbose output: the first annotation flattened into a single vector.
places_ner$annotations[1] %>% unlist()
# Final result: the detected city next to the original string.
places_ner %>%
  select(city, string)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Run from within a new R Session.
Last line of code
places_ner %>% select(city, string)
Prints this table (note that not all cities are resolved correctly by the NLP parser/pipeline):