knbknb · April 23, 2018 10:54 · knbknb · Apr 22, 2018
diff --git a/coreNLP_test.R b/coreNLP_test.R
 #Sys.setenv(JAVA_OPTIONS = "-Xmx8g -Xms1G -XX:+UseConcMarkSweepGC -XX:-UseGCOverheadLimit")
 library(tidyverse)
 library(coreNLP)
 # from: https://stackoverflow.com/questions/48455079/extract-city-names-from-large-text-with-r
 # using CoreNLP to extract tokens
 # knb 20180422

 # Parent directory holding the unpacked CoreNLP releases, and the release
 # year to use. Any value other than 2018 selects the older 2015/2016 files.
 outputloc <- "/opt/smallapps/corenlp/"
 nlpver <- 2018

 # `ver` is the release directory name, `mod` the English models jar.
 if (nlpver == 2018) {
   ver <- "stanford-corenlp-full-2018-02-27"
   mod <- "stanford-english-corenlp-2018-02-27-models.jar"
 } else {
   ver <- "stanford-corenlp-full-2015-12-09"
   mod <- "stanford-english-corenlp-2016-01-10-models.jar"
 }

 # Both releases use the same annotator list, so the properties file is
 # written once, into the current working directory.
 # Longer alternative that was tried for the 2018 release:
 #writeLines('annotators = tokenize,ssplit,pos,lemma,ner,parse,dcoref, regexner\n', 'corenlp.properties')
 writeLines('annotators = tokenize, ssplit, pos, lemma, ner, regexner\n', 'corenlp.properties')

 # Stop early, before CoreNLP is initialised, when the release directory or
 # the English models jar is not where the script expects it. Both are large
 # manual downloads, so each message names the download page and the path
 # that was looked for.
 if (!dir.exists(file.path(outputloc, ver))) {
   #downloadCoreNLP(outputloc, type = c("base"))    # downloads ~ 385 MB, installed 581 MB
   stop("download the base package from https://nlp.stanford.edu/software/ ",
        "(expected directory: ", file.path(outputloc, ver), ")",
        call. = FALSE)
 }
 if (!file.exists(file.path(outputloc, ver, mod))) {
   #downloadCoreNLP(outputloc, type = c("english")) # ~ 946 MB
   stop("download a language model from https://nlp.stanford.edu/software/ ",
        "(expected file: ", file.path(outputloc, ver, mod), ")",
        call. = FALSE)
 }
 #"chinese", "english", "french", "german", "spanish"
 # Start the CoreNLP Java backend. The annotators to run are read from the
 # properties file written earlier in this script.
 # If this doesn't work, re-install rJava.
 initCoreNLP(
   # Use the directory whose existence was checked above. The previous
   # file.path(Sys.getenv('CORENLP_HOME'), ver) resolved to "/<ver>" whenever
   # CORENLP_HOME was unset, and could differ from the checked location.
   libLoc = file.path(outputloc, ver),
   type = "english",
   parameterFile = "corenlp.properties",
   mem = "8g"
 )
 unlink("corenlp.properties")    # clean up

 example(getSentiment) # if this generates output, coreNLP has been loaded
 #sessionInfo()

 # Sample affiliation strings, one per row, in a single character column
 # named `string`. Built with tibble() because data_frame() is deprecated.
 # The rows are listed explicitly, which gives the same 17 rows as splitting
 # one "|"-delimited literal with separate_rows().
 places <- tibble(
   string = c(
     "Ucsd Medical Center, San Diego, California, USA",
     "Yale Cancer Center, New Haven, Connecticut, USA",
     "Massachusetts General Hospital., Boston, Massachusetts, USA",
     "Dana Farber Cancer Institute, Boston, Massachusetts, USA",
     "Washington University, Saint Louis, Missouri, USA",
     "Mount SInai Medical Center, New York, New York, USA",
     "Memorial Sloan Kettering Cancer Center, New York, New York, USA",
     "Carolinas Healthcare System, Charlotte, North Carolina, USA",
     "University Hospitals Case Medical Center; Seidman Cancer Center, Cleveland, Ohio, USA",
     "Vanderbilt University Medical Center, Nashville, Tennessee, USA",
     "Seattle Cancer Care Alliance, Seattle, Washington, USA",
     "National Cancer Center, Gyeonggi-do, Korea, Republic of",
     "Seoul National University Hospital, Seoul, Korea, Republic of",
     "Severance Hospital, Yonsei University Health System, Seoul, Korea, Republic of",
     "Korea University Guro Hospital, Seoul, Korea, Republic of",
     "Asan Medical Center., Seoul, Korea, Republic of",
     "VU MEDISCH CENTRUM; Dept. of Medical Oncology"
   )
 )
 #coreNLP::annotateString()

 # Annotate every string with CoreNLP and pull out the first run of tokens
 # tagged CITY. Adds three columns to `places`:
 #   annotations - the object returned by annotateString() for each string
 #   tokens      - the annotation's `token` table, grouped by NER run
 #   city        - the city name as a character value, NA when none was tagged
 places_ner <- places %>%
   mutate(
     #annotations = map(string, annotateString, format="text", outputFile=NA),
     annotations = map(string, annotateString),
     tokens = map(annotations, "token"),
     # Run-length id over the NER column: consecutive tokens with the same tag
     # share an id, so a multi-word name is collapsed as one unit below.
     tokens = map(tokens, function(tok) {
       group_by(tok, token_id = data.table::rleid(NER))
     }),
     # One row per CITY run, its tokens pasted back together with spaces.
     city = map(tokens, function(tok) {
       tok %>%
         filter(NER == "CITY") %>%
         summarise(city = paste(token, collapse = " "))
     }),
     # Scalar per string: NA when no CITY run exists, otherwise the first run.
     # This replaces ifelse(is.tbl(x), ...), whose test was always TRUE and
     # which only gave NA through ifelse() recycling a zero-length vector.
     city = map_chr(city, function(x) {
       if (nrow(x) == 0L) NA_character_ else x$city[[1]]
     })
   )

 # Explicit print() call (S3 generic) on the first annotation, kept inside a
 # list of length one.
 print(places_ner$annotations[1])
 # Same element flattened with unlist(): very verbose output.
 unlist(places_ner$annotations[1])

 # Final result: the extracted city next to its source string.
 select(places_ner, city, string)
	#Sys.setenv(JAVA_OPTIONS = "-Xmx8g -Xms1G -XX:+UseConcMarkSweepGC -XX:-UseGCOverheadLimit")
	library(tidyverse)
	library(coreNLP)
	# from: https://stackoverflow.com/questions/48455079/extract-city-names-from-large-text-with-r
	# using CoreNLP to extract tokens
	# knb 20180422

	# Parent directory holding the unpacked CoreNLP releases, and the release
	# year to use. Any value other than 2018 selects the older 2015/2016 files.
	outputloc <- "/opt/smallapps/corenlp/"
	nlpver <- 2018

	# `ver` is the release directory name, `mod` the English models jar.
	if (nlpver == 2018) {
	  ver <- "stanford-corenlp-full-2018-02-27"
	  mod <- "stanford-english-corenlp-2018-02-27-models.jar"
	} else {
	  ver <- "stanford-corenlp-full-2015-12-09"
	  mod <- "stanford-english-corenlp-2016-01-10-models.jar"
	}

	# Both releases use the same annotator list, so the properties file is
	# written once, into the current working directory.
	# Longer alternative that was tried for the 2018 release:
	#writeLines('annotators = tokenize,ssplit,pos,lemma,ner,parse,dcoref, regexner\n', 'corenlp.properties')
	writeLines('annotators = tokenize, ssplit, pos, lemma, ner, regexner\n', 'corenlp.properties')

	# Stop early, before CoreNLP is initialised, when the release directory or
	# the English models jar is not where the script expects it. Both are large
	# manual downloads, so each message names the download page and the path
	# that was looked for.
	if (!dir.exists(file.path(outputloc, ver))) {
	  #downloadCoreNLP(outputloc, type = c("base")) # downloads ~ 385 MB, installed 581 MB
	  stop("download the base package from https://nlp.stanford.edu/software/ ",
	       "(expected directory: ", file.path(outputloc, ver), ")",
	       call. = FALSE)
	}
	if (!file.exists(file.path(outputloc, ver, mod))) {
	  #downloadCoreNLP(outputloc, type = c("english")) # ~ 946 MB
	  stop("download a language model from https://nlp.stanford.edu/software/ ",
	       "(expected file: ", file.path(outputloc, ver, mod), ")",
	       call. = FALSE)
	}
	#"chinese", "english", "french", "german", "spanish"
	# Start the CoreNLP Java backend. The annotators to run are read from the
	# properties file written earlier in this script.
	# If this doesn't work, re-install rJava.
	initCoreNLP(
	  # Use the directory whose existence was checked above. The previous
	  # file.path(Sys.getenv('CORENLP_HOME'), ver) resolved to "/<ver>" whenever
	  # CORENLP_HOME was unset, and could differ from the checked location.
	  libLoc = file.path(outputloc, ver),
	  type = "english",
	  parameterFile = "corenlp.properties",
	  mem = "8g"
	)
	unlink("corenlp.properties") # clean up

	example(getSentiment) # if this generates output, coreNLP has been loaded
	#sessionInfo()

	# Sample affiliation strings, one per row, in a single character column
	# named `string`. The previous literal contained "\|" sequences, which are
	# not valid escapes in an R string and stop the script from parsing.
	# Listing the rows explicitly removes the delimiter and the separate_rows()
	# call while giving the same 17 rows. tibble() replaces the deprecated
	# data_frame().
	places <- tibble(
	  string = c(
	    "Ucsd Medical Center, San Diego, California, USA",
	    "Yale Cancer Center, New Haven, Connecticut, USA",
	    "Massachusetts General Hospital., Boston, Massachusetts, USA",
	    "Dana Farber Cancer Institute, Boston, Massachusetts, USA",
	    "Washington University, Saint Louis, Missouri, USA",
	    "Mount SInai Medical Center, New York, New York, USA",
	    "Memorial Sloan Kettering Cancer Center, New York, New York, USA",
	    "Carolinas Healthcare System, Charlotte, North Carolina, USA",
	    "University Hospitals Case Medical Center; Seidman Cancer Center, Cleveland, Ohio, USA",
	    "Vanderbilt University Medical Center, Nashville, Tennessee, USA",
	    "Seattle Cancer Care Alliance, Seattle, Washington, USA",
	    "National Cancer Center, Gyeonggi-do, Korea, Republic of",
	    "Seoul National University Hospital, Seoul, Korea, Republic of",
	    "Severance Hospital, Yonsei University Health System, Seoul, Korea, Republic of",
	    "Korea University Guro Hospital, Seoul, Korea, Republic of",
	    "Asan Medical Center., Seoul, Korea, Republic of",
	    "VU MEDISCH CENTRUM; Dept. of Medical Oncology"
	  )
	)
	#coreNLP::annotateString()

	# Annotate every string with CoreNLP and pull out the first run of tokens
	# tagged CITY. Adds three columns to `places`:
	#   annotations - the object returned by annotateString() for each string
	#   tokens      - the annotation's `token` table, grouped by NER run
	#   city        - the city name as a character value, NA when none was tagged
	places_ner <- places %>%
	  mutate(
	    #annotations = map(string, annotateString, format="text", outputFile=NA),
	    annotations = map(string, annotateString),
	    tokens = map(annotations, "token"),
	    # Run-length id over the NER column: consecutive tokens with the same tag
	    # share an id, so a multi-word name is collapsed as one unit below.
	    tokens = map(tokens, function(tok) {
	      group_by(tok, token_id = data.table::rleid(NER))
	    }),
	    # One row per CITY run, its tokens pasted back together with spaces.
	    city = map(tokens, function(tok) {
	      tok %>%
	        filter(NER == "CITY") %>%
	        summarise(city = paste(token, collapse = " "))
	    }),
	    # Scalar per string: NA when no CITY run exists, otherwise the first run.
	    # This replaces ifelse(is.tbl(x), ...), whose test was always TRUE and
	    # which only gave NA through ifelse() recycling a zero-length vector.
	    city = map_chr(city, function(x) {
	      if (nrow(x) == 0L) NA_character_ else x$city[[1]]
	    })
	  )

	# Explicit print() call (S3 generic) on the first annotation, kept inside a
	# list of length one.
	print(places_ner$annotations[1])
	# Same element flattened with unlist(): very verbose output.
	unlist(places_ner$annotations[1])

	# Final result: the extracted city next to its source string.
	select(places_ner, city, string)
city	string
San Diego	Ucsd Medical Center, San Diego, California, USA
New Haven	Yale Cancer Center, New Haven, Connecticut, USA
Boston	Massachusetts General Hospital., Boston, Massachusetts, USA
Boston	Dana Farber Cancer Institute, Boston, Massachusetts, USA
NA	Washington University, Saint Louis, Missouri, USA
NA	Mount SInai Medical Center, New York, New York, USA
NA	Memorial Sloan Kettering Cancer Center, New York, New York, USA
Charlotte	Carolinas Healthcare System, Charlotte, North Carolina, USA
Cleveland	University Hospitals Case Medical Center; Seidman Cancer Center, Cleveland, Ohio, USA
Nashville	Vanderbilt University Medical Center, Nashville, Tennessee, USA
Seattle	Seattle Cancer Care Alliance, Seattle, Washington, USA
NA	National Cancer Center, Gyeonggi-do, Korea, Republic of
Seoul	Seoul National University Hospital, Seoul, Korea, Republic of
Seoul	Severance Hospital, Yonsei University Health System, Seoul, Korea, Republic of
Seoul	Korea University Guro Hospital, Seoul, Korea, Republic of
Seoul	Asan Medical Center., Seoul, Korea, Republic of
NA	VU MEDISCH CENTRUM; Dept. of Medical Oncology