Last active
June 6, 2018 22:39
-
-
Save seralf/6664201 to your computer and use it in GitHub Desktop.
A simple introductory example for experimenting with the Stanford Named Entity Recognition (NER) tool from the Scala language.
NOTE: the model used here is one of the models provided in the standard distribution.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package ner | |
import edu.stanford.nlp.ie.crf.CRFClassifier | |
import scala.collection.JavaConversions._ | |
import scala.collection.JavaConverters._ | |
import edu.stanford.nlp.ling.CoreAnnotations | |
import java.util.ArrayList | |
import java.util.HashMap | |
import java.util.Map | |
import scala.xml.XML | |
import javax.xml.parsers.SAXParser | |
import java.util.LinkedHashMap | |
import edu.stanford.nlp.ling.CoreLabel | |
import java.util.List | |
import scala.collection.immutable.Seq | |
import scala.collection.Seq | |
import scala.collection.mutable.ListBuffer | |
/**
 * Introductory demo of the Stanford NER CRF classifier.
 *
 * Loads a serialized classifier, classifies a sample text sentence by sentence,
 * collects the recognized tokens grouped by their label, and prints:
 *  1. the collected `(label, joined tokens)` pairs,
 *  2. the result of `classifyWithInlineXML`,
 *  3. the result of `classifyToString`.
 */
object MainNERAnnotator extends App {

  /** Path of the serialized classifier model, relative to the working directory. */
  val serializedClassifier = "stanford_classifiers/english.all.3class.distsim.crf.ser.gz"

  // The explicit type keeps tokens typed as CoreLabel even if the factory method is
  // generic in the library version in use (otherwise Scala could infer Nothing).
  val classifier: CRFClassifier[CoreLabel] =
    CRFClassifier.getClassifierNoExceptions(serializedClassifier)

  /** Sample input; the content is identical to a flush-left triple-quoted literal. */
  val text =
    """
      |The Stanford University, which is where the Stanford NER tool is from, is located in California.
      |Leonardo Da Vinci was a famous italian painter, originally born in Vinci, Italy.
      |Wikipedia is one very big source of information for playing with Named Entities;
      |for example start reading the page: http://en.wikipedia.org/wiki/Named-entity_recognition.
      |""".stripMargin

  /**
   * Label the classifier attached to `word`, if it marks a recognized entity.
   *
   * @return `None` when the annotation is missing (null), empty, or `"O"`
   *         (the value this demo treats as "not an entity").
   */
  private def entityLabel(word: CoreLabel): Option[String] =
    Option(word.get(classOf[CoreAnnotations.AnswerAnnotation]))
      .filter(label => label.nonEmpty && label != "O")

  /**
   * Groups the recognized tokens of one sentence by label.
   *
   * Tokens labelled `"LOCATION"` are joined with `", "`, all others with `" "`.
   * The return type is fully qualified because the file imports `java.util.Map`.
   *
   * @param sentence tokens of a single classified sentence
   * @return one entry per label found in the sentence: label -> joined token text
   */
  private def entitiesByLabel(
      sentence: Iterable[CoreLabel]
  ): scala.collection.immutable.Map[String, String] =
    sentence.toVector
      // Use the token's surface form (`word()`) instead of relying on CoreLabel.toString,
      // whose format is not guaranteed to be just the word.
      .flatMap(word => entityLabel(word).map(label => (label, word.word())).toList)
      .groupBy { case (label, _) => label }
      .map { case (label, labelled) =>
        val separator = if (label == "LOCATION") ", " else " "
        label -> labelled.map { case (_, token) => token }.mkString(separator)
      }

  /** Accumulated `(label, joined tokens)` pairs, in sentence order. */
  val keywords = new ListBuffer[(String, String)]

  // Explicit `.asScala` conversions instead of the deprecated implicit JavaConversions.
  for (sentence <- classifier.classify(text).asScala) {
    keywords ++= entitiesByLabel(sentence.asScala)
  }

  println("\nKEYWORDS: " + keywords.mkString(" | "))

  val out2 = classifier.classifyWithInlineXML(text)
  println(out2)

  val out3 = classifier.classifyToString(text)
  println(out3)

}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment