Created
February 13, 2018 17:57
-
-
Save Jawn78/bab736aeeb9c5b0f64d99e7aceeb1d98 to your computer and use it in GitHub Desktop.
OpenNLP, Apache Tika, and XML are used to perform entity recognition. The models still need to be trained, which has led me to work with regex and other libraries to process the data while adding information to the model to improve recognition.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* To change this license header, choose License Headers in Project Properties. | |
* To change this template file, choose Tools | Templates | |
* and open the template in the editor. | |
*/ | |
package rex1nlp; | |
import java.io.File; | |
import java.io.FileInputStream; | |
import java.io.InputStream; | |
import opennlp.tools.namefind.NameFinderME; | |
import opennlp.tools.namefind.TokenNameFinderModel; | |
import opennlp.tools.tokenize.TokenizerME; | |
import opennlp.tools.tokenize.TokenizerModel; | |
import opennlp.tools.util.Span; | |
import org.apache.tika.metadata.Metadata; | |
import org.apache.tika.parser.AutoDetectParser; | |
import org.apache.tika.parser.ParseContext; | |
import org.apache.tika.parser.Parser; | |
import org.apache.tika.sax.BodyContentHandler; | |
import org.xml.sax.ContentHandler; | |
public class rexNERex { | |
public static void main(String args[]) throws Exception{ | |
//Getting the sentence in the form of String array | |
String target = "C:\\Users\\RexPC\\Documents\\Haily.docx"; | |
File document = new File(target); | |
Parser parser = new AutoDetectParser(); | |
ContentHandler handler = new BodyContentHandler(); | |
Metadata metadata = new Metadata(); | |
parser.parse(new FileInputStream(document), handler, metadata, new ParseContext()); | |
// System.out.println(handler); | |
//Loading the tokenizer model | |
InputStream inputStreamTokenizer = new | |
FileInputStream("C:\\Users\\RexPC\\Documents\\Programming\\Apache OpenNLP\\Models\\Original OpenNLP Models\\en-token.bin"); | |
TokenizerModel tokenModel = new TokenizerModel(inputStreamTokenizer); | |
//Instantiating the TokenizerME class | |
TokenizerME tokenizer = new TokenizerME(tokenModel); | |
//Tokenizing the sentence in to a string array | |
String tokens[] = tokenizer.tokenize(handler.toString()); | |
for(String tokenin: tokens) | |
System.out.println(tokenin); | |
InputStream inputStreamNameFinder = new FileInputStream("C:\\Users\\RexPC\\Documents\\Programming\\Apache OpenNLP\\Models\\Original OpenNLP Models\\en-ner-person.bin"); | |
TokenNameFinderModel model = new TokenNameFinderModel(inputStreamNameFinder); | |
//Instantiating the NameFinderME class | |
NameFinderME nameFinder = new NameFinderME(model); | |
//Finding the names in the sentence | |
Span nameSpans[] = nameFinder.find(tokens); | |
//Printing the names and their spans in a sentence | |
// for(Span s: nameSpans) | |
// System.out.println(s.toString()); | |
/* | |
InputStream modelIn = new FileInputStream("C:\\Users\\RexPC\\Documents\\Programming\\Apache OpenNLP\\Models\\Original OpenNLP Models\\en-sent.bin"); | |
SentenceModel stcmodel = null; | |
try { | |
stcmodel = new SentenceModel(modelIn); | |
} | |
catch (IOException e) { | |
} | |
//Instantiating the SentenceDetectorME class | |
SentenceDetectorME detector = new SentenceDetectorME(stcmodel); | |
String sentences[]; | |
sentences = detector.sentDetect(handler.toString()); | |
//Finding the names in the sentence | |
Span nameSpans[] = nameFinder.find(sentences); | |
//Printing the spans of the names in the sentence | |
for(Span s: nameSpans) | |
System.out.println(s.toString()); | |
*/ | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is a project that uses OpenNLP and Apache Tika to do entity recognition. Tika auto-parses the document and extracts its text, which is then split into sentences and tokens for OpenNLP to process. The results showed a need to improve the model. I am looking at OpenNLP and regex functionality to process inputs alongside the OpenNLP models, producing more training data while increasing the accuracy of the results. There may be a better option: using layers of neural networks to accomplish this more efficiently.