Last active
May 15, 2019 15:46
-
-
Save sergey-tihon/41d122e67ca74384f02a3aa0456ed365 to your computer and use it in GitHub Desktop.
A sample of training a custom NER model using OpenNLP.NET https://github.com/sergey-tihon/OpenNLP.NET
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#load "common.fsx" | |
open java.nio.charset | |
open java.io | |
#I "../packages/OpenNLP.NET/lib/" | |
#r "opennlp-tools-1.8.4.dll" | |
#r "opennlp-uima-1.8.4.dll" | |
open opennlp.tools.util | |
open opennlp.tools.namefind | |
// The training data should contain at least 15000 sentences to produce a model that performs well
/// Trains a name-finder model from the annotated sentences in `inputFile`.
/// The file is read line by line as UTF-8, each line is fed through a
/// NameSampleDataStream, and the model is trained for language "en" and
/// entity type "person" using a freshly constructed TrainingParameters
/// (no overrides). Returns whatever NameFinderME.train returns.
let train (inputFile:string) =
    // PlainTextByLineStream takes a factory rather than an open stream,
    // so the file is only opened when createInputStream is invoked.
    let streamFactory =
        { new InputStreamFactory with
            member __.createInputStream () =
                new FileInputStream(inputFile) :> InputStream }
    let lines = new PlainTextByLineStream(streamFactory, StandardCharsets.UTF_8)
    // `use` disposes the sample stream when the function returns.
    use samples = new NameSampleDataStream(lines)
    let finderFactory = TokenNameFinderFactory()
    let parameters = TrainingParameters()
    //parameters.put(TrainingParameters.ITERATIONS_PARAM, "5")
    //parameters.put(TrainingParameters.CUTOFF_PARAM, "200")
    NameFinderME.train("en", "person", samples, parameters, finderFactory)
/// Serializes `model` to `outputFile` through a buffered file output stream.
/// The buffered stream is disposed when the function exits.
let save (outputFile:string) (model:TokenNameFinderModel) =
    let fileOut = new FileOutputStream(outputFile)
    use buffered = new BufferedOutputStream(fileOut)
    model.serialize(buffered)
/// Builds a TokenNameFinderModel from the contents of `inputFile`.
/// The file stream is disposed when the function exits.
let load (inputFile:string) =
    use source = new FileInputStream(inputFile)
    let loaded = TokenNameFinderModel(source)
    loaded
// http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/namefind/AnnotatedSentencesWithTypes.txt?revision=1245855&view=markup | |
// Train on the annotated sample file, write the result to disk,
// then read it back from the same path.
let newModel = train Common.Data.openNLP.``my-model.train``
save Common.Data.openNLP.``my-model.bin`` newModel
let model = Common.Data.openNLP.``my-model.bin`` |> load
open opennlp.tools.tokenize | |
/// Tokenizer built from the `en-token.bin` model file referenced through
/// Common.PaketFiles. The file stream is disposed once the tokenizer
/// has been constructed.
let tokenizer =
    let modelPath = Common.PaketFiles.``opennlp.sourceforge.net``.``en-token.bin``
    use stream = new FileInputStream(modelPath)
    TokenizerME(TokenizerModel(stream))
// Tokenize a sample sentence and run the loaded model's name finder over it.
let sentence = tokenizer.tokenize "Hi Sergey Tihon, it's your NER model."
let nameFinder = NameFinderME(model)
let spans = nameFinder.find sentence
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment