Created
July 25, 2011 17:52
-
-
Save spatzle/1104702 to your computer and use it in GitHub Desktop.
Opennlp-servlet
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.example.opennlp; | |
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import javax.servlet.*;
import javax.servlet.http.*;
import com.google.gson.Gson;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.namefind.DictionaryNameFinder;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinder;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;
import opennlp.tools.util.StringList;
/** | |
* This is a servlet interface to the Opennlp maxent classifier. | |
* | |
* @author Joyce Chan 2011 | |
**/ | |
public class OpennlpServlet extends HttpServlet { | |
private TokenNameFinder f_nameFinder; | |
private TokenNameFinder r_nameFinder; | |
private TokenizerME tokenizer ; | |
private TokenizerME tokenizer_internal_use ; | |
private TreeSet<String> stopwords = new TreeSet<String>(); | |
private final String ENGLISH_STOP_WORDS[] = { | |
"a", "an", "and", "are", "as", "at", "be", "but", "by", | |
"for", "if", "in", "into", "is", "it", | |
"no", "not", "of", "on", "or", "such", | |
"that", "the", "their", "then", "there", "these", | |
"they", "this", "to", "was", "will", "with" | |
}; | |
private Dictionary r_dictionary; | |
private static final long serialVersionUID = 1L; | |
private TokenizerME getTokenizer(String tokenizer_title) throws ServletException{ | |
String title; | |
String location; | |
TokenizerME tokenizer = null; | |
// get default classifier | |
title = getServletConfig().getInitParameter(tokenizer_title); | |
if (title == null || title.trim().equals("")) throw new ServletException("Default Tokenizer not given."); | |
location = getServletConfig().getInitParameter(title); | |
if (location == null || location.trim().equals("")) throw new ServletException("Tokenizer location not given."); | |
InputStream stream = getServletConfig().getServletContext().getResourceAsStream(location); | |
if (stream == null) throw new ServletException("File not found. Filename = " + location); | |
try { | |
tokenizer = new TokenizerME(new TokenizerModel( new BufferedInputStream(stream))); | |
} catch (IOException e) { | |
try{ | |
stream = new FileInputStream(location); | |
}catch(IOException e2){ | |
throw new ServletException("IO problem reading tokenizer A. "+location); | |
} | |
throw new ServletException("IO problem reading tokenizer B."); | |
} | |
finally { | |
if ( stream != null){ | |
try { stream.close(); } catch (IOException e) { } | |
} | |
} return tokenizer; | |
} //end getTokenModel | |
// for ner | |
private TokenNameFinder getNameFinder(String classifier_title) throws ServletException{ | |
String title; | |
String location; | |
TokenNameFinder nmFinder = null; | |
// get default classifier | |
title = getServletConfig().getInitParameter(classifier_title); | |
if (title == null || title.trim().equals("")) throw new ServletException("Default classifier not given."); | |
location = getServletConfig().getInitParameter(title); | |
if (location == null || location.trim().equals("")) throw new ServletException("Classifier location not given."); | |
InputStream filestream = getServletConfig().getServletContext().getResourceAsStream(location); | |
if (filestream == null) throw new ServletException("File not found. Filename = " + location); | |
try { | |
nmFinder = new NameFinderME(new TokenNameFinderModel( new BufferedInputStream(filestream))); | |
} catch (IOException e) { | |
throw new ServletException("IO problem reading classifier."); | |
} finally { | |
if ( filestream != null){ | |
try { filestream.close(); } catch (IOException e) { } | |
} | |
} return nmFinder; | |
} //end getTokenNameFinder | |
private DictionaryNameFinder getDictionaryNameFinder(String filetitle) throws ServletException{ | |
r_dictionary = new Dictionary(); | |
String title; | |
String location; | |
title = getServletConfig().getInitParameter(filetitle); | |
if (title == null || title.trim().equals("")) throw new ServletException("Default classifier not given."); | |
location = getServletConfig().getInitParameter(title); | |
if (location == null || location.trim().equals("")) throw new ServletException("Classifier location not given."); | |
InputStream filestream = getServletConfig().getServletContext().getResourceAsStream(location); | |
if (filestream == null) throw new ServletException("File not found. Filename = " + location); | |
try { | |
BufferedReader br = new BufferedReader(new InputStreamReader(new DataInputStream(filestream))); | |
String strLine; | |
while ((strLine = br.readLine()) != null) { | |
//change to lower case, remove commas, tokenize, remove stop words | |
String s[] = removeStopWordsFromSentence(tokenizer.tokenize(strLine.toLowerCase().replace(",", ""))); | |
putInDict(r_dictionary, s); | |
for (int i =0;i<s.length;i++){ | |
putInDict(r_dictionary,s[i]); | |
} | |
} | |
} catch (Exception e) {// Catch exception if any | |
System.err.println("Error: " + e.getMessage()); | |
}finally{ | |
if ( filestream != null){ | |
try { filestream.close(); } catch (IOException e) { } | |
} | |
} | |
DictionaryNameFinder dnf = new DictionaryNameFinder(r_dictionary); | |
return dnf; | |
} | |
// here are 2 examples: | |
// a maxent trained classifier | |
// and a non-trained classifier that will classify from dictionary words only | |
// same tokenizer, you can download on the opennlp project page, the english one is ok | |
public void init() throws ServletException { | |
tokenizer = getTokenizer("default-tokenizer"); | |
tokenizer_internal_use = getTokenizer("default-tokenizer"); | |
// a maxent trained classifier | |
f_nameFinder = getNameFinder("f-classifier"); | |
// not trained, just a text file to be used to form the dictionary | |
r_nameFinder = getDictionaryNameFinder("r-raw-text"); | |
addToStopWords(); | |
} | |
public void doGet(HttpServletRequest request, HttpServletResponse response) | |
throws ServletException, IOException { | |
doPost(request, response); | |
} | |
public void doPost(HttpServletRequest request, HttpServletResponse response) | |
throws ServletException, IOException { | |
@SuppressWarnings("unchecked") | |
Map<String, String> reqMap = request.getParameterMap(); | |
PrintWriter pw = response.getWriter(); | |
String names[] = {}; | |
if (reqMap.containsKey("sentence")){ | |
String sentence = request.getParameter("sentence"); | |
String tokens[] = removeStopWordsFromSentence(tokenizer.tokenize(sentence.toLowerCase().replace(",", ""))); | |
Span spannames[] = null; | |
if (reqMap.containsKey("dict") && request.getParameter("dict").equals("on")){ | |
spannames = r_nameFinder.find(tokens); | |
}else{ | |
spannames = f_nameFinder.find(tokens); | |
} | |
names = Span.spansToStrings(spannames,tokens); | |
if (reqMap.containsKey("wt") && request.getParameter("wt").equals("json")){ | |
printAsJson(response, pw, names); | |
} else printAsHtml(response, pw, names); | |
} | |
} //end doPost | |
// private methods | |
private void addToStopWords(){ | |
for (int i=0; i< ENGLISH_STOP_WORDS.length;i++){ | |
stopwords.add(ENGLISH_STOP_WORDS[i]); | |
} | |
} | |
private String[] removeStopWordsFromSentence(String[] tokens){ | |
ArrayList<String> newTokens = new ArrayList<String>(Arrays.asList(tokens)); | |
for (int i=0;i<newTokens.size();i++){ | |
if (stopwords.contains(newTokens.get(i))){ | |
newTokens.remove(i); | |
} | |
} | |
return (String []) newTokens.toArray (new String [newTokens.size ()]); | |
} | |
private Dictionary putInDict(Dictionary d,String s){ | |
StringList sl = new StringList(new String[]{s}); | |
d.put(sl); | |
return d; | |
} | |
private Dictionary putInDict(Dictionary d,String[] s){ | |
StringList sl = new StringList(s); | |
d.put(sl); | |
return d; | |
} | |
// prints result as json | |
private void printAsJson(HttpServletResponse response, PrintWriter pw, String[] names){ | |
Gson gson = new Gson(); | |
String jsonOut = gson.toJson(names); | |
response.setContentType("application/json"); | |
pw.println(jsonOut); | |
} | |
// this prints it as html on a bulleted list | |
private void printAsHtml(HttpServletResponse response,PrintWriter pw, String[] names){ | |
response.setContentType("text/html"); | |
pw.println("<html>"); | |
pw.println("<head><title>Named entities</title></title>"); | |
pw.println("<body>"); | |
pw.println("<h1>Here are your named entities</h1>"); | |
pw.println("<ul>"); | |
for (int i=0;i<names.length;i++){ | |
pw.println("<li>"+names[i]); | |
} | |
pw.println("</ ul>"); | |
pw.println("</body></html>"); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment