@odashi
Created January 26, 2016 20:22
A wrapper that forces the Stanford Tokenizer to process its input one line at a time. Useful for pipe-based communication.
import java.io.*;
import java.util.*;

import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.WordTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;

public class StanfordTokenizerRunner {

    // Tokenizes a single line of text with the Stanford PTBTokenizer.
    private static List<String> tokenize(String text) {
        PTBTokenizer<Word> tokenizer = new PTBTokenizer<Word>(
            new StringReader(text), new WordTokenFactory(), "");
        List<String> tokens = new ArrayList<String>();
        while (tokenizer.hasNext()) {
            tokens.add(tokenizer.next().word());
        }
        return tokens;
    }

    // Joins tokens with the given separator.
    private static String join(List<String> tokens, String separator) {
        StringBuilder ret = new StringBuilder();
        for (String token : tokens) {
            if (ret.length() > 0) ret.append(separator);
            ret.append(token);
        }
        return ret.toString();
    }

    // Reads standard input line by line, tokenizes each line, and writes the
    // space-joined tokens to standard output, flushing immediately so the
    // program can be driven through a pipe.
    public static void main(String[] args) throws IOException {
        BufferedReader reader = new BufferedReader(
            new InputStreamReader(System.in));
        while (true) {
            String text = reader.readLine();
            if (text == null) break;
            System.out.println(join(tokenize(text), " "));
            System.out.flush();
        }
    }
}
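For pipe communication from another Java program, a sketch along the following lines should work. The TokenizerClient class name, the classpath string, and the stanford-corenlp.jar filename are illustrative assumptions; adjust them to your setup. The key point is that the wrapper emits exactly one output line per input line, so requests and responses stay in lockstep.

import java.io.*;

public class TokenizerClient {
    public static void main(String[] args) throws IOException, InterruptedException {
        // Assumption: StanfordTokenizerRunner is compiled in the current
        // directory and the Stanford CoreNLP jar is on the classpath below.
        ProcessBuilder pb = new ProcessBuilder(
            "java", "-cp", ".:stanford-corenlp.jar", "StanfordTokenizerRunner");
        pb.redirectError(ProcessBuilder.Redirect.INHERIT);
        Process proc = pb.start();

        BufferedWriter toTokenizer = new BufferedWriter(
            new OutputStreamWriter(proc.getOutputStream()));
        BufferedReader fromTokenizer = new BufferedReader(
            new InputStreamReader(proc.getInputStream()));

        // Send one sentence per line; read back one tokenized line per sentence.
        String[] sentences = {
            "Dr. Smith isn't here.",
            "Call me at 9:30, okay?"
        };
        for (String sentence : sentences) {
            toTokenizer.write(sentence);
            toTokenizer.newLine();
            toTokenizer.flush();
            System.out.println(fromTokenizer.readLine());
        }

        toTokenizer.close();   // closing stdin lets the wrapper exit cleanly
        proc.waitFor();
    }
}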