Created
January 26, 2016 20:22
-
-
Save odashi/db0253e5fd4823e53566 to your computer and use it in GitHub Desktop.
Stanford Tokenizerを強制的に1行ずつ解析させるラッパ。パイプ通信用に使える。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.*; | |
import java.util.*; | |
import edu.stanford.nlp.ling.Word; | |
import edu.stanford.nlp.process.WordTokenFactory; | |
import edu.stanford.nlp.process.PTBTokenizer; | |
public class StanfordTokenizerRunner { | |
private static List<String> tokenize(String text) { | |
PTBTokenizer<Word> tokenizer = new PTBTokenizer<Word>( | |
new StringReader(text), new WordTokenFactory(), ""); | |
List<String> tokens = new ArrayList<String>(); | |
while (tokenizer.hasNext()) { | |
tokens.add(tokenizer.next().word()); | |
} | |
return tokens; | |
} | |
private static String join(List<String> tokens, String separator) { | |
String ret = ""; | |
for (String token : tokens) { | |
if (ret.length() > 0) ret += " "; | |
ret += token; | |
} | |
return ret; | |
} | |
public static void main(String[] args) throws IOException { | |
BufferedReader reader = new BufferedReader( | |
new InputStreamReader(System.in)); | |
while (true) { | |
String text = reader.readLine(); | |
if (text == null) break; | |
System.out.println(join(tokenize(text), " ")); | |
System.out.flush(); | |
} | |
} | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment