dumpTokenized("中文标题", new CJKAnalyzer(Version.LUCENE_47));
Text to tokenize [中文标题] via LimitTokenCountAnalyzer
[中] [文] [标] [题]
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.jackrabbit.oak.plugins.index.lucene.FieldNames;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Prints each token the given analyzer produces for the given text.
private void dumpTokenized(String text, Analyzer analyzer) throws IOException {
    List<String> tokens = tokenize(text, analyzer);
    System.out.printf("Text to tokenize [%s] via %s%n", text, analyzer.getClass().getSimpleName());
    for (String t : tokens) {
        System.out.printf("[%s] ", t);
    }
    System.out.println();
}

// Runs the analyzer over the text and collects the resulting terms.
private List<String> tokenize(String text, Analyzer analyzer) throws IOException {
    List<String> tokens = new ArrayList<String>();
    // FieldNames.FULLTEXT (":fulltext") is Oak's Lucene full-text field name.
    TokenStream stream = analyzer.tokenStream(FieldNames.FULLTEXT, new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        tokens.add(termAtt.toString());
    }
    stream.end();
    stream.close();
    return tokens;
}
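For comparison, here is a minimal sketch of calling the helper with other analyzers. The StandardAnalyzer call and the LimitTokenCountAnalyzer wrapping (with an assumed token limit of 10000) are illustrative assumptions, not the exact setup above; they show that analyzers built on StandardTokenizer emit one token per Han character, while CJKAnalyzer emits overlapping bigrams:

import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;

// Assumed analyzer setup, for illustration only.
dumpTokenized("中文标题", new StandardAnalyzer(Version.LUCENE_47));
// Text to tokenize [中文标题] via StandardAnalyzer
// [中] [文] [标] [题]

dumpTokenized("中文标题",
        new LimitTokenCountAnalyzer(new StandardAnalyzer(Version.LUCENE_47), 10000));
// Text to tokenize [中文标题] via LimitTokenCountAnalyzer
// [中] [文] [标] [题]

dumpTokenized("中文标题", new CJKAnalyzer(Version.LUCENE_47));
// Text to tokenize [中文标题] via CJKAnalyzer
// [中文] [文标] [标题]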