Last active
December 12, 2015 09:49
-
-
Save mocobeta/4754433 to your computer and use it in GitHub Desktop.
Lucene カスタム TokenFilter, Analyzer の例
for Lucene 4.1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * 以下は、Apache Software License v2.0 の下で頒布されているコードに一部改変を加えたものです。
 * http://www.apache.org/licenses/LICENSE-2.0.txt
 */
import java.io.IOException; | |
import java.io.Reader; | |
import java.util.HashMap; | |
import java.util.Map; | |
import org.apache.lucene.analysis.Analyzer; | |
import org.apache.lucene.analysis.Tokenizer; | |
import org.apache.lucene.analysis.core.LowerCaseFilter; | |
import org.apache.lucene.analysis.core.StopAnalyzer; | |
import org.apache.lucene.analysis.core.StopFilter; | |
import org.apache.lucene.analysis.standard.StandardFilter; | |
import org.apache.lucene.analysis.standard.StandardTokenizer; | |
import org.apache.lucene.util.Version; | |
import example.AnalyzerUtils; | |
public class SynonymAnalyzer extends Analyzer { | |
private SynonymEngine engine; | |
public SynonymAnalyzer(SynonymEngine engine) { | |
this.engine = engine; | |
} | |
@Override | |
protected TokenStreamComponents createComponents(String arg0, Reader in) { | |
Tokenizer source = new StandardTokenizer(Version.LUCENE_41, in); | |
return new TokenStreamComponents(source, | |
new SynonymFilter( | |
new StopFilter(Version.LUCENE_41, | |
new LowerCaseFilter(Version.LUCENE_41, | |
new StandardFilter(Version.LUCENE_41, source)), | |
StopAnalyzer.ENGLISH_STOP_WORDS_SET), | |
engine)); | |
} | |
public static void main(String[] args) throws IOException { | |
Analyzer analyzer = new SynonymAnalyzer(new TestSynonymEngine()); | |
AnalyzerUtils.displayTokensWithPositions(analyzer, | |
"The quick brown fox jumps over the lazy dog"); | |
} | |
} | |
/** | |
* テスト用SynonymEngine | |
* ハードコーディングされたシノニムのみ返す | |
**/ | |
class TestSynonymEngine implements SynonymEngine { | |
private static Map<String, String[]> map = new HashMap<String, String[]>(); | |
static { | |
map.put("quick", new String[]{"fast", "speedy"}); | |
map.put("jumps", new String[]{"leaps", "hops"}); | |
map.put("over", new String[]{"above"}); | |
map.put("lazy", new String[]{"apathetic", "sluggish"}); | |
map.put("dog", new String[]{"canine", "pooch"}); | |
} | |
@Override | |
public String[] getSynonyms(String s) throws IOException { | |
return map.get(s); | |
} | |
} | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * 以下は、Apache Software License v2.0 の下で頒布されているコードに一部改変を加えたものです。
 * http://www.apache.org/licenses/LICENSE-2.0.txt
 */
import java.io.IOException; | |
import java.io.StringReader; | |
import org.apache.lucene.analysis.TokenStream; | |
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | |
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | |
import org.apache.lucene.document.Document; | |
import org.apache.lucene.document.TextField; | |
import org.apache.lucene.document.Field.Store; | |
import org.apache.lucene.index.DirectoryReader; | |
import org.apache.lucene.index.IndexReader; | |
import org.apache.lucene.index.IndexWriter; | |
import org.apache.lucene.index.IndexWriterConfig; | |
import org.apache.lucene.index.Term; | |
import org.apache.lucene.search.IndexSearcher; | |
import org.apache.lucene.search.PhraseQuery; | |
import org.apache.lucene.search.TermQuery; | |
import org.apache.lucene.store.RAMDirectory; | |
import org.apache.lucene.util.Version; | |
import org.junit.After; | |
import org.junit.Before; | |
import org.junit.Test; | |
public class SynonymAnalyzerTest { | |
private RAMDirectory directory; | |
private IndexReader reader; | |
private static SynonymAnalyzer analyzer = | |
new SynonymAnalyzer(new TestSynonymEngine()); | |
@Before | |
public void setUp() throws Exception { | |
directory = new RAMDirectory(); | |
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_41, analyzer); | |
IndexWriter writer = new IndexWriter(directory, config); | |
Document doc = new Document(); | |
doc.add(new TextField("content", "The quick brown fox jumps over the lazy dog", Store.YES)); | |
writer.addDocument(doc); | |
writer.close(); | |
reader = DirectoryReader.open(directory); | |
} | |
@After | |
public void tearDown() throws IOException { | |
reader.close(); | |
} | |
@Test | |
public void testSearchByAPI() throws Exception { | |
IndexSearcher searcher = new IndexSearcher(reader); | |
// "hops" にマッチするドキュメントを検索 | |
TermQuery termQuery = new TermQuery(new Term("content", "hops")); | |
assertEquals(1, searcher.search(termQuery, 10).totalHits); | |
// フレーズ "fox hops" にマッチするドキュメントを検索 | |
PhraseQuery phraseQuery = new PhraseQuery(); | |
phraseQuery.add(new Term("content", "fox")); | |
phraseQuery.add(new Term("content", "hops")); | |
assertEquals(1, searcher.search(phraseQuery, 10).totalHits); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * 以下は、Apache Software License v2.0 の下で頒布されているコードに一部改変を加えたものです。
 * http://www.apache.org/licenses/LICENSE-2.0.txt
 */
import java.io.IOException; | |
/**
 * Lookup service that maps a term to its synonyms.
 */
public interface SynonymEngine {

  /**
   * Returns the group of synonyms that corresponds to the given string.
   *
   * @param s the term to look up
   * @return the synonyms for {@code s}; the bundled test engine returns {@code null}
   *     when it has no entry, and {@code SynonymFilter} treats {@code null} as
   *     "no synonyms"
   * @throws IOException if the lookup fails
   */
  String[] getSynonyms(String s) throws IOException;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * 以下は、Apache Software License v2.0 の下で頒布されているコードに一部改変を加えたものです。
 * http://www.apache.org/licenses/LICENSE-2.0.txt
 */
import java.io.IOException; | |
import java.util.Stack; | |
import org.apache.lucene.analysis.TokenFilter; | |
import org.apache.lucene.analysis.TokenStream; | |
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | |
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | |
import org.apache.lucene.util.AttributeSource; | |
public class SynonymFilter extends TokenFilter { | |
public static final String TOKEN_TYPE_SYNONYM = "SYNONYM"; | |
private Stack<String> synonymStack; | |
private SynonymEngine engine; | |
private AttributeSource.State current; | |
// 使用する Attribute は CharTermAttribute と PositionIncrementAttribute | |
private final CharTermAttribute termAtt; | |
private final PositionIncrementAttribute posIncrAtt; | |
public SynonymFilter(TokenStream input, SynonymEngine engine) { | |
super(input); | |
// synonymStack 初期化 | |
synonymStack = new Stack<String>(); | |
this.engine = engine; | |
// TokenStream に Attribute を追加 | |
this.termAtt = addAttribute(CharTermAttribute.class); | |
this.posIncrAtt = addAttribute(PositionIncrementAttribute.class); | |
} | |
@Override | |
public boolean incrementToken() throws IOException { | |
// synonymStack から全てのシノニムをpopする | |
if (synonymStack.size() > 0) { | |
String syn = synonymStack.pop(); | |
restoreState(current); | |
char[] buf = syn.toCharArray(); | |
// CharTermAttribute にシノニム文字列をセット | |
termAtt.resizeBuffer(buf.length); | |
termAtt.copyBuffer(buf, 0, buf.length); | |
// postion increment を 0 にセット | |
posIncrAtt.setPositionIncrement(0); | |
return true; | |
} | |
// 次の Token を読み出す | |
if (!input.incrementToken()) | |
return false; | |
// synonymStackにシノニムをpushする | |
if (addAliasesToStack()) { | |
current = captureState(); | |
} | |
return true; | |
} | |
private boolean addAliasesToStack() throws IOException { | |
// 現在の CharTermAttribute に対応するシノニムをすべて synonymStack に push | |
String[] synonyms = engine.getSynonyms(termAtt.toString()); | |
if (synonyms == null) { | |
return false; | |
} | |
for (String synonym : synonyms) { | |
synonymStack.push(synonym); | |
} | |
return true; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment