Skip to content

Instantly share code, notes, and snippets.

@mocobeta
Last active December 12, 2015 09:49
Show Gist options
  • Save mocobeta/4754433 to your computer and use it in GitHub Desktop.
Save mocobeta/4754433 to your computer and use it in GitHub Desktop.
Lucene カスタム TokenFilter, Analyzer の例 for Lucene 4.1
/**
* 以下は、Apache License, Version 2.0 のもとで頒布されているコードに一部改変を加えたものです。
* http://www.apache.org/licenses/LICENSE-2.0.txt
*/
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
import example.AnalyzerUtils;
/**
 * Analyzer that tokenizes text with {@link StandardTokenizer}, then applies
 * {@link StandardFilter}, {@link LowerCaseFilter}, an English {@link StopFilter}
 * and finally a {@link SynonymFilter} driven by the supplied {@link SynonymEngine}.
 */
public class SynonymAnalyzer extends Analyzer {

    /** Synonym source handed to every {@link SynonymFilter} this analyzer builds. */
    private final SynonymEngine engine;

    /**
     * @param engine synonym lookup used by the synonym stage of the analysis chain
     */
    public SynonymAnalyzer(SynonymEngine engine) {
        this.engine = engine;
    }

    /**
     * Builds the analysis chain for one field.
     *
     * @param fieldName name of the field being analyzed (not used by this analyzer)
     * @param reader    character source to tokenize
     * @return the tokenizer together with the fully wrapped filter chain
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new StandardTokenizer(Version.LUCENE_41, reader);

        // Stages are wrapped innermost-first; the synonym stage runs last so it
        // sees lower-cased tokens with stop words already removed.
        StandardFilter standardStage = new StandardFilter(Version.LUCENE_41, tokenizer);
        LowerCaseFilter lowerCaseStage = new LowerCaseFilter(Version.LUCENE_41, standardStage);
        StopFilter stopStage = new StopFilter(
                Version.LUCENE_41, lowerCaseStage, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        SynonymFilter synonymStage = new SynonymFilter(stopStage, engine);

        return new TokenStreamComponents(tokenizer, synonymStage);
    }

    /**
     * Demo entry point: prints the tokens (with positions) produced for a sample sentence.
     */
    public static void main(String[] args) throws IOException {
        Analyzer synonymAnalyzer = new SynonymAnalyzer(new TestSynonymEngine());
        String sample = "The quick brown fox jumps over the lazy dog";
        AnalyzerUtils.displayTokensWithPositions(synonymAnalyzer, sample);
    }
}
/**
 * SynonymEngine for tests.
 * Returns only a small set of hard-coded synonyms.
 */
class TestSynonymEngine implements SynonymEngine {

    /** Hard-coded term -> synonyms table; populated once and never modified afterwards. */
    private static final Map<String, String[]> map = new HashMap<String, String[]>();

    static {
        map.put("quick", new String[]{"fast", "speedy"});
        map.put("jumps", new String[]{"leaps", "hops"});
        map.put("over", new String[]{"above"});
        map.put("lazy", new String[]{"apathetic", "sluggish"});
        map.put("dog", new String[]{"canine", "pooch"});
    }

    /**
     * Looks up the hard-coded synonyms of {@code s}.
     *
     * @param s term to look up
     * @return a fresh copy of the synonyms registered for {@code s}, or {@code null}
     *         when the term has no entry
     */
    @Override
    public String[] getSynonyms(String s) throws IOException {
        String[] synonyms = map.get(s);
        // Hand out a copy so callers cannot mutate the shared static table.
        return synonyms == null ? null : synonyms.clone();
    }
}
/**
* 以下は、Apache License, Version 2.0 のもとで頒布されているコードに一部改変を加えたものです。
* http://www.apache.org/licenses/LICENSE-2.0.txt
*/
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
public class SynonymAnalyzerTest {
private RAMDirectory directory;
private IndexReader reader;
private static SynonymAnalyzer analyzer =
new SynonymAnalyzer(new TestSynonymEngine());
@Before
public void setUp() throws Exception {
directory = new RAMDirectory();
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_41, analyzer);
IndexWriter writer = new IndexWriter(directory, config);
Document doc = new Document();
doc.add(new TextField("content", "The quick brown fox jumps over the lazy dog", Store.YES));
writer.addDocument(doc);
writer.close();
reader = DirectoryReader.open(directory);
}
@After
public void tearDown() throws IOException {
reader.close();
}
@Test
public void testSearchByAPI() throws Exception {
IndexSearcher searcher = new IndexSearcher(reader);
// "hops" にマッチするドキュメントを検索
TermQuery termQuery = new TermQuery(new Term("content", "hops"));
assertEquals(1, searcher.search(termQuery, 10).totalHits);
// フレーズ "fox hops" にマッチするドキュメントを検索
PhraseQuery phraseQuery = new PhraseQuery();
phraseQuery.add(new Term("content", "fox"));
phraseQuery.add(new Term("content", "hops"));
assertEquals(1, searcher.search(phraseQuery, 10).totalHits);
}
}
/**
* 以下は、Apache License, Version 2.0 のもとで頒布されているコードに一部改変を加えたものです。
* http://www.apache.org/licenses/LICENSE-2.0.txt
*/
import java.io.IOException;
/**
 * Lookup service that supplies the synonyms of a term.
 */
public interface SynonymEngine {
/**
 * Returns the group of synonyms corresponding to the given string.
 *
 * @param s term to look up
 * @return the synonyms of {@code s}; may be {@code null} when there are none
 *         (the map-backed test engine in this file returns {@code null} for unknown
 *         terms, and SynonymFilter treats {@code null} as "no synonyms")
 * @throws IOException declared so implementations may perform I/O during lookup
 */
String[] getSynonyms(String s) throws IOException;
}
/**
* 以下は、Apache License, Version 2.0 のもとで頒布されているコードに一部改変を加えたものです。
* http://www.apache.org/licenses/LICENSE-2.0.txt
*/
import java.io.IOException;
import java.util.Stack;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;
/**
 * Token filter that injects synonyms into the stream.
 *
 * <p>Each token from the wrapped stream is passed through unchanged. When the
 * {@link SynonymEngine} knows synonyms for that token, they are emitted right after
 * it with a position increment of 0, i.e. stacked on the same position as the
 * original token.
 */
public class SynonymFilter extends TokenFilter {

    public static final String TOKEN_TYPE_SYNONYM = "SYNONYM";

    /** Synonyms of the most recently read token that still have to be emitted. */
    private final Stack<String> synonymStack = new Stack<String>();

    private final SynonymEngine engine;

    /** Attribute state of the original token; restored before each synonym is emitted. */
    private AttributeSource.State current;

    // The attributes used are CharTermAttribute and PositionIncrementAttribute.
    private final CharTermAttribute termAtt;
    private final PositionIncrementAttribute posIncrAtt;

    /**
     * @param input  token stream to decorate
     * @param engine synonym lookup consulted for every token read from {@code input}
     */
    public SynonymFilter(TokenStream input, SynonymEngine engine) {
        super(input);
        this.engine = engine;
        // Register the attributes on the token stream.
        this.termAtt = addAttribute(CharTermAttribute.class);
        this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    }

    /**
     * Emits the next pending synonym if there is one, otherwise advances the wrapped stream.
     *
     * <p>Declared {@code final} because Lucene asserts that a TokenStream implementation
     * class, or at least its {@code incrementToken()}, is final.
     *
     * @return {@code false} once the wrapped stream is exhausted and no synonyms remain
     */
    @Override
    public final boolean incrementToken() throws IOException {
        // Drain pending synonyms before reading another token.
        if (!synonymStack.isEmpty()) {
            String syn = synonymStack.pop();
            // Start from the original token's attributes, then swap in the synonym text.
            restoreState(current);
            termAtt.setEmpty().append(syn);
            // Position increment 0 places the synonym on the original token's position.
            posIncrAtt.setPositionIncrement(0);
            return true;
        }

        // Read the next token from the wrapped stream.
        if (!input.incrementToken()) {
            return false;
        }

        // Queue its synonyms; the state is only captured when there is something to emit.
        if (addAliasesToStack()) {
            current = captureState();
        }
        return true;
    }

    /**
     * Discards pending synonyms and captured state so that a reused filter does not
     * leak tokens from a previously (possibly partially) consumed stream.
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        synonymStack.clear();
        current = null;
    }

    /**
     * Pushes every synonym of the current term onto {@link #synonymStack}.
     *
     * @return {@code true} if at least one synonym was pushed
     */
    private boolean addAliasesToStack() throws IOException {
        String[] synonyms = engine.getSynonyms(termAtt.toString());
        if (synonyms == null || synonyms.length == 0) {
            return false;
        }
        for (String synonym : synonyms) {
            synonymStack.push(synonym);
        }
        return true;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment