-
-
Save asif31iqbal/31bae0a2ff9ebc36e04085d5692f966e to your computer and use it in GitHub Desktop.
Lucene 4.1 querying sample
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import static org.junit.Assert.*; | |
import java.io.File; | |
import java.io.FileInputStream; | |
import java.io.IOException; | |
import java.util.Properties; | |
import org.apache.lucene.analysis.core.WhitespaceAnalyzer; | |
import org.apache.lucene.analysis.standard.StandardAnalyzer; | |
import org.apache.lucene.document.Document; | |
import org.apache.lucene.document.IntField; | |
import org.apache.lucene.document.StoredField; | |
import org.apache.lucene.document.StringField; | |
import org.apache.lucene.document.Field.Store; | |
import org.apache.lucene.document.TextField; | |
import org.apache.lucene.index.DirectoryReader; | |
import org.apache.lucene.index.IndexReader; | |
import org.apache.lucene.index.IndexWriter; | |
import org.apache.lucene.index.IndexWriterConfig; | |
import org.apache.lucene.index.Term; | |
import org.apache.lucene.search.BooleanQuery; | |
import org.apache.lucene.search.FuzzyQuery; | |
import org.apache.lucene.search.IndexSearcher; | |
import org.apache.lucene.search.NumericRangeQuery; | |
import org.apache.lucene.search.PhraseQuery; | |
import org.apache.lucene.search.PrefixQuery; | |
import org.apache.lucene.search.Query; | |
import org.apache.lucene.search.ScoreDoc; | |
import org.apache.lucene.search.TermQuery; | |
import org.apache.lucene.search.TermRangeQuery; | |
import org.apache.lucene.search.TopDocs; | |
import org.apache.lucene.search.BooleanClause.Occur; | |
import org.apache.lucene.search.WildcardQuery; | |
import org.apache.lucene.store.Directory; | |
import org.apache.lucene.store.RAMDirectory; | |
import org.apache.lucene.util.BytesRef; | |
import org.apache.lucene.util.Version; | |
import org.junit.Before; | |
import org.junit.Test; | |
public class QueryTest { | |
@Test | |
public void testTermQuery() throws IOException { | |
Directory dir = new RAMDirectory(); | |
new BookDataIndexer(dir, "bookdata").indexBookData(); | |
IndexReader reader = DirectoryReader.open(dir); | |
IndexSearcher searcher = new IndexSearcher(reader); | |
// isbn が "0854402624" に一致するドキュメントを検索 | |
Term t = new Term("isbn", "0854402624"); | |
Query query = new TermQuery(t); | |
TopDocs docs = searcher.search(query, 10); | |
assertEquals(1, docs.totalHits); | |
} | |
@Test | |
public void testTermRangeQuery() throws IOException { | |
Directory dir = new RAMDirectory(); | |
new BookDataIndexer(dir, "bookdata").indexBookData(); | |
IndexReader reader = DirectoryReader.open(dir); | |
IndexSearcher searcher = new IndexSearcher(reader); | |
// title2 が(辞書順で) "d" より大きく、"j" より小さいドキュメントを検索 | |
// 上限値と下限値は範囲に含まれる | |
BytesRef lower = new BytesRef("d"); | |
BytesRef upper = new BytesRef("j"); | |
TermRangeQuery query = new TermRangeQuery("title2", lower, upper, true, true); | |
TopDocs docs = searcher.search(query, 10); | |
assertEquals(3, docs.totalHits); | |
} | |
@Test | |
public void testNumericRangeQuery() throws IOException { | |
Directory dir = new RAMDirectory(); | |
new BookDataIndexer(dir, "bookdata").indexBookData(); | |
IndexReader reader = DirectoryReader.open(dir); | |
IndexSearcher searcher = new IndexSearcher(reader); | |
// pubmonth が 200406 より大きく、 200409 より小さいドキュメントを検索 | |
// 上限値と下限値は範囲に含まれる | |
NumericRangeQuery<Integer> query = | |
NumericRangeQuery.newIntRange("pubmonth", 200406, 200409, true, true); | |
TopDocs docs = searcher.search(query, 10); | |
assertEquals(1, docs.totalHits); | |
} | |
@Test | |
public void testPrefixQueryTest() throws IOException { | |
Directory dir = new RAMDirectory(); | |
new BookDataIndexer(dir, "bookdata").indexBookData(); | |
IndexReader reader = DirectoryReader.open(dir); | |
IndexSearcher searcher = new IndexSearcher(reader); | |
Term t = new Term("category", "/technology/computers/programming"); | |
// category が "/technology/computers/programming" から始まるドキュメントを検索 | |
PrefixQuery query = new PrefixQuery(t); | |
TopDocs docs = searcher.search(query, 10); | |
int programmingAndBelow = docs.totalHits; | |
// category が "/technology/computers/programming" に一致するドキュメントを検索 | |
docs = searcher.search(new TermQuery(t), 10); | |
int justProgramming = docs.totalHits; | |
// ヒット数が違う | |
assertTrue(programmingAndBelow > justProgramming); | |
} | |
@Test | |
public void testBooleanQueryAnd() throws IOException { | |
Directory dir = new RAMDirectory(); | |
new BookDataIndexer(dir, "bookdata").indexBookData(); | |
IndexReader reader = DirectoryReader.open(dir); | |
IndexSearcher searcher = new IndexSearcher(reader); | |
// AND 検索 | |
TermQuery searchingBooks = | |
new TermQuery(new Term("subject", "search")); | |
Query books2004 = | |
NumericRangeQuery.newIntRange("pubmonth", 200401, 200412, true, true); | |
BooleanQuery query = new BooleanQuery(); | |
// Occur.MUST を指定 | |
query.add(searchingBooks, Occur.MUST); | |
query.add(books2004, Occur.MUST); | |
TopDocs docs = searcher.search(query, 10); | |
assertEquals(1, docs.totalHits); | |
assertTrue(hitsIncludeTitle(searcher, docs, "Lucene in Action")); | |
} | |
@Test | |
public void testBooleanQueryOr() throws IOException { | |
Directory dir = new RAMDirectory(); | |
new BookDataIndexer(dir, "bookdata").indexBookData(); | |
IndexReader reader = DirectoryReader.open(dir); | |
IndexSearcher searcher = new IndexSearcher(reader); | |
// OR 検索 | |
TermQuery methodologyBooks = | |
new TermQuery(new Term("category", "/technology/computers/programming/methodology")); | |
TermQuery easternPhilosophyBooks = | |
new TermQuery(new Term("category", "/philosophy/eastern")); | |
BooleanQuery query = new BooleanQuery(); | |
// Occur.SHOULD を指定 | |
query.add(methodologyBooks, Occur.SHOULD); | |
query.add(easternPhilosophyBooks, Occur.SHOULD); | |
TopDocs docs = searcher.search(query, 10); | |
assertTrue(hitsIncludeTitle(searcher, docs, "Extreme Programming Explained")); | |
assertTrue(hitsIncludeTitle(searcher, docs, "Tao Te Ching 道德經")); | |
} | |
private boolean hitsIncludeTitle(IndexSearcher searcher, TopDocs hits, String title) | |
throws IOException { | |
for (ScoreDoc match : hits.scoreDocs) { | |
Document doc = searcher.doc(match.doc); | |
if (title.equals(doc.get("title"))) { | |
return true; | |
} | |
} | |
return false; | |
} | |
@Test | |
public void testPhraseQuery() throws IOException { | |
Directory dir = new RAMDirectory(); | |
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_41, | |
new WhitespaceAnalyzer(Version.LUCENE_41)); | |
IndexWriter writer = new IndexWriter(dir, config); | |
Document doc = new Document(); | |
doc.add(new TextField("field", | |
"the quick brown fox jumped over the lazy dog", | |
Store.YES)); | |
writer.addDocument(doc); | |
writer.close(); | |
IndexReader reader = DirectoryReader.open(dir); | |
IndexSearcher searcher = new IndexSearcher(reader); | |
// "quick" のすぐ後に "fox" が出現するドキュメントを検索 | |
assertFalse(matched(searcher, new String[]{"quick", "fox"}, 0)); | |
// "quick" と "fox" が距離1語以内で出現するドキュメントを検索 | |
assertTrue(matched(searcher, new String[]{"quick", "fox"}, 1)); | |
// "quick" と "jumped" と "lazy" が距離3語以内で出現するドキュメントを検索 | |
assertFalse(matched(searcher, new String[]{"quick", "jumped", "lazy"}, 3)); | |
// "quick" と "jumped" と "lazy" が距離4語以内で出現するドキュメントを検索 | |
assertTrue(matched(searcher, new String[]{"quick", "jumped", "lazy"}, 4)); | |
} | |
private boolean matched(IndexSearcher searcher, String[] phrase, int slop) throws IOException { | |
PhraseQuery query = new PhraseQuery(); | |
query.setSlop(slop); | |
for (String word : phrase) { | |
query.add(new Term("field", word)); | |
} | |
TopDocs docs = searcher.search(query, 10); | |
return docs.totalHits > 0; | |
} | |
@Test | |
public void testWildcardQuery() throws IOException { | |
Directory dir = new RAMDirectory(); | |
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_41, | |
new WhitespaceAnalyzer(Version.LUCENE_41)); | |
IndexWriter writer = new IndexWriter(dir, config); | |
String[] fields = new String[]{"wild", "child", "mild", "mildew"}; | |
for (String field : fields) { | |
Document doc = new Document(); | |
doc.add(new TextField("contents", field, Store.YES)); | |
writer.addDocument(doc); | |
} | |
writer.close(); | |
IndexReader reader = DirectoryReader.open(dir); | |
IndexSearcher searcher = new IndexSearcher(reader); | |
// パターン "?ild*" にマッチするドキュメントを検索 | |
WildcardQuery query = new WildcardQuery(new Term("contents", "?ild*")); | |
TopDocs docs = searcher.search(query, 10); | |
assertEquals(3, docs.totalHits); | |
assertEquals(docs.scoreDocs[0].score, docs.scoreDocs[1].score, 0.0); | |
assertEquals(docs.scoreDocs[1].score, docs.scoreDocs[2].score, 0.0); | |
} | |
@Test | |
public void testFuzzyQuery() throws IOException { | |
Directory dir = new RAMDirectory(); | |
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_41, | |
new WhitespaceAnalyzer(Version.LUCENE_41)); | |
IndexWriter writer = new IndexWriter(dir, config); | |
String[] fields = new String[]{"fuzzy", "wuzzy"}; | |
for (String field : fields) { | |
Document doc = new Document(); | |
doc.add(new TextField("contents", field, Store.YES)); | |
writer.addDocument(doc); | |
} | |
writer.close(); | |
IndexReader reader = DirectoryReader.open(dir); | |
IndexSearcher searcher = new IndexSearcher(reader); | |
// "wuzza" と編集距離が近いドキュメントを検索 | |
FuzzyQuery query = new FuzzyQuery(new Term("contents", "wuzza")); | |
TopDocs docs = searcher.search(query, 10); | |
assertEquals(2, docs.totalHits); | |
assertTrue(docs.scoreDocs[0].score != docs.scoreDocs[1].score); | |
// "wuzzy" がより "wuzza" に近いのでスコアが高い | |
Document doc = searcher.doc(docs.scoreDocs[0].doc); | |
assertEquals("wuzzy", doc.get("contents")); | |
} | |
} | |
// Index-building helper class used by the tests.
// The data can be downloaded from the "Lucene in Action" support site:
// http://www.manning.com/hatcher3/
class BookDataIndexer { | |
private Directory directory; | |
private String baseDir; | |
public BookDataIndexer(Directory directory, String baseDir) { | |
this.directory = directory; | |
this.baseDir = baseDir; | |
} | |
public void indexBookData() throws IOException { | |
File d = new File(baseDir); | |
IndexWriter writer = getWriter(); | |
indexBookData(writer, d); | |
writer.close(); | |
} | |
private void indexBookData(IndexWriter writer, File file) throws IOException { | |
if (file.isDirectory()) { | |
for (File f : file.listFiles()) { | |
indexBookData(writer, f); | |
} | |
} else { | |
String category = file.getParent().substring(baseDir.length()) | |
.replace(File.separatorChar, '/'); | |
Document doc = getDocument(category, file); | |
writer.addDocument(doc); | |
} | |
} | |
private IndexWriter getWriter() throws IOException { | |
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_41, | |
new StandardAnalyzer(Version.LUCENE_41)); | |
IndexWriter writer = new IndexWriter(directory, config); | |
return writer; | |
} | |
private Document getDocument(String category, File file) throws IOException { | |
Properties props = new Properties(); | |
props.load(new FileInputStream(file)); | |
String isbn = props.getProperty("isbn"); | |
String title = props.getProperty("title"); | |
String author = props.getProperty("author"); | |
String url = props.getProperty("url"); | |
String subject = props.getProperty("subject"); | |
String pubmonth = props.getProperty("pubmonth"); | |
Document doc = new Document(); | |
doc.add(new StringField("isbn", isbn, Store.YES)); | |
doc.add(new StringField("category", category, Store.YES)); | |
doc.add(new StringField("title", title, Store.YES)); | |
doc.add(new StringField("title2", title.toLowerCase(), Store.NO)); | |
for (String val : author.split(",")) { | |
doc.add(new StringField("author", val, Store.YES)); | |
} | |
doc.add(new StoredField("url", url)); | |
doc.add(new TextField("subject", subject, Store.NO)); | |
doc.add(new IntField("pubmonth", Integer.parseInt(pubmonth), Store.YES)); | |
String contents = title + " " + subject + " " + author; | |
doc.add(new TextField("contents", contents, Store.NO)); | |
return doc; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment