Skip to content

Instantly share code, notes, and snippets.

@asif31iqbal
Forked from mocobeta/QueryTest.java
Created June 10, 2016 21:57
Show Gist options
  • Save asif31iqbal/31bae0a2ff9ebc36e04085d5692f966e to your computer and use it in GitHub Desktop.
Save asif31iqbal/31bae0a2ff9ebc36e04085d5692f966e to your computer and use it in GitHub Desktop.
Lucene 4.1 querying sample
import static org.junit.Assert.*;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Properties;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.junit.Before;
import org.junit.Test;
public class QueryTest {
@Test
public void testTermQuery() throws IOException {
Directory dir = new RAMDirectory();
new BookDataIndexer(dir, "bookdata").indexBookData();
IndexReader reader = DirectoryReader.open(dir);
IndexSearcher searcher = new IndexSearcher(reader);
// isbn が "0854402624" に一致するドキュメントを検索
Term t = new Term("isbn", "0854402624");
Query query = new TermQuery(t);
TopDocs docs = searcher.search(query, 10);
assertEquals(1, docs.totalHits);
}
@Test
public void testTermRangeQuery() throws IOException {
Directory dir = new RAMDirectory();
new BookDataIndexer(dir, "bookdata").indexBookData();
IndexReader reader = DirectoryReader.open(dir);
IndexSearcher searcher = new IndexSearcher(reader);
// title2 が(辞書順で) "d" より大きく、"j" より小さいドキュメントを検索
// 上限値と下限値は範囲に含まれる
BytesRef lower = new BytesRef("d");
BytesRef upper = new BytesRef("j");
TermRangeQuery query = new TermRangeQuery("title2", lower, upper, true, true);
TopDocs docs = searcher.search(query, 10);
assertEquals(3, docs.totalHits);
}
@Test
public void testNumericRangeQuery() throws IOException {
Directory dir = new RAMDirectory();
new BookDataIndexer(dir, "bookdata").indexBookData();
IndexReader reader = DirectoryReader.open(dir);
IndexSearcher searcher = new IndexSearcher(reader);
// pubmonth が 200406 より大きく、 200409 より小さいドキュメントを検索
// 上限値と下限値は範囲に含まれる
NumericRangeQuery<Integer> query =
NumericRangeQuery.newIntRange("pubmonth", 200406, 200409, true, true);
TopDocs docs = searcher.search(query, 10);
assertEquals(1, docs.totalHits);
}
@Test
public void testPrefixQueryTest() throws IOException {
Directory dir = new RAMDirectory();
new BookDataIndexer(dir, "bookdata").indexBookData();
IndexReader reader = DirectoryReader.open(dir);
IndexSearcher searcher = new IndexSearcher(reader);
Term t = new Term("category", "/technology/computers/programming");
// category が "/technology/computers/programming" から始まるドキュメントを検索
PrefixQuery query = new PrefixQuery(t);
TopDocs docs = searcher.search(query, 10);
int programmingAndBelow = docs.totalHits;
// category が "/technology/computers/programming" に一致するドキュメントを検索
docs = searcher.search(new TermQuery(t), 10);
int justProgramming = docs.totalHits;
// ヒット数が違う
assertTrue(programmingAndBelow > justProgramming);
}
@Test
public void testBooleanQueryAnd() throws IOException {
Directory dir = new RAMDirectory();
new BookDataIndexer(dir, "bookdata").indexBookData();
IndexReader reader = DirectoryReader.open(dir);
IndexSearcher searcher = new IndexSearcher(reader);
// AND 検索
TermQuery searchingBooks =
new TermQuery(new Term("subject", "search"));
Query books2004 =
NumericRangeQuery.newIntRange("pubmonth", 200401, 200412, true, true);
BooleanQuery query = new BooleanQuery();
// Occur.MUST を指定
query.add(searchingBooks, Occur.MUST);
query.add(books2004, Occur.MUST);
TopDocs docs = searcher.search(query, 10);
assertEquals(1, docs.totalHits);
assertTrue(hitsIncludeTitle(searcher, docs, "Lucene in Action"));
}
@Test
public void testBooleanQueryOr() throws IOException {
Directory dir = new RAMDirectory();
new BookDataIndexer(dir, "bookdata").indexBookData();
IndexReader reader = DirectoryReader.open(dir);
IndexSearcher searcher = new IndexSearcher(reader);
// OR 検索
TermQuery methodologyBooks =
new TermQuery(new Term("category", "/technology/computers/programming/methodology"));
TermQuery easternPhilosophyBooks =
new TermQuery(new Term("category", "/philosophy/eastern"));
BooleanQuery query = new BooleanQuery();
// Occur.SHOULD を指定
query.add(methodologyBooks, Occur.SHOULD);
query.add(easternPhilosophyBooks, Occur.SHOULD);
TopDocs docs = searcher.search(query, 10);
assertTrue(hitsIncludeTitle(searcher, docs, "Extreme Programming Explained"));
assertTrue(hitsIncludeTitle(searcher, docs, "Tao Te Ching 道德經"));
}
private boolean hitsIncludeTitle(IndexSearcher searcher, TopDocs hits, String title)
throws IOException {
for (ScoreDoc match : hits.scoreDocs) {
Document doc = searcher.doc(match.doc);
if (title.equals(doc.get("title"))) {
return true;
}
}
return false;
}
@Test
public void testPhraseQuery() throws IOException {
Directory dir = new RAMDirectory();
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_41,
new WhitespaceAnalyzer(Version.LUCENE_41));
IndexWriter writer = new IndexWriter(dir, config);
Document doc = new Document();
doc.add(new TextField("field",
"the quick brown fox jumped over the lazy dog",
Store.YES));
writer.addDocument(doc);
writer.close();
IndexReader reader = DirectoryReader.open(dir);
IndexSearcher searcher = new IndexSearcher(reader);
// "quick" のすぐ後に "fox" が出現するドキュメントを検索
assertFalse(matched(searcher, new String[]{"quick", "fox"}, 0));
// "quick" と "fox" が距離1語以内で出現するドキュメントを検索
assertTrue(matched(searcher, new String[]{"quick", "fox"}, 1));
// "quick" と "jumped" と "lazy" が距離3語以内で出現するドキュメントを検索
assertFalse(matched(searcher, new String[]{"quick", "jumped", "lazy"}, 3));
// "quick" と "jumped" と "lazy" が距離4語以内で出現するドキュメントを検索
assertTrue(matched(searcher, new String[]{"quick", "jumped", "lazy"}, 4));
}
private boolean matched(IndexSearcher searcher, String[] phrase, int slop) throws IOException {
PhraseQuery query = new PhraseQuery();
query.setSlop(slop);
for (String word : phrase) {
query.add(new Term("field", word));
}
TopDocs docs = searcher.search(query, 10);
return docs.totalHits > 0;
}
@Test
public void testWildcardQuery() throws IOException {
Directory dir = new RAMDirectory();
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_41,
new WhitespaceAnalyzer(Version.LUCENE_41));
IndexWriter writer = new IndexWriter(dir, config);
String[] fields = new String[]{"wild", "child", "mild", "mildew"};
for (String field : fields) {
Document doc = new Document();
doc.add(new TextField("contents", field, Store.YES));
writer.addDocument(doc);
}
writer.close();
IndexReader reader = DirectoryReader.open(dir);
IndexSearcher searcher = new IndexSearcher(reader);
// パターン "?ild*" にマッチするドキュメントを検索
WildcardQuery query = new WildcardQuery(new Term("contents", "?ild*"));
TopDocs docs = searcher.search(query, 10);
assertEquals(3, docs.totalHits);
assertEquals(docs.scoreDocs[0].score, docs.scoreDocs[1].score, 0.0);
assertEquals(docs.scoreDocs[1].score, docs.scoreDocs[2].score, 0.0);
}
@Test
public void testFuzzyQuery() throws IOException {
Directory dir = new RAMDirectory();
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_41,
new WhitespaceAnalyzer(Version.LUCENE_41));
IndexWriter writer = new IndexWriter(dir, config);
String[] fields = new String[]{"fuzzy", "wuzzy"};
for (String field : fields) {
Document doc = new Document();
doc.add(new TextField("contents", field, Store.YES));
writer.addDocument(doc);
}
writer.close();
IndexReader reader = DirectoryReader.open(dir);
IndexSearcher searcher = new IndexSearcher(reader);
// "wuzza" と編集距離が近いドキュメントを検索
FuzzyQuery query = new FuzzyQuery(new Term("contents", "wuzza"));
TopDocs docs = searcher.search(query, 10);
assertEquals(2, docs.totalHits);
assertTrue(docs.scoreDocs[0].score != docs.scoreDocs[1].score);
// "wuzzy" がより "wuzza" に近いのでスコアが高い
Document doc = searcher.doc(docs.scoreDocs[0].doc);
assertEquals("wuzzy", doc.get("contents"));
}
}
// Index-building class used by the tests.
// The data can be downloaded from the "Lucene in Action" support site:
// http://www.manning.com/hatcher3/
/**
 * Builds a Lucene index from a directory tree of book description files.
 *
 * <p>Every regular file below the base directory is loaded as a
 * {@link Properties} file and turned into one document. The document's
 * {@code category} is the file's parent path relative to the base directory,
 * with {@code '/'} as the separator.
 */
class BookDataIndexer {

    /** Property keys every book file must define. */
    private static final String[] REQUIRED_KEYS =
            {"isbn", "title", "author", "url", "subject", "pubmonth"};

    private final Directory directory;
    private final String baseDir;

    /**
     * @param directory Lucene directory the index is written to
     * @param baseDir   path of the root directory containing the book files
     */
    public BookDataIndexer(Directory directory, String baseDir) {
        this.directory = directory;
        this.baseDir = baseDir;
    }

    /**
     * Indexes every book file below the base directory and closes the writer.
     *
     * @throws IOException if the base directory is not a directory, a
     *                     directory cannot be listed, a file cannot be read or
     *                     lacks a required property, or writing the index fails
     */
    public void indexBookData() throws IOException {
        File root = new File(baseDir);
        // Fail with a clear message instead of a NullPointerException further down.
        if (!root.isDirectory()) {
            throw new IOException("Book data directory not found: " + root.getAbsolutePath());
        }
        IndexWriter writer = getWriter();
        try {
            indexBookData(writer, root);
        } finally {
            // Close the writer even when indexing fails so it is not leaked.
            writer.close();
        }
    }

    /**
     * Recursively walks {@code file}; directories are descended into and
     * every other file is indexed as one document.
     */
    private void indexBookData(IndexWriter writer, File file) throws IOException {
        if (file.isDirectory()) {
            File[] children = file.listFiles();
            // listFiles() returns null when the directory cannot be read.
            if (children == null) {
                throw new IOException("Unable to list directory: " + file);
            }
            for (File child : children) {
                indexBookData(writer, child);
            }
            return;
        }

        // Measure the prefix on the normalized path so that a trailing
        // separator in baseDir does not shift the category string.
        int prefixLength = new File(baseDir).getPath().length();
        String category = file.getParent().substring(prefixLength)
                .replace(File.separatorChar, '/');
        writer.addDocument(getDocument(category, file));
    }

    /** Creates an index writer over the target directory using a standard analyzer. */
    private IndexWriter getWriter() throws IOException {
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_41,
                new StandardAnalyzer(Version.LUCENE_41));
        return new IndexWriter(directory, config);
    }

    /**
     * Loads one book properties file and converts it into a document.
     *
     * @param category category path derived from the file's location
     * @param file     properties file describing one book
     * @return the document to add to the index
     * @throws IOException if the file cannot be read or a required property is missing
     */
    private Document getDocument(String category, File file) throws IOException {
        Properties props = new Properties();
        FileInputStream in = new FileInputStream(file);
        try {
            props.load(in);
        } finally {
            // Properties.load does not close the stream it is given.
            in.close();
        }

        for (String key : REQUIRED_KEYS) {
            if (props.getProperty(key) == null) {
                throw new IOException("Missing property '" + key + "' in " + file);
            }
        }

        String isbn = props.getProperty("isbn");
        String title = props.getProperty("title");
        String author = props.getProperty("author");
        String url = props.getProperty("url");
        String subject = props.getProperty("subject");
        String pubmonth = props.getProperty("pubmonth");

        Document doc = new Document();
        doc.add(new StringField("isbn", isbn, Store.YES));
        doc.add(new StringField("category", category, Store.YES));
        doc.add(new StringField("title", title, Store.YES));
        // Lower-cased copy of the title, not stored; Locale.ROOT keeps the
        // result independent of the JVM's default locale.
        doc.add(new StringField("title2", title.toLowerCase(java.util.Locale.ROOT), Store.NO));
        // One "author" field per comma-separated author.
        for (String val : author.split(",")) {
            doc.add(new StringField("author", val, Store.YES));
        }
        doc.add(new StoredField("url", url));
        doc.add(new TextField("subject", subject, Store.NO));
        doc.add(new IntField("pubmonth", Integer.parseInt(pubmonth), Store.YES));
        // Catch-all field combining title, subject and author.
        String contents = title + " " + subject + " " + author;
        doc.add(new TextField("contents", contents, Store.NO));
        return doc;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment