Skip to content

Instantly share code, notes, and snippets.

@mocobeta
Last active November 18, 2017 00:39
Show Gist options
  • Save mocobeta/5213521 to your computer and use it in GitHub Desktop.
Save mocobeta/5213521 to your computer and use it in GitHub Desktop.
Lucene in Action Chapter 5 : Sorting Example
package example;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Properties;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;
/**
 * Builds a Lucene index from a directory tree of book description files.
 *
 * <p>Every non-directory file found under {@code baseDir} is loaded as a
 * {@link Properties} file and converted into one Lucene {@link Document}.
 * The path of the file's parent directory, with the {@code baseDir} prefix
 * cut off and separators normalised to {@code '/'}, becomes the document's
 * {@code category} field.
 */
public class BookDataIndexer {

    /** Lucene directory the index is written to. */
    private final Directory directory;

    /** Root of the directory tree that holds the book property files. */
    private final String baseDir;

    /**
     * @param directory Lucene directory that receives the index
     * @param baseDir   path of the root directory containing the book data
     */
    public BookDataIndexer(Directory directory, String baseDir) {
        this.directory = directory;
        this.baseDir = baseDir;
    }

    /**
     * Walks {@code baseDir} recursively and adds one document per file.
     *
     * <p>The writer is closed (which commits the additions) only when the
     * whole tree was indexed; if anything fails the writer is rolled back so
     * that it is still released but no partial index is committed.
     *
     * @throws IOException if a file cannot be read, a directory cannot be
     *                     listed, or the index cannot be written
     */
    public void indexBookData() throws IOException {
        File root = new File(baseDir);
        IndexWriter writer = getWriter();
        boolean success = false;
        try {
            indexBookData(writer, root);
            success = true;
        } finally {
            if (success) {
                writer.close();
            } else {
                // Releases the writer without committing a half-built index.
                writer.rollback();
            }
        }
    }

    /**
     * Recursively indexes {@code file}: directories are descended into,
     * anything else is treated as a book property file.
     *
     * @param writer open writer the documents are added to
     * @param file   directory or property file to process
     * @throws IOException if a directory cannot be listed or a file cannot
     *                     be read or indexed
     */
    private void indexBookData(IndexWriter writer, File file) throws IOException {
        if (file.isDirectory()) {
            // listFiles() returns null (not an empty array) on an I/O error.
            File[] children = file.listFiles();
            if (children == null) {
                throw new IOException("Cannot list directory: " + file);
            }
            for (File child : children) {
                indexBookData(writer, child);
            }
        } else {
            // Category = parent path minus the baseDir prefix, '/'-separated.
            // NOTE(review): assumes the parent path starts with baseDir
            // exactly as it was passed to the constructor.
            String category = file.getParent().substring(baseDir.length())
                    .replace(File.separatorChar, '/');
            Document doc = getDocument(category, file);
            writer.addDocument(doc);
        }
    }

    /**
     * Creates an {@link IndexWriter} on {@link #directory} that uses a
     * {@link StandardAnalyzer} for Lucene 4.1.
     *
     * @return a new writer; the caller is responsible for closing it
     * @throws IOException if the writer cannot be opened
     */
    private IndexWriter getWriter() throws IOException {
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_41,
                new StandardAnalyzer(Version.LUCENE_41));
        return new IndexWriter(directory, config);
    }

    /**
     * Loads one book property file and maps it to a Lucene document.
     *
     * <p>Properties read: {@code isbn}, {@code title}, {@code author}
     * (comma-separated list), {@code url}, {@code subject} and
     * {@code pubmonth} (parsed as an integer).
     *
     * @param category category path to store in the {@code category} field
     * @param file     property file describing a single book
     * @return the document to add to the index
     * @throws IOException if the file cannot be read
     */
    private Document getDocument(String category, File file) throws IOException {
        Properties props = new Properties();
        // Close the stream even when load() fails; the original leaked it.
        FileInputStream in = new FileInputStream(file);
        try {
            props.load(in);
        } finally {
            in.close();
        }

        String isbn = props.getProperty("isbn");
        String title = props.getProperty("title");
        String author = props.getProperty("author");
        String url = props.getProperty("url");
        String subject = props.getProperty("subject");
        String pubmonth = props.getProperty("pubmonth");

        Document doc = new Document();
        doc.add(new StringField("isbn", isbn, Store.YES));
        doc.add(new StringField("category", category, Store.YES));
        doc.add(new StringField("title", title, Store.YES));
        // Lower-cased, unstored copy of the title.
        doc.add(new StringField("title2", title.toLowerCase(), Store.NO));
        // One "author" field instance per comma-separated author.
        for (String val : author.split(",")) {
            doc.add(new StringField("author", val, Store.YES));
        }
        doc.add(new StoredField("url", url));
        doc.add(new TextField("subject", subject, Store.NO));
        doc.add(new IntField("pubmonth", Integer.parseInt(pubmonth), Store.YES));
        // Catch-all field combining title, subject and authors.
        String contents = title + " " + subject + " " + author;
        doc.add(new TextField("contents", contents, Store.NO));
        return doc;
    }
}
1. スコア順でソート の結果
Results for: *:* (contents:java contents:action) sorted by <score>
Title pubmonth id score
Java Development with Ant 200208 5 0.65223
/technology/computers/programming
Lucene in Action 200406 7 0.46424
/technology/computers/programming
Tapestry in Action 200403 9 0.46424
/technology/computers/programming
JUnit in Action 200310 6 0.42828
/technology/computers/programming
A Modern Art of Education 198106 0 0.14221
/education/pedagogy
Imperial Secrets of Health... 199401 1 0.14221
/health/alternative/chinese
Tao Te Ching 道德經 198810 2 0.14221
/philosophy/eastern
Gödel, Escher, Bach: an Et... 197903 3 0.14221
/technology/computers/ai
Mindstorms 198001 4 0.14221
/technology/computers/programming/education
Extreme Programming Explained 199910 8 0.14221
/technology/computers/programming/methodology
The Pragmatic Programmer 199910 10 0.14221
/technology/computers/programming
2. インデックス順でソート の結果
Results for: *:* (contents:java contents:action) sorted by <doc>
Title pubmonth id score
A Modern Art of Education 198106 0 0.14221
/education/pedagogy
Imperial Secrets of Health... 199401 1 0.14221
/health/alternative/chinese
Tao Te Ching 道德經 198810 2 0.14221
/philosophy/eastern
Gödel, Escher, Bach: an Et... 197903 3 0.14221
/technology/computers/ai
Mindstorms 198001 4 0.14221
/technology/computers/programming/education
Java Development with Ant 200208 5 0.65223
/technology/computers/programming
JUnit in Action 200310 6 0.42828
/technology/computers/programming
Lucene in Action 200406 7 0.46424
/technology/computers/programming
Extreme Programming Explained 199910 8 0.14221
/technology/computers/programming/methodology
Tapestry in Action 200403 9 0.46424
/technology/computers/programming
The Pragmatic Programmer 199910 10 0.14221
/technology/computers/programming
3. 'category'フィールド(文字列)の昇順でソート の結果
Results for: *:* (contents:java contents:action) sorted by <string: "category">
Title pubmonth id score
A Modern Art of Education 198106 0 0.14221
/education/pedagogy
Imperial Secrets of Health... 199401 1 0.14221
/health/alternative/chinese
Tao Te Ching 道德經 198810 2 0.14221
/philosophy/eastern
Gödel, Escher, Bach: an Et... 197903 3 0.14221
/technology/computers/ai
Java Development with Ant 200208 5 0.65223
/technology/computers/programming
JUnit in Action 200310 6 0.42828
/technology/computers/programming
Lucene in Action 200406 7 0.46424
/technology/computers/programming
Tapestry in Action 200403 9 0.46424
/technology/computers/programming
The Pragmatic Programmer 199910 10 0.14221
/technology/computers/programming
Mindstorms 198001 4 0.14221
/technology/computers/programming/education
Extreme Programming Explained 199910 8 0.14221
/technology/computers/programming/methodology
4. 'pubmonth'フィールド(整数)の降順でソート の結果
Results for: *:* (contents:java contents:action) sorted by <int: "pubmonth">!
Title pubmonth id score
Lucene in Action 200406 7 0.46424
/technology/computers/programming
Tapestry in Action 200403 9 0.46424
/technology/computers/programming
JUnit in Action 200310 6 0.42828
/technology/computers/programming
Java Development with Ant 200208 5 0.65223
/technology/computers/programming
Extreme Programming Explained 199910 8 0.14221
/technology/computers/programming/methodology
The Pragmatic Programmer 199910 10 0.14221
/technology/computers/programming
Imperial Secrets of Health... 199401 1 0.14221
/health/alternative/chinese
Tao Te Ching 道德經 198810 2 0.14221
/philosophy/eastern
A Modern Art of Education 198106 0 0.14221
/education/pedagogy
Mindstorms 198001 4 0.14221
/technology/computers/programming/education
Gödel, Escher, Bach: an Et... 197903 3 0.14221
/technology/computers/ai
5. 第2, 第3, ... ソートを与える例 の結果
Results for: *:* (contents:java contents:action) sorted by <string: "category">,<score>,<int: "pubmonth">!
Title pubmonth id score
A Modern Art of Education 198106 0 0.14221
/education/pedagogy
Imperial Secrets of Health... 199401 1 0.14221
/health/alternative/chinese
Tao Te Ching 道德經 198810 2 0.14221
/philosophy/eastern
Gödel, Escher, Bach: an Et... 197903 3 0.14221
/technology/computers/ai
Java Development with Ant 200208 5 0.65223
/technology/computers/programming
Lucene in Action 200406 7 0.46424
/technology/computers/programming
Tapestry in Action 200403 9 0.46424
/technology/computers/programming
JUnit in Action 200310 6 0.42828
/technology/computers/programming
The Pragmatic Programmer 199910 10 0.14221
/technology/computers/programming
Mindstorms 198001 4 0.14221
/technology/computers/programming/education
Extreme Programming Explained 199910 8 0.14221
/technology/computers/programming/methodology
6. 第2, 第3, ... ソートを配列で与える例 の結果
Results for: *:* (contents:java contents:action) sorted by <score>,<string: "category">
Title pubmonth id score
Java Development with Ant 200208 5 0.65223
/technology/computers/programming
Lucene in Action 200406 7 0.46424
/technology/computers/programming
Tapestry in Action 200403 9 0.46424
/technology/computers/programming
JUnit in Action 200310 6 0.42828
/technology/computers/programming
A Modern Art of Education 198106 0 0.14221
/education/pedagogy
Imperial Secrets of Health... 199401 1 0.14221
/health/alternative/chinese
Tao Te Ching 道德經 198810 2 0.14221
/philosophy/eastern
Gödel, Escher, Bach: an Et... 197903 3 0.14221
/technology/computers/ai
The Pragmatic Programmer 199910 10 0.14221
/technology/computers/programming
Mindstorms 198001 4 0.14221
/technology/computers/programming/education
Extreme Programming Explained 199910 8 0.14221
/technology/computers/programming/methodology
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import com.ibm.icu.text.DecimalFormat;
import example.BookDataIndexer;
/**
 * Demonstrates sorting of Lucene search results.
 *
 * <p>The constructor indexes the book data found under {@code "bookdata"}
 * into an in-memory {@link RAMDirectory}; {@link #main(String[])} then runs
 * the same query with six different {@link Sort} specifications and prints
 * each result list to standard output.
 */
public class SortTest {

    /** In-memory index populated by the constructor. */
    private final Directory directory;

    /**
     * Builds the in-memory index from the {@code "bookdata"} directory.
     *
     * @throws IOException if the book data cannot be read or indexed
     */
    public SortTest() throws IOException {
        directory = new RAMDirectory();
        new BookDataIndexer(directory, "bookdata").indexBookData();
    }

    /**
     * Runs {@code query} against the index, ordered by {@code sort}, and
     * prints up to 20 hits (title, pubmonth, doc id, score, category).
     *
     * @param query query to execute
     * @param sort  sort specification applied to the hits
     * @throws IOException if the index cannot be opened or read
     */
    public void displayResults(Query query, Sort sort) throws IOException {
        IndexReader r = DirectoryReader.open(directory);
        try {
            IndexSearcher searcher = new IndexSearcher(r);

            // search(query, filter, n, sort, doDocScores, doMaxScore):
            //   query       - the query to run
            //   filter      - null, i.e. no filtering
            //   n           - return at most 20 hits
            //   sort        - the sort specification
            //   doDocScores - true: compute the score of every returned hit
            //                 (needed because the score is printed below)
            //   doMaxScore  - false: do not compute the maximum score
            TopDocs results = searcher.search(query, null, 20, sort, true, false);

            System.out.println("Results for: " +
                    query.toString() + " sorted by " + sort);
            System.out.println(
                    StringUtils.rightPad("Title", 30) +
                    StringUtils.rightPad("pubmonth", 10) +
                    StringUtils.center("id", 4) +
                    StringUtils.center("score", 15));

            DecimalFormat scoreFormatter = new DecimalFormat("0.#####");
            for (ScoreDoc sd : results.scoreDocs) {
                int docID = sd.doc;
                float score = sd.score;
                Document doc = searcher.doc(docID);
                System.out.println(
                        StringUtils.rightPad(
                                StringUtils.abbreviate(doc.get("title"), 29), 30) +
                        StringUtils.rightPad(doc.get("pubmonth"), 10) +
                        StringUtils.center("" + docID, 4) +
                        StringUtils.leftPad(
                                scoreFormatter.format(score), 12));
                System.out.println(" " + doc.get("category"));
                // Uncomment to print the scoring explanation for each hit:
                // System.out.print(searcher.explain(query, docID));
            }
        } finally {
            // Release the reader even if searching or printing failed.
            r.close();
        }
    }

    /**
     * Runs the query "all books, boosted by 'java' or 'action' in contents"
     * with six different sort orders and prints each result list.
     *
     * @param args unused
     * @throws Exception if indexing, query parsing or searching fails
     */
    public static void main(String[] args) throws Exception {
        Query allBooks = new MatchAllDocsQuery();
        QueryParser parser = new QueryParser(Version.LUCENE_41, "contents",
                new StandardAnalyzer(Version.LUCENE_41));
        // Both clauses are optional (SHOULD): every book matches through
        // allBooks, and the parsed clause contributes additional score.
        BooleanQuery query = new BooleanQuery();
        query.add(allBooks, Occur.SHOULD);
        query.add(parser.parse("java OR action"), Occur.SHOULD);

        SortTest test = new SortTest();

        // 1. Sort by relevance score.
        test.displayResults(query, Sort.RELEVANCE);

        // 2. Sort by index order (document id).
        test.displayResults(query, Sort.INDEXORDER);

        // 3. Sort by the 'category' field (string), ascending.
        test.displayResults(query,
                new Sort(new SortField("category", SortField.Type.STRING)));

        // 4. Sort by the 'pubmonth' field (int), descending.
        test.displayResults(query,
                new Sort(new SortField("pubmonth", SortField.Type.INT, true)));

        // 5. Secondary, tertiary, ... sort keys can be given as well.
        test.displayResults(query,
                new Sort(new SortField("category", SortField.Type.STRING),
                        SortField.FIELD_SCORE,
                        new SortField("pubmonth", SortField.Type.INT, true)));

        // 6. The same, with the sort keys passed as an array.
        test.displayResults(query,
                new Sort(new SortField[]{
                        SortField.FIELD_SCORE,
                        new SortField("category", SortField.Type.STRING)}));
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment