Skip to content

Instantly share code, notes, and snippets.

@mocobeta
Last active August 29, 2015 14:06
Show Gist options
  • Save mocobeta/57a8f61250468180607d to your computer and use it in GitHub Desktop.
Save mocobeta/57a8f61250468180607d to your computer and use it in GitHub Desktop.
(Lucene) Highlighter のサンプル
package higlighter;
import java.io.File;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
import org.apache.lucene.search.vectorhighlight.FieldQuery;
import org.apache.lucene.search.vectorhighlight.FragListBuilder;
import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder;
import org.apache.lucene.search.vectorhighlight.SimpleFragListBuilder;
import org.apache.lucene.search.vectorhighlight.SimpleFragmentsBuilder;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Sample that searches the "text" field of an existing Lucene index and prints
 * highlighted fragments produced by {@link FastVectorHighlighter}.
 *
 * <p>FastVectorHighlighter reads term vectors, so the "text" field is expected to have
 * been indexed with term vectors including positions and offsets.
 */
public class FastVectorHighlighterSample {

    /** Directory (relative to the working directory) that holds the index to search. */
    private static final String INDEX_DIR = "index";

    /**
     * Runs a phrase query against the index and prints, for each hit, the chapter
     * number, the title and up to three highlighted fragments.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // try-with-resources closes the reader and the directory (in that order)
        // even when searching or highlighting throws.
        try (Directory dir = FSDirectory.open(new File(INDEX_DIR));
             IndexReader reader = DirectoryReader.open(dir)) {

            // Build the IndexSearcher.
            IndexSearcher searcher = new IndexSearcher(reader);

            // Search. Alternative queries to try:
            //   parser.parse("Neverland");
            //   parser.parse("\"Peter Pan\" OR \"Tinker Bell\"");
            Analyzer analyzer = new WhitespaceAnalyzer();
            QueryParser parser = new QueryParser("text", analyzer);
            Query query = parser.parse("\"Peter Pan\"");
            TopDocs hits = searcher.search(query, 10);

            // Build the FastVectorHighlighter with its default settings.
            FastVectorHighlighter highlighter = new FastVectorHighlighter();
            // For multicolored highlighting, replace the line above with:
            /*
            FragListBuilder fragListBuilder = new SimpleFragListBuilder();
            FragmentsBuilder fragmentsBuilder = new SimpleFragmentsBuilder(
                    BaseFragmentsBuilder.COLORED_PRE_TAGS,
                    BaseFragmentsBuilder.COLORED_POST_TAGS);
            FastVectorHighlighter highlighter = new FastVectorHighlighter(
                    true, true, fragListBuilder, fragmentsBuilder);
            */

            // The FieldQuery is derived once from the query and reused for every hit.
            FieldQuery fieldQuery = highlighter.getFieldQuery(query);

            for (int i = 0; i < hits.scoreDocs.length; i++) {
                int docid = hits.scoreDocs[i].doc;
                Document doc = searcher.doc(docid);
                String chapNum = doc.get("chapter");
                String title = doc.get("title");
                System.out.println("Chapter " + chapNum + " : " + title);

                // Ask the highlighter for the text surrounding the matched keywords
                // (fragments): fragment size 100, at most 3 fragments per document.
                String[] frags =
                        highlighter.getBestFragments(fieldQuery, reader, docid, "text", 100, 3);
                for (String frag : frags) {
                    System.out.println(" " + frag);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
package higlighter;
import java.io.File;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Sample that searches the "text" field of an existing Lucene index and prints
 * highlighted fragments produced by the classic {@link Highlighter}.
 */
public class HighlighterSample {

    /** Directory (relative to the working directory) that holds the index to search. */
    private static final String INDEX_DIR = "index";

    /**
     * Runs a term query against the index and prints, for each hit, the chapter
     * number, the title and up to five highlighted fragments.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // try-with-resources closes the reader and the directory (in that order)
        // even when searching or highlighting throws.
        try (Directory dir = FSDirectory.open(new File(INDEX_DIR));
             IndexReader reader = DirectoryReader.open(dir)) {

            // Build the IndexSearcher.
            IndexSearcher searcher = new IndexSearcher(reader);

            // Search. Alternative query to try:
            //   parser.parse("\"Peter Pan\"");
            Analyzer analyzer = new WhitespaceAnalyzer();
            QueryParser parser = new QueryParser("text", analyzer);
            Query query = parser.parse("Neverland");
            TopDocs hits = searcher.search(query, 10);

            // Build the Highlighter from a Formatter and a Scorer.
            Formatter formatter = new SimpleHTMLFormatter();
            QueryScorer scorer = new QueryScorer(query);
            Highlighter highlighter = new Highlighter(formatter, scorer);

            // Use SimpleSpanFragmenter as the Fragmenter. It splits the field into
            // fixed-size fragments (default size 100) while making sure that a match
            // of e.g. a phrase query is not broken across several fragments.
            Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
            highlighter.setTextFragmenter(fragmenter);

            for (int i = 0; i < hits.scoreDocs.length; i++) {
                int docid = hits.scoreDocs[i].doc;
                Document doc = searcher.doc(docid);
                String chapNum = doc.get("chapter");
                String title = doc.get("title");
                System.out.println("Chapter " + chapNum + " : " + title);

                // The stored field value is the text the highlighter marks up.
                String text = doc.get("text");

                // Ask the highlighter for the text surrounding the matched keywords
                // (fragments). The default SimpleHTMLFormatter wraps each matched
                // keyword in <B> tags. The highlighter needs a TokenStream for the
                // field, so obtain one first.
                TokenStream stream =
                        TokenSources.getAnyTokenStream(reader, docid, "text", analyzer);
                String[] frags = highlighter.getBestFragments(stream, text, 5);
                for (String frag : frags) {
                    System.out.println(" " + frag);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
package indexer;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Builds the Lucene index used by the highlighter samples.
 *
 * <p>Every {@code <number>.txt} file in the data directory becomes one document with three
 * fields: "chapter" (the number taken from the file name), "title" (the first line of the
 * file) and "text" (the remaining lines, trimmed and joined with single spaces).
 */
public class IndexTexts {

    /** Directory containing the chapter text files. */
    private static final String DATA_DIR = "text/PETER_PAN";

    /** Directory the index is written to. */
    private static final String INDEX_DIR = "index";

    /** Field definition for the field that is the target of highlighting. */
    static final FieldType contentType = new FieldType();
    static {
        contentType.setIndexed(true);
        contentType.setStored(true);
        contentType.setTokenized(true);
        // for Highlighter, FastVectorHighlighter
        contentType.setStoreTermVectors(true);
        contentType.setStoreTermVectorPositions(true);
        contentType.setStoreTermVectorOffsets(true);
        // for PostingsHighlighter (enable to index offsets in the postings)
        // contentType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    }

    /**
     * Indexes all chapter files into the index directory.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        IndexTexts indexer = new IndexTexts();
        // try-with-resources closes the writer and then the directory. Unlike a
        // hand-written finally block it cannot hit a null writer when opening the
        // directory or creating the writer fails, and a failure while closing is
        // reported by the catch below instead of being silently dropped.
        try (Directory dir = FSDirectory.open(new File(INDEX_DIR));
             IndexWriter writer = indexer.getWriter(dir)) {
            indexer.doIndexTexts(writer);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Adds one document per {@code .txt} file found directly in the data directory.
     *
     * @param writer the writer the documents are added to
     * @throws IOException if the data directory cannot be listed, or a file cannot be
     *         read or indexed
     * @throws NumberFormatException if a {@code .txt} file name (without its extension)
     *         is not an integer
     */
    private void doIndexTexts(IndexWriter writer) throws IOException {
        File dataDir = new File(DATA_DIR);
        // listFiles() returns null (not an empty array) when the path is missing
        // or is not a directory.
        File[] files = dataDir.listFiles();
        if (files == null) {
            throw new IOException("Cannot list data directory: " + dataDir.getAbsolutePath());
        }

        for (File file : files) {
            String name = file.getName();
            if (!file.isFile() || !name.endsWith(".txt")) {
                continue;
            }
            // The file name minus the 4-character ".txt" suffix is the chapter number.
            int chapNum = Integer.parseInt(name.substring(0, name.length() - 4));

            // NOTE(review): FileReader decodes with the platform default charset;
            // confirm that matches the encoding of the text files.
            try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
                String title = reader.readLine();
                if (title == null) {
                    // Empty file: there is no title line, and a field value must not be null.
                    System.err.println("Skipping empty file: " + file);
                    continue;
                }
                System.out.println(chapNum + " " + title);

                StringBuilder body = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null) {
                    body.append(line.trim()).append(' ');
                }
                writer.addDocument(makeDoc(Integer.toString(chapNum), title, body.toString()));
            }
        }
    }

    /**
     * Creates a document with the three fields "chapter", "title" and "text".
     *
     * @param chapNum chapter number, stored as an untokenized string field
     * @param title chapter title, stored as an untokenized string field
     * @param text chapter body, indexed with {@link #contentType}
     * @return the new document
     */
    private Document makeDoc(String chapNum, String title, String text) {
        Document doc = new Document();
        doc.add(new StringField("chapter", chapNum, Store.YES));
        doc.add(new StringField("title", title, Store.YES));
        doc.add(new Field("text", text, contentType));
        return doc;
    }

    /**
     * Creates an index writer on the given directory that analyzes with a
     * {@link WhitespaceAnalyzer}.
     *
     * @param dir the directory to write the index to
     * @return a new writer; the caller is responsible for closing it
     * @throws IOException if the writer cannot be created
     */
    private IndexWriter getWriter(Directory dir) throws IOException {
        IndexWriterConfig config =
                new IndexWriterConfig(Version.LUCENE_4_10_0, new WhitespaceAnalyzer());
        return new IndexWriter(dir, config);
    }
}
package higlighter;
import java.io.File;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Sample that searches the "text" field of an existing Lucene index and prints the
 * highlighted passage produced by {@link PostingsHighlighter} for each hit.
 *
 * <p>PostingsHighlighter needs the highlighted field to be indexed with offsets in the
 * postings ({@code IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}).
 */
public class PostingsHighlighterSample {

    /** Directory (relative to the working directory) that holds the index to search. */
    private static final String INDEX_DIR = "index";

    /**
     * Runs a term query against the index and prints, for each hit, the chapter
     * number, the title and the highlighted passage.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // try-with-resources closes the reader and the directory (in that order)
        // even when searching or highlighting throws.
        try (Directory dir = FSDirectory.open(new File(INDEX_DIR));
             IndexReader reader = DirectoryReader.open(dir)) {

            // Build the IndexSearcher.
            IndexSearcher searcher = new IndexSearcher(reader);

            // Search. Alternative queries to try:
            //   parser.parse("\"Peter Pan\"");
            //   parser.parse("\"Peter Pan\" OR \"Tinker Bell\"");
            Analyzer analyzer = new WhitespaceAnalyzer();
            QueryParser parser = new QueryParser("text", analyzer);
            Query query = parser.parse("Neverland");
            TopDocs hits = searcher.search(query, 10);

            // Build the highlighter.
            PostingsHighlighter highlighter = new PostingsHighlighter();

            // Fetch the highlighted passages. Each array element corresponds to the
            // document at the same position in the TopDocs. When no suitable passage
            // is found for a document, its first sentence is stored instead.
            String[] highlights = highlighter.highlight("text", query, searcher, hits);

            for (int i = 0; i < hits.scoreDocs.length; i++) {
                int docid = hits.scoreDocs[i].doc;
                Document doc = searcher.doc(docid);
                String chapNum = doc.get("chapter");
                String title = doc.get("title");
                System.out.println("Chapter " + chapNum + " : " + title);

                // Pick this hit's passage from the PostingsHighlighter.highlight() result.
                String passage = highlights[i];
                System.out.println(" " + passage);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment