Last active
August 29, 2015 14:06
-
-
Save mocobeta/57a8f61250468180607d to your computer and use it in GitHub Desktop.
(Lucene) Highlighter のサンプル
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package higlighter; | |
import java.io.File; | |
import org.apache.lucene.analysis.Analyzer; | |
import org.apache.lucene.analysis.core.WhitespaceAnalyzer; | |
import org.apache.lucene.document.Document; | |
import org.apache.lucene.index.DirectoryReader; | |
import org.apache.lucene.index.IndexReader; | |
import org.apache.lucene.queryparser.classic.QueryParser; | |
import org.apache.lucene.search.IndexSearcher; | |
import org.apache.lucene.search.Query; | |
import org.apache.lucene.search.TopDocs; | |
import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter; | |
import org.apache.lucene.search.vectorhighlight.FieldQuery; | |
import org.apache.lucene.search.vectorhighlight.FragListBuilder; | |
import org.apache.lucene.search.vectorhighlight.FragmentsBuilder; | |
import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder; | |
import org.apache.lucene.search.vectorhighlight.SimpleFragListBuilder; | |
import org.apache.lucene.search.vectorhighlight.SimpleFragmentsBuilder; | |
import org.apache.lucene.store.Directory; | |
import org.apache.lucene.store.FSDirectory; | |
public class FastVectorHighlighterSample { | |
private static String idxdir = "index"; | |
public static void main(String[] args) { | |
try { | |
// IndexSearcher 作成 | |
Directory dir = FSDirectory.open(new File(idxdir)); | |
IndexReader reader = DirectoryReader.open(dir); | |
IndexSearcher searcher = new IndexSearcher(reader); | |
// 検索 | |
Analyzer analyzer = new WhitespaceAnalyzer(); | |
QueryParser parser = new QueryParser("text", analyzer); | |
// Query query = parser.parse("Neverland"); | |
Query query = parser.parse("\"Peter Pan\""); | |
// Query query = parser.parse("\"Peter Pan\" OR \"Tinker Bell\""); | |
TopDocs hits = searcher.search(query, 10); | |
// FastVectorHighlighter 作成 | |
FastVectorHighlighter highlighter = new FastVectorHighlighter(); | |
// for multicolored highlighting | |
/* | |
FragListBuilder fragListBuilder = new SimpleFragListBuilder(); | |
FragmentsBuilder fragmentsBuilder = new SimpleFragmentsBuilder( | |
BaseFragmentsBuilder.COLORED_PRE_TAGS, | |
BaseFragmentsBuilder.COLORED_POST_TAGS); | |
FastVectorHighlighter highlighter = new FastVectorHighlighter( | |
true, true, fragListBuilder, fragmentsBuilder); | |
*/ | |
FieldQuery fieldQuery = highlighter.getFieldQuery(query); | |
for (int i = 0; i < hits.scoreDocs.length; i++) { | |
int docid = hits.scoreDocs[i].doc; | |
Document doc = searcher.doc(docid); | |
String chapNum = doc.get("chapter"); | |
String title = doc.get("title"); | |
System.out.println("Chapter " + chapNum + " : " + title); | |
// Highlighter で検索キーワード周辺の文字列(フラグメント)を取得 | |
String[] frags = highlighter.getBestFragments(fieldQuery, reader, docid, "text", 100, 3); | |
for (String frag : frags) { | |
System.out.println(" " + frag); | |
} | |
} | |
dir.close(); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package higlighter; | |
import java.io.File; | |
import org.apache.lucene.analysis.Analyzer; | |
import org.apache.lucene.analysis.TokenStream; | |
import org.apache.lucene.analysis.core.WhitespaceAnalyzer; | |
import org.apache.lucene.document.Document; | |
import org.apache.lucene.index.DirectoryReader; | |
import org.apache.lucene.index.IndexReader; | |
import org.apache.lucene.queryparser.classic.QueryParser; | |
import org.apache.lucene.search.IndexSearcher; | |
import org.apache.lucene.search.Query; | |
import org.apache.lucene.search.TopDocs; | |
import org.apache.lucene.search.highlight.Formatter; | |
import org.apache.lucene.search.highlight.Fragmenter; | |
import org.apache.lucene.search.highlight.Highlighter; | |
import org.apache.lucene.search.highlight.QueryScorer; | |
import org.apache.lucene.search.highlight.SimpleHTMLFormatter; | |
import org.apache.lucene.search.highlight.SimpleSpanFragmenter; | |
import org.apache.lucene.search.highlight.TokenSources; | |
import org.apache.lucene.store.Directory; | |
import org.apache.lucene.store.FSDirectory; | |
public class HighlighterSample { | |
private static String idxdir = "index"; | |
public static void main(String[] args) { | |
try { | |
// IndexSearcher 作成 | |
Directory dir = FSDirectory.open(new File(idxdir)); | |
IndexReader reader = DirectoryReader.open(dir); | |
IndexSearcher searcher = new IndexSearcher(reader); | |
// 検索 | |
Analyzer analyzer = new WhitespaceAnalyzer(); | |
QueryParser parser = new QueryParser("text", analyzer); | |
Query query = parser.parse("Neverland"); | |
// Query query = parser.parse("\"Peter Pan\""); | |
TopDocs hits = searcher.search(query, 10); | |
// Highlighter 作成 | |
// Formatter と Scorer を与える | |
Formatter formatter = new SimpleHTMLFormatter(); | |
QueryScorer scorer = new QueryScorer(query); | |
Highlighter highlighter = new Highlighter(formatter, scorer); | |
// Fragmenter には SimpleSpanFragmenter を指定 | |
// 固定長(デフォルト100バイト)でフィールドを分割する。 | |
// ただしフレーズクエリなどの場合に、クエリが複数のフラグメントに分断されないようにする | |
Fragmenter fragmenter = new SimpleSpanFragmenter(scorer); | |
highlighter.setTextFragmenter(fragmenter); | |
for (int i = 0; i < hits.scoreDocs.length; i++) { | |
int docid = hits.scoreDocs[i].doc; | |
Document doc = searcher.doc(docid); | |
String chapNum = doc.get("chapter"); | |
String title = doc.get("title"); | |
System.out.println("Chapter " + chapNum + " : " + title); | |
String text = doc.get("text"); | |
// Highlighter で検索キーワード周辺の文字列(フラグメント)を取得 | |
// デフォルトの SimpleHTMLFormatter は <B> タグで検索キーワードを囲って返す | |
// TokenStream が必要なので取得 | |
TokenStream stream = TokenSources.getAnyTokenStream(reader, | |
docid, "text", analyzer); | |
String[] frags = highlighter.getBestFragments(stream, text, 5); | |
for (String frag : frags) { | |
System.out.println(" " + frag); | |
} | |
} | |
dir.close(); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package indexer; | |
import java.io.BufferedReader; | |
import java.io.File; | |
import java.io.FileReader; | |
import java.io.IOException; | |
import org.apache.lucene.analysis.core.WhitespaceAnalyzer; | |
import org.apache.lucene.document.Document; | |
import org.apache.lucene.document.Field; | |
import org.apache.lucene.document.Field.Store; | |
import org.apache.lucene.document.FieldType; | |
import org.apache.lucene.document.StringField; | |
import org.apache.lucene.index.IndexWriter; | |
import org.apache.lucene.index.IndexWriterConfig; | |
import org.apache.lucene.store.Directory; | |
import org.apache.lucene.store.FSDirectory; | |
import org.apache.lucene.util.Version; | |
public class IndexTexts { | |
private static String datadir = "text/PETER_PAN"; | |
private static String idxdir = "index"; | |
public static void main(String args[]) { | |
IndexTexts indexer = new IndexTexts(); | |
Directory dir = null; | |
IndexWriter writer = null; | |
try { | |
dir = FSDirectory.open(new File(idxdir)); | |
writer = indexer.getWriter(dir); | |
indexer.doIndexTexts(writer); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} finally { | |
try { | |
writer.close(); | |
} catch (IOException e) { | |
} | |
} | |
} | |
private void doIndexTexts(IndexWriter writer) throws IOException { | |
File dataDir = new File(datadir); | |
for (File file : dataDir.listFiles()) { | |
if (!file.isFile() || !file.getName().endsWith(".txt")) { | |
continue; | |
} | |
String name = file.getName(); | |
int chapNum = Integer | |
.parseInt(name.substring(0, name.length() - 4)); | |
BufferedReader reader = new BufferedReader(new FileReader(file)); | |
try { | |
String title = reader.readLine(); | |
System.out.println(Integer.toString(chapNum) + " " + title); | |
StringBuffer buf = new StringBuffer(); | |
String line = null; | |
while ((line = reader.readLine()) != null) { | |
line = line.trim(); | |
buf.append(line + " "); | |
} | |
Document doc = makeDoc(Integer.toString(chapNum), title, | |
buf.toString()); | |
writer.addDocument(doc); | |
} finally { | |
reader.close(); | |
} | |
} | |
} | |
// ハイライト対象フィールド用のフィールド定義 | |
static FieldType contentType = new FieldType(); | |
static { | |
contentType.setIndexed(true); | |
contentType.setStored(true); | |
contentType.setTokenized(true); | |
// for Highlighter, FastvectorHighlighter | |
contentType.setStoreTermVectors(true); | |
contentType.setStoreTermVectorPositions(true); | |
contentType.setStoreTermVectorOffsets(true); | |
// for PostingsHighlighter | |
// contentType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); | |
} | |
private Document makeDoc(String chapNum, String title, String text) { | |
// chapter, title, text の 3 つのフィールドをもつドキュメント | |
Document doc = new Document(); | |
doc.add(new StringField("chapter", chapNum, Store.YES)); | |
doc.add(new StringField("title", title, Store.YES)); | |
doc.add(new Field("text", text, contentType)); | |
return doc; | |
} | |
private IndexWriter getWriter(Directory dir) throws IOException { | |
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_0, | |
new WhitespaceAnalyzer()); | |
return new IndexWriter(dir, config); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package higlighter; | |
import java.io.File; | |
import org.apache.lucene.analysis.Analyzer; | |
import org.apache.lucene.analysis.core.WhitespaceAnalyzer; | |
import org.apache.lucene.document.Document; | |
import org.apache.lucene.index.DirectoryReader; | |
import org.apache.lucene.index.IndexReader; | |
import org.apache.lucene.queryparser.classic.QueryParser; | |
import org.apache.lucene.search.IndexSearcher; | |
import org.apache.lucene.search.Query; | |
import org.apache.lucene.search.TopDocs; | |
import org.apache.lucene.search.postingshighlight.PostingsHighlighter; | |
import org.apache.lucene.store.Directory; | |
import org.apache.lucene.store.FSDirectory; | |
public class PostingsHighlighterSample { | |
private static String idxdir = "index"; | |
public static void main(String[] args) { | |
try { | |
// IndexSearcher 作成 | |
Directory dir = FSDirectory.open(new File(idxdir)); | |
IndexReader reader = DirectoryReader.open(dir); | |
IndexSearcher searcher = new IndexSearcher(reader); | |
// 検索 | |
Analyzer analyzer = new WhitespaceAnalyzer(); | |
QueryParser parser = new QueryParser("text", analyzer); | |
Query query = parser.parse("Neverland"); | |
// Query query = parser.parse("\"Peter Pan\""); | |
// Query query = parser.parse("\"Peter Pan\" OR \"Tinker Bell\""); | |
TopDocs hits = searcher.search(query, 10); | |
// Highlighter 作成 | |
PostingsHighlighter highlighter = new PostingsHighlighter(); | |
// ハイライトされたパッセージを取得 | |
// 配列の各要素は TopDocs に格納されたドキュメントにそれぞれ対応する | |
// なお適切なパッセージが見つからなかった場合、ドキュメントの最初の文が格納される | |
String[] highlights = highlighter.highlight("text", query, searcher, hits); | |
for (int i = 0; i < hits.scoreDocs.length; i++) { | |
int docid = hits.scoreDocs[i].doc; | |
Document doc = searcher.doc(docid); | |
String chapNum = doc.get("chapter"); | |
String title = doc.get("title"); | |
System.out.println("Chapter " + chapNum + " : " + title); | |
// PostingsHighlighter.highlit() の結果配列から、対応するパッセージを得る | |
String passage = highlights[i]; | |
System.out.println(" " + passage); | |
} | |
dir.close(); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment