Last active
December 21, 2020 05:33
-
-
Save mocobeta/e627c7ab90bc2917b743 to your computer and use it in GitHub Desktop.
Hello Lucene! (5.0)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.lucene.analysis.Analyzer; | |
import org.apache.lucene.analysis.TokenStream; | |
import org.apache.lucene.analysis.ja.JapaneseAnalyzer; | |
import org.apache.lucene.analysis.ja.JapaneseTokenizer; | |
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute; | |
import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute; | |
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | |
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; | |
import org.apache.lucene.analysis.util.CharArraySet; | |
import java.io.IOException; | |
import java.io.StringReader; | |
import java.util.Arrays; | |
import java.util.HashSet; | |
public class HelloKuromoji { | |
public static final String[] contents = { | |
"太陽は群馬県になりました。つまり太陽系は群馬県系です。" | |
}; | |
private static final String[] stoptags = { | |
"記号-一般", "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉", "記号-アルファベット" | |
}; | |
private Analyzer analyzer = new JapaneseAnalyzer(null, | |
JapaneseTokenizer.DEFAULT_MODE, | |
CharArraySet.EMPTY_SET, new HashSet(Arrays.asList(stoptags))); | |
public void displayTokenStream() throws IOException { | |
for (String content : contents) { | |
System.out.println("\n" + content); | |
System.out.println("===================================================================="); | |
StringReader reader = new StringReader(content); | |
TokenStream stream = analyzer.tokenStream("", reader); | |
stream.reset(); // must call TokenStream#reset() | |
displayTokens(stream); | |
stream.close(); | |
} | |
} | |
private void displayTokens(TokenStream stream) throws IOException { | |
System.out.println("|テキスト\t|開始\t|終了\t|読み\t\t|品詞"); | |
System.out.println("--------------------------------------------------------------------"); | |
while(stream.incrementToken()) { | |
CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class); | |
ReadingAttribute rAtt = stream.getAttribute(ReadingAttribute.class); | |
OffsetAttribute oAtt = stream.getAttribute(OffsetAttribute.class); | |
PartOfSpeechAttribute psAtt = stream.getAttribute(PartOfSpeechAttribute.class); | |
String text = termAtt.toString(); | |
String yomi = rAtt.getReading(); | |
int sOffset = oAtt.startOffset(); | |
int eOffset = oAtt.endOffset(); | |
String pos = psAtt.getPartOfSpeech(); | |
System.out.println( | |
"|" + text + "\t\t" + | |
"|" + Integer.toString(sOffset) + "\t" + | |
"|" + Integer.toString(eOffset) + "\t" + | |
"|" + yomi + "\t\t" + | |
"|" + pos + "\t" | |
); | |
} | |
} | |
public static void main(String[] args) throws IOException { | |
HelloKuromoji test = new HelloKuromoji(); | |
test.displayTokenStream(); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.lucene.analysis.Analyzer; | |
import org.apache.lucene.analysis.ja.JapaneseAnalyzer; | |
import org.apache.lucene.analysis.ja.JapaneseTokenizer; | |
import org.apache.lucene.document.Document; | |
import org.apache.lucene.document.Field; | |
import org.apache.lucene.index.DirectoryReader; | |
import org.apache.lucene.index.IndexReader; | |
import org.apache.lucene.index.IndexWriter; | |
import org.apache.lucene.index.IndexWriterConfig; | |
import org.apache.lucene.queryparser.classic.ParseException; | |
import org.apache.lucene.queryparser.classic.QueryParser; | |
import org.apache.lucene.search.*; | |
import org.apache.lucene.store.Directory; | |
import org.apache.lucene.store.RAMDirectory; | |
import java.io.BufferedReader; | |
import java.io.IOException; | |
import java.io.InputStreamReader; | |
import java.util.HashSet; | |
public class HelloLucene { | |
private static final String FIELD_CONTENT = "content"; | |
private static final Directory directory = new RAMDirectory(); | |
private static final Analyzer analyzer = new JapaneseAnalyzer(null, JapaneseTokenizer.Mode.NORMAL, null, new HashSet<String>()); | |
private static final QueryParser qp = new QueryParser(FIELD_CONTENT, analyzer); | |
private static final String[] contents = { | |
"カツオはサザエの弟", "サザエはワカメの姉", "ワカメはカツオの妹", | |
"カツオは長男", "サザエは長女", "ワカメは次女", | |
"マスオはサザエの夫", "波平は舟の夫", "タラちゃんのパパはマスオ", | |
"サザエとマスオは夫婦", "波平はタラちゃんの祖父", "舟はカツオの母", | |
"マスオはカツオの義兄", "カツオはタラちゃんの叔父", "舟はワカメの母" | |
}; | |
public static void main(String[] args) throws Exception { | |
makeIndex(); | |
BufferedReader reader = new BufferedReader(new InputStreamReader(System.in)); | |
String q = null; | |
while (q == null || !q.equals("q")) { | |
System.out.println("\n検索質問 (qで終了) > "); | |
System.out.flush(); | |
q = reader.readLine(); | |
if (!q.equals("q")) { | |
searchIndex(q); | |
} | |
} | |
reader.close(); | |
if (directory != null) { | |
directory.close(); | |
} | |
} | |
// インデックス作成メソッド | |
private static void makeIndex() throws IOException { | |
IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(analyzer)); | |
for (String s : contents) { | |
Document doc = new Document(); | |
doc.add(new Field(FIELD_CONTENT, s, Field.Store.YES, Field.Index.ANALYZED)); | |
writer.addDocument(doc); | |
} | |
writer.close(); | |
} | |
// 検索メソッド | |
private static void searchIndex(final String q) throws IOException, ParseException { | |
IndexReader r = DirectoryReader.open(directory); | |
IndexSearcher searcher = new IndexSearcher(r); | |
Query query = qp.parse(q); | |
TopScoreDocCollector results = TopScoreDocCollector.create(10); | |
searcher.search(query, results); | |
TopDocs docs = results.topDocs(); | |
System.out.println(Integer.toString(docs.totalHits) + "件ヒットしました。"); | |
ScoreDoc[] hits = docs.scoreDocs; | |
for (ScoreDoc hit : hits) { | |
Document doc = searcher.doc(hit.doc); | |
System.out.println(doc.get(FIELD_CONTENT)); | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.lucene.analysis.Analyzer; | |
import org.apache.lucene.analysis.ja.JapaneseAnalyzer; | |
import org.apache.lucene.analysis.ja.JapaneseTokenizer; | |
import org.apache.lucene.analysis.util.CharArraySet; | |
import org.apache.lucene.document.Document; | |
import org.apache.lucene.document.Field; | |
import org.apache.lucene.index.*; | |
import org.apache.lucene.search.DocIdSetIterator; | |
import org.apache.lucene.store.Directory; | |
import org.apache.lucene.store.RAMDirectory; | |
import org.apache.lucene.util.BytesRef; | |
import java.io.IOException; | |
import java.util.Arrays; | |
import java.util.HashSet; | |
/**
 * Demonstrates low-level Lucene index access: builds an in-memory index,
 * deletes documents by term, then walks stored documents and the postings
 * (DocsEnum) for a single term, honoring live-docs so deleted documents
 * are skipped.
 *
 * NOTE(review): uses pre-5.0 enumeration APIs (terms.iterator(null),
 * TermsEnum#docs, DocsEnum) — version-specific, left unchanged.
 */
public class IndexingTest {
    // Documents to index: family relations among the Sazae-san characters.
    private static final String[] contents = {
        "カツオはサザエの弟", "サザエはワカメの姉", "ワカメはカツオの妹",
        "カツオは長男", "サザエは長女", "ワカメは次女",
        "マスオはサザエの夫", "波平は舟の夫", "タラちゃんのパパはマスオ",
        "サザエとマスオは夫婦", "波平はタラちゃんの祖父", "舟はカツオの母",
        "マスオはカツオの義兄", "カツオはタラちゃんの叔父", "舟はワカメの母"
    };
    // Part-of-speech stop tags: tokens with these POS (symbols/punctuation) are dropped.
    private static final String[] stoptags = {
        "記号-一般", "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉", "記号-アルファベット"
    };
    private static final String F_ID = "id";          // stored, not analyzed
    private static final String F_CONTENT = "contents"; // stored and analyzed
    private Analyzer analyzer;
    private Directory directory;

    // Builds the analyzer (no user dictionary, empty stopword set, custom POS stop
    // tags), creates an in-memory directory, and populates the index.
    // NOTE(review): raw HashSet here — new HashSet<String>(...) would avoid the
    // unchecked-conversion warning.
    IndexingTest() throws IOException {
        analyzer = new JapaneseAnalyzer(null,
            JapaneseTokenizer.DEFAULT_MODE,
            CharArraySet.EMPTY_SET, new HashSet(Arrays.asList(stoptags)));
        directory = new RAMDirectory();
        makeIndex();
    }

    // XXX For sample simplicity, this returns only the first LeafReader found.
    // XXX Properly one should walk all leaves, or wrap the DirectoryReader in
    //     SlowCompositeReaderWrapper.
    // NOTE(review): the DirectoryReader opened here is never closed directly;
    // only the leaf reader is closed in main() — verify this releases the index.
    private LeafReader getFirstLeafReader() throws IOException {
        return DirectoryReader.open(directory).leaves().get(0).reader();
    }

    // Opens a new IndexWriter on the shared directory; caller must close it.
    private IndexWriter getIndexWriter() throws IOException {
        return new IndexWriter(directory, new IndexWriterConfig(analyzer));
    }

    // Adds one document per entry in contents, with a sequential 1-based id.
    private void makeIndex() throws IOException {
        IndexWriter writer = null;
        try {
            writer = getIndexWriter();
            int i = 1;
            for (String content : contents) {
                Document doc = new Document();
                doc.add(new Field(F_ID, Integer.toString(i++), Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new Field(F_CONTENT, content, Field.Store.YES, Field.Index.ANALYZED));
                writer.addDocument(doc);
            }
        } finally {
            if (writer != null)
                writer.close();
        }
    }

    /** Deletes every document whose contents field contains the given term. */
    private void deleteDocuments(String term) throws IOException {
        IndexWriter writer = null;
        try {
            writer = getIndexWriter();
            writer.deleteDocuments(new Term(F_CONTENT, term));
            writer.commit();
        } finally {
            if (writer != null)
                writer.close();
        }
    }

    public static void main(String[] args) throws Exception {
        IndexingTest irt = new IndexingTest();
        // Delete all documents containing "ワカメ" up front.
        irt.deleteDocuments("ワカメ");
        // Obtain an IndexReader (LeafReader).
        LeafReader reader = irt.getFirstLeafReader();
        // Number of searchable (live) documents.
        System.out.println("Docs : " + Integer.toString(reader.numDocs()));
        // Number of documents deleted from the index.
        System.out.println("Deleted Docs: " + Integer.toString(reader.numDeletedDocs()));
        // Total document count, including deleted ones.
        System.out.println("Max Docs : " + Integer.toString(reader.maxDoc()));
        System.out.println("");
        System.out.println("---- All Documents ----");
        // Iterates all doc ids up to maxDoc(), so deleted slots are visited too.
        for (int i = 0; i < reader.maxDoc(); i++) {
            Document doc = reader.document(i);
            System.out.println(doc.get(F_ID) + " : " + doc.get(F_CONTENT));
        }
        System.out.println("");
        System.out.println("---- Documents contain 'カツオ' ----");
        // Fetch the Terms for the "contents" field.
        Terms terms = reader.terms(F_CONTENT);
        // Get a TermsEnum and seek to the term "カツオ".
        TermsEnum te = terms.iterator(null);
        boolean found = te.seekExact(new BytesRef("カツオ"));
        if (!found) {
            // TermsEnum#seekExact() returns false when the term is absent.
            System.out.println("Not Found.");
        } else {
            // Get the DocsEnum (postings) for the term; passing getLiveDocs()
            // filters out deleted documents.
            DocsEnum de = te.docs(reader.getLiveDocs(), null);
            // Walk the postings, fetching each matching document id and body.
            while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                Document doc = reader.document(de.docID());
                System.out.println(doc.get(F_ID) + " : " + doc.get(F_CONTENT));
            }
        }
        reader.close();
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment