Skip to content

Instantly share code, notes, and snippets.

@mocobeta
Last active December 21, 2020 05:33
Show Gist options
  • Save mocobeta/e627c7ab90bc2917b743 to your computer and use it in GitHub Desktop.
Hello Lucene! (5.0)
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
/**
 * "Hello Kuromoji" demo: analyzes Japanese sample text with Lucene's
 * JapaneseAnalyzer and prints each token's surface form, character offsets,
 * reading (yomi) and part of speech as a tab-separated table.
 */
public class HelloKuromoji {
    /** Sample sentences to analyze. */
    public static final String[] contents = {
        "太陽は群馬県になりました。つまり太陽系は群馬県系です。"
    };
    /** Part-of-speech tags (punctuation/symbol classes) filtered out of the stream. */
    private static final String[] stoptags = {
        "記号-一般", "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉", "記号-アルファベット"
    };
    // No user dictionary, default segmentation mode, empty stopword set,
    // custom stoptag set. HashSet is parameterized (the original used a raw
    // type, which compiles with an unchecked warning).
    private Analyzer analyzer = new JapaneseAnalyzer(null,
            JapaneseTokenizer.DEFAULT_MODE,
            CharArraySet.EMPTY_SET, new HashSet<String>(Arrays.asList(stoptags)));

    /**
     * Tokenizes every sample sentence and prints a token table for each.
     *
     * @throws IOException if analysis fails while reading the input
     */
    public void displayTokenStream() throws IOException {
        for (String content : contents) {
            System.out.println("\n" + content);
            System.out.println("====================================================================");
            StringReader reader = new StringReader(content);
            // try-with-resources closes the stream even if display fails
            // (the original leaked it on exception).
            try (TokenStream stream = analyzer.tokenStream("", reader)) {
                stream.reset(); // must call TokenStream#reset() before incrementToken()
                displayTokens(stream);
                stream.end();   // TokenStream contract: end() after the last token
            }
        }
    }

    /**
     * Consumes the given stream and prints one row per token.
     *
     * @param stream an already-reset TokenStream positioned before the first token
     * @throws IOException if token iteration fails
     */
    private void displayTokens(TokenStream stream) throws IOException {
        System.out.println("|テキスト\t|開始\t|終了\t|読み\t\t|品詞");
        System.out.println("--------------------------------------------------------------------");
        // Attribute instances belong to the stream, not to individual tokens:
        // look them up once instead of once per token as the original did.
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        ReadingAttribute rAtt = stream.getAttribute(ReadingAttribute.class);
        OffsetAttribute oAtt = stream.getAttribute(OffsetAttribute.class);
        PartOfSpeechAttribute psAtt = stream.getAttribute(PartOfSpeechAttribute.class);
        while (stream.incrementToken()) {
            String text = termAtt.toString();
            String yomi = rAtt.getReading();
            int sOffset = oAtt.startOffset();
            int eOffset = oAtt.endOffset();
            String pos = psAtt.getPartOfSpeech();
            System.out.println(
                    "|" + text + "\t\t" +
                    "|" + Integer.toString(sOffset) + "\t" +
                    "|" + Integer.toString(eOffset) + "\t" +
                    "|" + yomi + "\t\t" +
                    "|" + pos + "\t"
            );
        }
    }

    public static void main(String[] args) throws IOException {
        HelloKuromoji test = new HelloKuromoji();
        test.displayTokenStream();
    }
}
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashSet;
/**
 * Interactive "Hello Lucene" demo: indexes a few Japanese sentences into an
 * in-memory index, then answers queries typed on stdin until "q" (or EOF).
 */
public class HelloLucene {
    private static final String FIELD_CONTENT = "content";
    private static final Directory directory = new RAMDirectory();
    private static final Analyzer analyzer = new JapaneseAnalyzer(null, JapaneseTokenizer.Mode.NORMAL, null, new HashSet<String>());
    private static final QueryParser qp = new QueryParser(FIELD_CONTENT, analyzer);
    /** Documents to index (Sazae-san family facts). */
    private static final String[] contents = {
        "カツオはサザエの弟", "サザエはワカメの姉", "ワカメはカツオの妹",
        "カツオは長男", "サザエは長女", "ワカメは次女",
        "マスオはサザエの夫", "波平は舟の夫", "タラちゃんのパパはマスオ",
        "サザエとマスオは夫婦", "波平はタラちゃんの祖父", "舟はカツオの母",
        "マスオはカツオの義兄", "カツオはタラちゃんの叔父", "舟はワカメの母"
    };

    public static void main(String[] args) throws Exception {
        makeIndex();
        BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
        while (true) {
            System.out.println("\n検索質問 (qで終了) > ");
            System.out.flush();
            String q = reader.readLine();
            // readLine() returns null at EOF; the original dereferenced it
            // (NPE at q.equals("q")) — treat EOF the same as typing "q".
            if (q == null || q.equals("q")) {
                break;
            }
            searchIndex(q);
        }
        reader.close();
        directory.close(); // static final and always assigned, so no null check needed
    }

    /** Builds the in-memory index from {@link #contents}. */
    private static void makeIndex() throws IOException {
        // try-with-resources closes the writer even if addDocument throws
        // (the original leaked it on exception).
        try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(analyzer))) {
            for (String s : contents) {
                Document doc = new Document();
                // NOTE(review): Field.Index was removed in Lucene 5 — the modern
                // equivalent is new TextField(FIELD_CONTENT, s, Field.Store.YES);
                // confirm which Lucene version this actually builds against.
                doc.add(new Field(FIELD_CONTENT, s, Field.Store.YES, Field.Index.ANALYZED));
                writer.addDocument(doc);
            }
        }
    }

    /**
     * Parses the query string, searches the index, and prints the hit count
     * followed by each matching document's content field.
     *
     * @param q raw query text entered by the user
     * @throws IOException    if the index cannot be read
     * @throws ParseException if {@code q} is not valid query syntax
     */
    private static void searchIndex(final String q) throws IOException, ParseException {
        IndexReader r = DirectoryReader.open(directory);
        try {
            IndexSearcher searcher = new IndexSearcher(r);
            Query query = qp.parse(q);
            TopScoreDocCollector results = TopScoreDocCollector.create(10);
            searcher.search(query, results);
            TopDocs docs = results.topDocs();
            System.out.println(Integer.toString(docs.totalHits) + "件ヒットしました。");
            ScoreDoc[] hits = docs.scoreDocs;
            for (ScoreDoc hit : hits) {
                Document doc = searcher.doc(hit.doc);
                System.out.println(doc.get(FIELD_CONTENT));
            }
        } finally {
            r.close(); // the original leaked one IndexReader per query
        }
    }
}
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
/**
 * Low-level index-access demo: builds an in-memory index, deletes documents
 * matching a term, then inspects live/deleted document counts, stored
 * documents, and a postings list directly through the LeafReader API.
 */
public class IndexingTest {
    /** Documents to index (Sazae-san family facts). */
    private static final String[] contents = {
        "カツオはサザエの弟", "サザエはワカメの姉", "ワカメはカツオの妹",
        "カツオは長男", "サザエは長女", "ワカメは次女",
        "マスオはサザエの夫", "波平は舟の夫", "タラちゃんのパパはマスオ",
        "サザエとマスオは夫婦", "波平はタラちゃんの祖父", "舟はカツオの母",
        "マスオはカツオの義兄", "カツオはタラちゃんの叔父", "舟はワカメの母"
    };
    /** Part-of-speech tags (punctuation/symbol classes) excluded from analysis. */
    private static final String[] stoptags = {
        "記号-一般", "記号-読点", "記号-句点", "記号-空白", "記号-括弧開", "記号-括弧閉", "記号-アルファベット"
    };
    private static final String F_ID = "id";
    private static final String F_CONTENT = "contents";
    private Analyzer analyzer;
    private Directory directory;

    IndexingTest() throws IOException {
        // HashSet is parameterized (the original used a raw type, which
        // compiles with an unchecked warning).
        analyzer = new JapaneseAnalyzer(null,
                JapaneseTokenizer.DEFAULT_MODE,
                CharArraySet.EMPTY_SET, new HashSet<String>(Arrays.asList(stoptags)));
        directory = new RAMDirectory();
        makeIndex();
    }

    private LeafReader getFirstLeafReader() throws IOException {
        // XXX For sample brevity this returns only the first LeafReader found.
        // XXX Properly, walk every leaf, or wrap the DirectoryReader in
        // XXX SlowCompositeReaderWrapper.
        // NOTE(review): the DirectoryReader opened here is never closed by the
        // caller (only the leaf is) — acceptable for a throwaway demo.
        return DirectoryReader.open(directory).leaves().get(0).reader();
    }

    private IndexWriter getIndexWriter() throws IOException {
        return new IndexWriter(directory, new IndexWriterConfig(analyzer));
    }

    /** Indexes every sentence in {@link #contents} with a sequential string id. */
    private void makeIndex() throws IOException {
        // try-with-resources replaces the original's manual try/finally close.
        try (IndexWriter writer = getIndexWriter()) {
            int i = 1;
            for (String content : contents) {
                Document doc = new Document();
                // NOTE(review): Field.Index was removed in Lucene 5 — the modern
                // equivalents are StringField (NOT_ANALYZED) and TextField
                // (ANALYZED); confirm the Lucene version this builds against.
                doc.add(new Field(F_ID, Integer.toString(i++), Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new Field(F_CONTENT, content, Field.Store.YES, Field.Index.ANALYZED));
                writer.addDocument(doc);
            }
        }
    }

    /** Deletes from the index every document whose contents field contains the given term. */
    private void deleteDocuments(String term) throws IOException {
        try (IndexWriter writer = getIndexWriter()) {
            writer.deleteDocuments(new Term(F_CONTENT, term));
            writer.commit();
        }
    }

    public static void main(String[] args) throws Exception {
        IndexingTest irt = new IndexingTest();
        // Remove the documents containing "ワカメ" up front.
        irt.deleteDocuments("ワカメ");
        // Grab a LeafReader over the (first segment of the) index.
        LeafReader reader = irt.getFirstLeafReader();
        // Number of live (searchable) documents.
        System.out.println("Docs : " + Integer.toString(reader.numDocs()));
        // Number of documents flagged as deleted but still in the segment.
        System.out.println("Deleted Docs: " + Integer.toString(reader.numDeletedDocs()));
        // Total document slots, including deletions.
        System.out.println("Max Docs : " + Integer.toString(reader.maxDoc()));
        System.out.println("");
        System.out.println("---- All Documents ----");
        for (int i = 0; i < reader.maxDoc(); i++) {
            Document doc = reader.document(i);
            System.out.println(doc.get(F_ID) + " : " + doc.get(F_CONTENT));
        }
        System.out.println("");
        System.out.println("---- Documents contain 'カツオ' ----");
        // Terms for the "contents" field.
        Terms terms = reader.terms(F_CONTENT);
        // Get a TermsEnum and seek to the term "カツオ".
        TermsEnum te = terms.iterator(null);
        boolean found = te.seekExact(new BytesRef("カツオ"));
        if (!found) {
            // TermsEnum#seekExact() returns false when the term is absent.
            System.out.println("Not Found.");
        } else {
            // Postings for the term; getLiveDocs() skips deleted documents.
            DocsEnum de = te.docs(reader.getLiveDocs(), null);
            // Walk the postings: doc id -> stored document.
            while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                Document doc = reader.document(de.docID());
                System.out.println(doc.get(F_ID) + " : " + doc.get(F_CONTENT));
            }
        }
        reader.close();
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment