Skip to content

Instantly share code, notes, and snippets.

@mocobeta
Created June 23, 2012 15:48
Show Gist options
  • Save mocobeta/2978747 to your computer and use it in GitHub Desktop.
Lucene入門 3章 インデックス作成プログラム - Lucene 3.6 バージョン
package indexer;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.util.Date;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Abstract base class for Lucene 3.6 indexers.
 *
 * <p>Subclasses supply a document source via {@link #hasNext()}, {@link #next()} and
 * {@link #createDocument(Object)}; {@link #makeIndex()} drives the whole run.
 * Tuning knobs (analyzer class, merge factor, max merge docs, max buffered docs)
 * are read from system properties.
 */
public abstract class BaseIndexer {

    /** Analyzer used when the {@code analyzer} system property is not set. */
    protected static final String DEFAULT_ANALYZER =
        "org.apache.lucene.analysis.ja.JapaneseAnalyzer";
    /** System property: fully-qualified Analyzer class name. */
    protected static final String PROP_ANALYZER = "analyzer";
    /** System property: merge factor for the log merge policy. */
    protected static final String PROP_MERGE_FACTOR = "merge.factor";
    /** System property: maximum number of documents per merged segment. */
    protected static final String PROP_MAX_MERGE_DOCS = "max.merge.docs";
    /** System property: maximum number of documents buffered in RAM before flush. */
    protected static final String PROP_MAX_BUFFERED_DOCS = "max.buffered.docs";

    /** Lazily created, cached Analyzer instance. */
    protected Analyzer analyzer;
    /** Filesystem path of the index directory; set by subclasses. */
    protected String indexDir;

    /**
     * Returns the Analyzer, creating it reflectively on first use.
     * The class named by the {@code analyzer} system property (default:
     * JapaneseAnalyzer) must expose a {@code (Version)} constructor.
     *
     * @throws IndexerException if the class cannot be loaded or instantiated
     */
    protected Analyzer getAnalyzer() {
        try {
            if (analyzer == null) {
                String ana = System.getProperty(PROP_ANALYZER, DEFAULT_ANALYZER);
                // asSubclass() keeps the reflective construction type-safe
                // (the original used a raw Class and an unchecked Constructor cast).
                Class<? extends Analyzer> clazz =
                    getClass().getClassLoader().loadClass(ana).asSubclass(Analyzer.class);
                Constructor<? extends Analyzer> constructor = clazz.getConstructor(Version.class);
                analyzer = constructor.newInstance(Version.LUCENE_36);
                System.out.println("* Analyzer : " + ana);
            }
            return analyzer;
        } catch (Exception e) {
            throw new IndexerException(e);
        }
    }

    /**
     * Opens the filesystem Directory the index is stored in.
     *
     * @throws IndexerException if the directory cannot be opened
     */
    protected Directory getDirectory() {
        try {
            return FSDirectory.open(new File(indexDir));
        } catch (IOException e) {
            throw new IndexerException(e);
        }
    }

    /**
     * Builds an IndexWriter configured from the tuning system properties.
     * MergeFactor / MaxMergeDocs / MaxBufferedDocs are applied only when the
     * corresponding property is present; otherwise Lucene defaults are used.
     *
     * @throws IndexerException if the writer cannot be created
     */
    protected IndexWriter getIndexWriter() {
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, getAnalyzer());
        LogMergePolicy policy = new LogDocMergePolicy();
        String mergeFactor = System.getProperty(PROP_MERGE_FACTOR);
        if (mergeFactor != null) {
            policy.setMergeFactor(Integer.parseInt(mergeFactor));
        }
        String maxMergeDocs = System.getProperty(PROP_MAX_MERGE_DOCS);
        if (maxMergeDocs != null) {
            policy.setMaxMergeDocs(Integer.parseInt(maxMergeDocs));
        }
        String maxBufferedDocs = System.getProperty(PROP_MAX_BUFFERED_DOCS);
        if (maxBufferedDocs != null) {
            config.setMaxBufferedDocs(Integer.parseInt(maxBufferedDocs));
        }
        config.setMergePolicy(policy);
        System.out.println("* マージ係数 : " + Integer.toString(policy.getMergeFactor()));
        System.out.println("* 最大マージドキュメント数 : " + Integer.toString(policy.getMaxMergeDocs()));
        System.out.println("* 最大バッファ内ドキュメント数 : " + Integer.toString(config.getMaxBufferedDocs()));
        try {
            return new IndexWriter(getDirectory(), config);
        } catch (IOException e) {
            throw new IndexerException(e);
        }
    }

    /**
     * Template method: creates the index by registering every document produced
     * by the subclass, printing start/end timestamps and elapsed time.
     *
     * @throws IndexerException if writing to the index fails
     */
    protected void makeIndex() {
        long s = System.currentTimeMillis();
        Date start = new Date(s);
        System.out.println("* 開始時刻 : " + start);
        IndexWriter writer = getIndexWriter();
        try {
            // begin() is inside the try so that a failing begin() can no longer
            // leak an open IndexWriter (the original called it before the try).
            begin();
            // Register every source document in the index.
            while (hasNext()) {
                Object record = next();
                writer.addDocument(createDocument(record));
            }
        } catch (IOException e) {
            throw new IndexerException(e);
        } finally {
            try {
                writer.close();
            } catch (IOException ignored) {
                // best-effort close: a close failure must not mask an earlier error
            }
            end();
        }
        long e = System.currentTimeMillis();
        Date end = new Date(e);
        long elapse = e - s;
        System.out.println("* 終了時刻 : " + end + "(elapse " + elapse + " msec)");
    }

    /** Hook invoked before indexing starts; no-op by default. */
    protected void begin() {}

    /** Hook invoked after the writer is closed; no-op by default. */
    protected void end() {}

    /** Returns whether another source document exists. Implemented by subclasses. */
    protected abstract boolean hasNext();

    /** Returns the next source document. Implemented by subclasses. */
    protected abstract Object next();

    /** Converts a source record into a Lucene Document. Implemented by subclasses. */
    protected abstract Document createDocument(Object record);

    /**
     * Unchecked wrapper for any failure during indexing.
     * Declared static: a non-static inner class would capture a hidden
     * reference to the enclosing indexer for no benefit.
     */
    public static class IndexerException extends RuntimeException {
        private static final long serialVersionUID = 1L;

        public IndexerException(Exception e) {
            super(e);
        }
    }
}
package indexer;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.util.NumericUtils;
/**
 * Indexer for the sample book data.
 * The data can be downloaded from the support page of the book "Lucene入門":
 * http://gihyo.jp/book/2006/4-7741-2780-9/support
 *
 * <p>Reads one book per XML file from a data directory and registers each as a
 * Lucene document.
 */
public class BookIndexer extends BaseIndexer {

    public static final String F_PUBLISHER = "publisher";
    public static final String F_CATEGORY = "category";
    public static final String F_TITLE = "title";
    public static final String F_AUTHOR = "author";
    public static final String F_PAGES = "pages";
    public static final String F_ISBN = "isbn";
    public static final String F_DATE = "date";
    public static final String F_PRICE = "price";
    public static final String F_SUMMARY = "summary";

    /** Directory holding the per-book XML files. */
    private final String dataDir;
    /** Entries of {@link #dataDir}; empty (never null) when the directory is missing. */
    private final String[] xmlFiles;
    /** Cursor into {@link #xmlFiles}. */
    private int num = 0;
    /** Number of books actually registered. */
    private int count = 0;

    public static void main(String[] args) {
        BookIndexer indexer = new BookIndexer("resource/book-data", "book-index");
        indexer.makeIndex();
        System.out.println(Integer.toString(indexer.count) + " 件の書籍データを登録しました。");
    }

    private BookIndexer(String dataDir, String indexDir) {
        this.dataDir = dataDir;
        this.indexDir = indexDir;
        // File.list() returns null for a missing/unreadable directory; guard so
        // hasNext() cannot throw a NullPointerException (the original would).
        String[] files = new File(dataDir).list();
        this.xmlFiles = (files != null) ? files : new String[0];
    }

    @Override
    protected boolean hasNext() {
        // Skip non-XML entries. Counting is done in next() so that calling
        // hasNext() repeatedly without next() cannot inflate the counter
        // (the original incremented count on every true-returning call).
        while (num < xmlFiles.length) {
            if (xmlFiles[num].endsWith(".xml")) {
                return true;
            }
            num++;
        }
        return false;
    }

    @Override
    protected Object next() {
        count++;
        String file = dataDir + File.separator + xmlFiles[num++];
        return createBookInfo(file);
    }

    /**
     * Reads one XML file and populates a BookInfo from its element text.
     * Parsing uses StAX; every CHARACTERS event inside an element is mapped to
     * the BookInfo property named after the enclosing tag.
     *
     * @throws IndexerException on I/O or XML parse failure
     */
    private BookInfo createBookInfo(String path) {
        BookInfo bookInfo = new BookInfo();
        XMLInputFactory factory = XMLInputFactory.newFactory();
        XMLStreamReader reader = null;
        BufferedInputStream is = null;
        try {
            is = new BufferedInputStream(new FileInputStream(path));
            reader = factory.createXMLStreamReader(is);
            String tagName = null;
            while (reader.hasNext()) {
                int eventType = reader.next();
                if (eventType == XMLStreamReader.START_ELEMENT) {
                    tagName = reader.getName().getLocalPart();
                } else if (eventType == XMLStreamReader.END_ELEMENT) {
                    tagName = null;
                } else if (eventType == XMLStreamReader.CHARACTERS) {
                    if (tagName != null) {
                        bookInfo.setProperty(tagName, reader.getText());
                    }
                }
            }
        } catch (IOException e) {
            throw new IndexerException(e);
        } catch (XMLStreamException e) {
            throw new IndexerException(e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (XMLStreamException ignored) {
                    // best-effort close
                }
            }
            if (is != null) {
                try {
                    is.close();
                } catch (IOException ignored) {
                    // best-effort close
                }
            }
        }
        return bookInfo;
    }

    @Override
    protected Document createDocument(Object record) {
        BookInfo bookInfo = (BookInfo) record;
        Document doc = new Document();
        doc.add(new Field(F_PUBLISHER, bookInfo.getPublisher(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_CATEGORY, bookInfo.getCategory(), Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(F_TITLE, bookInfo.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(F_AUTHOR, bookInfo.getAuthor(), Field.Store.YES, Field.Index.ANALYZED));
        // NOTE(review): pages/price are ints but are encoded with longToPrefixCoded;
        // the search side must use the same long encoding — confirm, or switch to
        // NumericUtils.intToPrefixCoded for consistency.
        doc.add(new Field(F_PAGES, NumericUtils.longToPrefixCoded(bookInfo.getPages()), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_ISBN, bookInfo.getIsbn(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_DATE, bookInfo.getDate(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_PRICE, NumericUtils.longToPrefixCoded(bookInfo.getPrice()), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_SUMMARY, bookInfo.getSummary(), Field.Store.YES, Field.Index.ANALYZED));
        return doc;
    }

    /** Plain data holder (POJO) for one book record. */
    public static class BookInfo {
        private String publisher;
        private String category;
        private String title;
        private String author;
        private int pages;
        private String isbn;
        private String date;
        private int price;
        private String summary;

        /**
         * Dispatches an XML tag name / text pair to the matching setter.
         * Unknown tag names are silently ignored.
         */
        public void setProperty(String name, String value) {
            if (name.equals(F_PUBLISHER)) {
                this.setPublisher(value);
            } else if (name.equals(F_CATEGORY)) {
                this.setCategory(value);
            } else if (name.equals(F_TITLE)) {
                this.setTitle(value);
            } else if (name.equals(F_AUTHOR)) {
                this.setAuthor(value);
            } else if (name.equals(F_PAGES)) {
                this.setPages(Integer.parseInt(value));
            } else if (name.equals(F_ISBN)) {
                this.setIsbn(value);
            } else if (name.equals(F_DATE)) {
                this.setDate(value);
            } else if (name.equals(F_PRICE)) {
                this.setPrice(Integer.parseInt(value));
            } else if (name.equals(F_SUMMARY)) {
                this.setSummary(value);
            }
        }

        public String getPublisher() {
            return publisher;
        }

        public void setPublisher(String publisher) {
            this.publisher = publisher;
        }

        public String getCategory() {
            return category;
        }

        public void setCategory(String category) {
            this.category = category;
        }

        public String getTitle() {
            return title;
        }

        public void setTitle(String title) {
            this.title = title;
        }

        public String getAuthor() {
            return author;
        }

        public void setAuthor(String author) {
            this.author = author;
        }

        public int getPages() {
            return pages;
        }

        public void setPages(int pages) {
            this.pages = pages;
        }

        public String getIsbn() {
            return isbn;
        }

        public void setIsbn(String isbn) {
            this.isbn = isbn;
        }

        public String getDate() {
            return date;
        }

        public void setDate(String date) {
            this.date = date;
        }

        public int getPrice() {
            return price;
        }

        public void setPrice(int price) {
            this.price = price;
        }

        public String getSummary() {
            return summary;
        }

        public void setSummary(String summary) {
            this.summary = summary;
        }
    }
}
http://mocobeta-backup.tumblr.com/post/25753926431/lucene-3-6
*** インデックス作成時間の測定 ***
- Analyzerクラスを変更
- MaxBufferedDocsを変更
- MergeFactor, MaxMergeDocsはデフォルト値
(注意)VM環境です。また、複数回実行して平均を取ったりしていないです。
【計測環境】
CPU : VCPU コア2個 (ホストのCPUは Intel Core i5-2300)
RAM : 4GB
OS : (VM) Cent OS 6 (64bit)
Java : 1.6.0
==============================================================
BookIndexer
データ件数: 237件
==============================================================
+----------------------------------------------------------------------------------------+
| Analyzer class | MaxBufferedDocs | Indexing time |
+------------------------------------------------------|-----------------|---------------+
| org.apache.lucene.analysis.ja.JapaneseAnalyzer | 10 | 2587 msec |
| | 100 | 2011 msec |
+------------------------------------------------------|-----------------|---------------+
| org.apache.lucene.analysis.gosen.GosenAnalyzer | 10 | 1816 msec |
| | 100 | 1381 msec |
+------------------------------------------------------|-----------------|---------------+
| org.apache.lucene.analysis.cjk.CJKAnalyzer | 10 | 1870 msec |
| | 100 | 1128 msec |
+------------------------------------------------------|-----------------|---------------+
| org.apache.lucene.analysis.standard.StandardAnalyzer | 10 | 1679 msec |
| | 100 | 934 msec |
+----------------------------------------------------------------------------------------+
==============================================================
PostIndexer
データ件数: 21733件
==============================================================
+----------------------------------------------------------------------------------------+
| Analyzer class | MaxBufferedDocs | Indexing time |
+------------------------------------------------------|-----------------|---------------+
| org.apache.lucene.analysis.ja.JapaneseAnalyzer | 10 | 12608 msec |
| | 100 | 8006 msec |
| | 1000 | 7182 msec |
+------------------------------------------------------|-----------------|---------------+
| org.apache.lucene.analysis.gosen.GosenAnalyzer | 10 | 10091 msec |
| | 100 | 5720 msec |
| | 1000 | 4983 msec |
+------------------------------------------------------|-----------------|---------------+
| org.apache.lucene.analysis.cjk.CJKAnalyzer | 10 | 10411 msec |
| | 100 | 5149 msec |
| | 1000 | 3955 msec |
+------------------------------------------------------|-----------------|---------------+
| org.apache.lucene.analysis.standard.StandardAnalyzer | 10 | 9255 msec |
| | 100 | 4186 msec |
| | 1000 | 3362 msec |
+----------------------------------------------------------------------------------------+
package indexer;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.StringTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
/**
 * Indexer for Japanese postal-code data (large-volume business-office codes).
 * http://www.post.japanpost.jp/zipcode/dl/jigyosyo/index-zip.html
 *
 * <p>Reads one CSV line per document and registers each as a Lucene document.
 */
public class PostIndexer extends BaseIndexer {

    private static final String CHARSET = "UTF-8";
    private static final String SEPARATOR = ",";
    private static final String QUOTE = "\"";

    public static final String F_CODE = "番号";
    public static final String F_KANA = "カナ";
    public static final String F_NAME = "名称";
    public static final String F_ADDR = "住所";
    public static final String F_CONTENT = "コンテンツ";
    public static final String F_INDZIP = "個別郵便番号";
    public static final String F_ZIP = "郵便番号";
    public static final String F_POST = "郵便局名";

    /** Path of the CSV source file. */
    private final String dataFile;
    /** Number of CSV lines registered. */
    private int count = 0;
    /** Reader over the CSV file; opened in begin(), closed in end(). */
    private BufferedReader reader;
    /** Most recently read CSV line; handed out by next(). */
    private String line;

    public static void main(String[] args) {
        PostIndexer indexer = new PostIndexer("resource/jigyosyo.csv", "post-index");
        indexer.makeIndex();
        System.out.println(Integer.toString(indexer.count) + " 件の郵便番号データを登録しました。");
    }

    private PostIndexer(String dataFile, String indexDir) {
        this.dataFile = dataFile;
        this.indexDir = indexDir;
    }

    /** Opens the CSV file before indexing starts. */
    @Override
    protected void begin() {
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(dataFile), CHARSET));
        } catch (IOException e) {
            throw new IndexerException(e);
        }
    }

    /** Closes the CSV file after the index writer has been closed. */
    @Override
    protected void end() {
        try {
            reader.close();
        } catch (IOException e) {
            throw new IndexerException(e);
        }
    }

    @Override
    protected boolean hasNext() {
        // NOTE(review): each call consumes a line, so hasNext() must be called
        // exactly once per next() (as makeIndex() does) — calling it twice
        // would silently skip a record.
        try {
            line = reader.readLine();
            if (line != null) {
                count++;
                return true;
            }
            return false;
        } catch (IOException e) {
            throw new IndexerException(e);
        }
    }

    @Override
    protected Object next() {
        return line;
    }

    @Override
    protected Document createDocument(Object record) {
        // split with limit -1 preserves empty fields; the original StringTokenizer
        // silently dropped them, shifting every following column out of place.
        // Assumes no embedded separators inside quoted fields (same limitation
        // as the original tokenizer-based parsing).
        String[] fields = ((String) record).split(SEPARATOR, -1);
        String code = unquote(fields[0]);
        String kana = unquote(fields[1]);
        String name = unquote(fields[2]);
        String addr1 = unquote(fields[3]);
        String addr2 = unquote(fields[4]);
        String addr3 = unquote(fields[5]);
        String addr4 = unquote(fields[6]);
        String indZip = unquote(fields[7]);
        String zip = unquote(fields[8]);
        String post = unquote(fields[9]);
        String addr = addr1 + addr2 + addr3 + addr4;
        String content = name + " " + addr;
        Document doc = new Document();
        doc.add(new Field(F_CODE, code, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_KANA, kana, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(F_NAME, name, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(F_ADDR, addr, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(F_CONTENT, content, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(F_INDZIP, indZip, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_ZIP, zip, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_POST, post, Field.Store.YES, Field.Index.NOT_ANALYZED));
        return doc;
    }

    /** Strips one pair of surrounding double quotes from a CSV field, if present. */
    private String unquote(String quoted) {
        if (quoted.startsWith(QUOTE) && quoted.endsWith(QUOTE) && quoted.length() >= 2) {
            return quoted.substring(1, quoted.length() - 1);
        }
        return quoted;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment