Created: June 23, 2012 15:48
Lucene入門 (Lucene Introduction), Chapter 3 index creation program - Lucene 3.6 version
package indexer;

import java.io.File;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Base Indexer class.
 */
public abstract class BaseIndexer {

    protected static final String DEFAULT_ANALYZER =
            "org.apache.lucene.analysis.ja.JapaneseAnalyzer";

    protected static final String PROP_ANALYZER = "analyzer";
    protected static final String PROP_MERGE_FACTOR = "merge.factor";
    protected static final String PROP_MAX_MERGE_DOCS = "max.merge.docs";
    protected static final String PROP_MAX_BUFFERED_DOCS = "max.buffered.docs";

    protected Analyzer analyzer;
    protected String indexDir;

    /** Returns the Analyzer, instantiating it lazily. */
    protected Analyzer getAnalyzer() {
        try {
            if (analyzer == null) {
                // Load the Analyzer class named by the "analyzer" system property
                // (JapaneseAnalyzer by default) and instantiate it reflectively
                // through its (Version) constructor.
                String ana = System.getProperty(PROP_ANALYZER, DEFAULT_ANALYZER);
                Class<? extends Analyzer> clazz =
                        getClass().getClassLoader().loadClass(ana).asSubclass(Analyzer.class);
                Constructor<? extends Analyzer> constructor = clazz.getConstructor(Version.class);
                analyzer = constructor.newInstance(Version.LUCENE_36);
                System.out.println("* Analyzer : " + ana);
            }
            return analyzer;
        } catch (Exception e) {
            throw new IndexerException(e);
        }
    }

    /** Returns the Directory in which the index is stored. */
    protected Directory getDirectory() {
        try {
            return FSDirectory.open(new File(indexDir));
        } catch (IOException e) {
            throw new IndexerException(e);
        }
    }

    /** Returns an IndexWriter. */
    protected IndexWriter getIndexWriter() {
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, getAnalyzer());
        // Policy settings: apply MergeFactor, MaxMergeDocs and MaxBufferedDocs
        // when they are given as system properties.
        LogMergePolicy policy = new LogDocMergePolicy();
        String mergeFactor = System.getProperty(PROP_MERGE_FACTOR);
        if (mergeFactor != null) {
            policy.setMergeFactor(Integer.parseInt(mergeFactor));
        }
        String maxMergeDocs = System.getProperty(PROP_MAX_MERGE_DOCS);
        if (maxMergeDocs != null) {
            policy.setMaxMergeDocs(Integer.parseInt(maxMergeDocs));
        }
        String maxBufferedDocs = System.getProperty(PROP_MAX_BUFFERED_DOCS);
        if (maxBufferedDocs != null) {
            config.setMaxBufferedDocs(Integer.parseInt(maxBufferedDocs));
        }
        config.setMergePolicy(policy);
        System.out.println("* Merge factor : " + policy.getMergeFactor());
        System.out.println("* Max merge docs : " + policy.getMaxMergeDocs());
        System.out.println("* Max buffered docs : " + config.getMaxBufferedDocs());
        try {
            return new IndexWriter(getDirectory(), config);
        } catch (IOException e) {
            throw new IndexerException(e);
        }
    }

    /** Builds the index. */
    protected void makeIndex() {
        long s = System.currentTimeMillis();
        System.out.println("* Start time : " + new Date(s));
        IndexWriter writer = getIndexWriter();
        begin();
        try {
            // Register every target document with the index.
            while (hasNext()) {
                Object record = next();
                writer.addDocument(createDocument(record));
            }
        } catch (IOException e) {
            throw new IndexerException(e);
        } finally {
            try {
                writer.close();
            } catch (IOException e) {
                // ignore failures while closing
            }
            end();
        }
        long e = System.currentTimeMillis();
        System.out.println("* End time : " + new Date(e) + " (elapsed " + (e - s) + " msec)");
    }

    /** Hook invoked before indexing starts; subclasses may override. */
    protected void begin() {}

    /** Hook invoked after indexing finishes; subclasses may override. */
    protected void end() {}

    /** Whether a next document exists. Implemented by subclasses. */
    protected abstract boolean hasNext();

    /** Returns the next document. Implemented by subclasses. */
    protected abstract Object next();

    /** Converts a record into a Lucene Document. Implemented by subclasses. */
    protected abstract Document createDocument(Object record);

    public static class IndexerException extends RuntimeException {
        public IndexerException(Exception e) {
            super(e);
        }
    }
}
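BaseIndexer follows the template-method pattern: makeIndex() drives the indexing loop, while a concrete subclass supplies only the record iteration (hasNext()/next()), the conversion to a Lucene Document (createDocument()), and the optional begin()/end() hooks. Below is a minimal sketch of a subclass, not part of the original gist; the two-line in-memory data set, the "text" field name, and the "lines-index" directory are made up purely to show the contract, and it assumes the default analyzer's jar is on the classpath.

package indexer;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

/**
 * A minimal sketch (not part of the original gist) of the BaseIndexer
 * subclass contract.
 */
public class LinesIndexer extends BaseIndexer {

    private final String[] lines = { "hello", "world" };
    private int pos = 0;

    public LinesIndexer(String indexDir) {
        this.indexDir = indexDir;
    }

    @Override
    protected boolean hasNext() {
        return pos < lines.length;
    }

    @Override
    protected Object next() {
        // Hand the raw record to makeIndex(); createDocument() converts it.
        return lines[pos++];
    }

    @Override
    protected Document createDocument(Object record) {
        Document doc = new Document();
        doc.add(new Field("text", (String) record, Field.Store.YES, Field.Index.ANALYZED));
        return doc;
    }

    public static void main(String[] args) {
        new LinesIndexer("lines-index").makeIndex();
    }
}

Because getAnalyzer() and getIndexWriter() read their settings from system properties, the same subclass can be re-run with a different Analyzer or buffer size without any code change.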
package indexer;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.util.NumericUtils;

/**
 * Indexer for the sample book data.
 * The data can be downloaded from the support page of the book "Lucene入門":
 * http://gihyo.jp/book/2006/4-7741-2780-9/support
 */
public class BookIndexer extends BaseIndexer {

    public static final String F_PUBLISHER = "publisher";
    public static final String F_CATEGORY = "category";
    public static final String F_TITLE = "title";
    public static final String F_AUTHOR = "author";
    public static final String F_PAGES = "pages";
    public static final String F_ISBN = "isbn";
    public static final String F_DATE = "date";
    public static final String F_PRICE = "price";
    public static final String F_SUMMARY = "summary";

    private String dataDir;
    private String[] xmlFiles;
    private int num = 0;
    private int count = 0;

    public static void main(String[] args) {
        BookIndexer indexer = new BookIndexer("resource/book-data", "book-index");
        indexer.makeIndex();
        System.out.println(indexer.count + " book records indexed.");
    }

    private BookIndexer(String dataDir, String indexDir) {
        this.dataDir = dataDir;
        this.indexDir = indexDir;
        this.xmlFiles = new File(dataDir).list();
    }

    @Override
    protected boolean hasNext() {
        // Skip over any files that are not XML.
        while (num < xmlFiles.length) {
            if (!xmlFiles[num].endsWith(".xml")) {
                num++;
            } else {
                count++;
                return true;
            }
        }
        return false;
    }

    @Override
    protected Object next() {
        String file = dataDir + File.separator + xmlFiles[num++];
        return createBookInfo(file);
    }

    /** Reads an XML file and builds a BookInfo object from it. */
    private BookInfo createBookInfo(String path) {
        BookInfo bookInfo = new BookInfo();
        // Use StAX for XML parsing.
        XMLInputFactory factory = XMLInputFactory.newFactory();
        XMLStreamReader reader = null;
        BufferedInputStream is = null;
        try {
            is = new BufferedInputStream(new FileInputStream(path));
            reader = factory.createXMLStreamReader(is);
            String tagName = null;
            while (reader.hasNext()) {
                int eventType = reader.next();
                if (eventType == XMLStreamReader.START_ELEMENT) {
                    tagName = reader.getName().getLocalPart();
                } else if (eventType == XMLStreamReader.END_ELEMENT) {
                    tagName = null;
                } else if (eventType == XMLStreamReader.CHARACTERS) {
                    if (tagName != null) {
                        // Map the element name to the matching BookInfo property.
                        bookInfo.setProperty(tagName, reader.getText());
                    }
                }
            }
        } catch (IOException e) {
            throw new IndexerException(e);
        } catch (XMLStreamException e) {
            throw new IndexerException(e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (XMLStreamException e) {
                    // ignore failures while closing
                }
            }
            if (is != null) {
                try {
                    is.close();
                } catch (IOException e) {
                    // ignore failures while closing
                }
            }
        }
        return bookInfo;
    }

    @Override
    protected Document createDocument(Object record) {
        BookInfo bookInfo = (BookInfo) record;
        Document doc = new Document();
        doc.add(new Field(F_PUBLISHER, bookInfo.getPublisher(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_CATEGORY, bookInfo.getCategory(), Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(F_TITLE, bookInfo.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(F_AUTHOR, bookInfo.getAuthor(), Field.Store.YES, Field.Index.ANALYZED));
        // Numeric values are indexed in prefix-coded form so they order correctly.
        doc.add(new Field(F_PAGES, NumericUtils.longToPrefixCoded(bookInfo.getPages()), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_ISBN, bookInfo.getIsbn(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_DATE, bookInfo.getDate(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_PRICE, NumericUtils.longToPrefixCoded(bookInfo.getPrice()), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_SUMMARY, bookInfo.getSummary(), Field.Store.YES, Field.Index.ANALYZED));
        return doc;
    }

    /** POJO holding one book record. */
    public static class BookInfo {
        private String publisher;
        private String category;
        private String title;
        private String author;
        private int pages;
        private String isbn;
        private String date;
        private int price;
        private String summary;

        public void setProperty(String name, String value) {
            if (name.equals(F_PUBLISHER)) {
                this.setPublisher(value);
            } else if (name.equals(F_CATEGORY)) {
                this.setCategory(value);
            } else if (name.equals(F_TITLE)) {
                this.setTitle(value);
            } else if (name.equals(F_AUTHOR)) {
                this.setAuthor(value);
            } else if (name.equals(F_PAGES)) {
                this.setPages(Integer.parseInt(value));
            } else if (name.equals(F_ISBN)) {
                this.setIsbn(value);
            } else if (name.equals(F_DATE)) {
                this.setDate(value);
            } else if (name.equals(F_PRICE)) {
                this.setPrice(Integer.parseInt(value));
            } else if (name.equals(F_SUMMARY)) {
                this.setSummary(value);
            }
        }

        public String getPublisher() {
            return publisher;
        }
        public void setPublisher(String publisher) {
            this.publisher = publisher;
        }
        public String getCategory() {
            return category;
        }
        public void setCategory(String category) {
            this.category = category;
        }
        public String getTitle() {
            return title;
        }
        public void setTitle(String title) {
            this.title = title;
        }
        public String getAuthor() {
            return author;
        }
        public void setAuthor(String author) {
            this.author = author;
        }
        public int getPages() {
            return pages;
        }
        public void setPages(int pages) {
            this.pages = pages;
        }
        public String getIsbn() {
            return isbn;
        }
        public void setIsbn(String isbn) {
            this.isbn = isbn;
        }
        public String getDate() {
            return date;
        }
        public void setDate(String date) {
            this.date = date;
        }
        public int getPrice() {
            return price;
        }
        public void setPrice(int price) {
            this.price = price;
        }
        public String getSummary() {
            return summary;
        }
        public void setSummary(String summary) {
            this.summary = summary;
        }
    }
}
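One thing to keep in mind when reading the book index back: the pages and price fields are stored in Lucene's prefix-coded form, so the stored strings have to be decoded with NumericUtils.prefixCodedToLong(). A minimal sketch, not part of the original gist, assuming the "book-index" directory built by BookIndexer.main() above:

package indexer;

import java.io.File;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.NumericUtils;

/**
 * A minimal sketch (not part of the original gist): decode the prefix-coded
 * numeric fields stored by BookIndexer.createDocument().
 */
public class ReadBookIndex {
    public static void main(String[] args) throws Exception {
        IndexReader reader = IndexReader.open(FSDirectory.open(new File("book-index")));
        Document doc = reader.document(0); // first document, for illustration
        System.out.println("title : " + doc.get(BookIndexer.F_TITLE));
        System.out.println("pages : " + NumericUtils.prefixCodedToLong(doc.get(BookIndexer.F_PAGES)));
        System.out.println("price : " + NumericUtils.prefixCodedToLong(doc.get(BookIndexer.F_PRICE)));
        reader.close();
    }
}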
http://mocobeta-backup.tumblr.com/post/25753926431/lucene-3-6

*** Measuring index creation time ***
- Varied the Analyzer class
- Varied MaxBufferedDocs
- MergeFactor and MaxMergeDocs left at their default values
(Note: measured inside a VM, and each figure is a single run, not an average over repeated runs.)

[Measurement environment]
CPU  : 2 vCPU cores (host CPU: Intel Core i5-2300)
RAM  : 4GB
OS   : (VM) CentOS 6 (64-bit)
Java : 1.6.0

==============================================================
BookIndexer
Number of records: 237
==============================================================
+-------------------------------------------------------+-----------------+---------------+
| Analyzer class                                        | MaxBufferedDocs | Indexing time |
+-------------------------------------------------------+-----------------+---------------+
| org.apache.lucene.analysis.ja.JapaneseAnalyzer        | 10              | 2587 msec     |
|                                                       | 100             | 2011 msec     |
+-------------------------------------------------------+-----------------+---------------+
| org.apache.lucene.analysis.gosen.GosenAnalyzer        | 10              | 1816 msec     |
|                                                       | 100             | 1381 msec     |
+-------------------------------------------------------+-----------------+---------------+
| org.apache.lucene.analysis.cjk.CJKAnalyzer            | 10              | 1870 msec     |
|                                                       | 100             | 1128 msec     |
+-------------------------------------------------------+-----------------+---------------+
| org.apache.lucene.analysis.standard.StandardAnalyzer  | 10              | 1679 msec     |
|                                                       | 100             | 934 msec      |
+-------------------------------------------------------+-----------------+---------------+

==============================================================
PostIndexer
Number of records: 21733
==============================================================
+-------------------------------------------------------+-----------------+---------------+
| Analyzer class                                        | MaxBufferedDocs | Indexing time |
+-------------------------------------------------------+-----------------+---------------+
| org.apache.lucene.analysis.ja.JapaneseAnalyzer        | 10              | 12608 msec    |
|                                                       | 100             | 8006 msec     |
|                                                       | 1000            | 7182 msec     |
+-------------------------------------------------------+-----------------+---------------+
| org.apache.lucene.analysis.gosen.GosenAnalyzer        | 10              | 10091 msec    |
|                                                       | 100             | 5720 msec     |
|                                                       | 1000            | 4983 msec     |
+-------------------------------------------------------+-----------------+---------------+
| org.apache.lucene.analysis.cjk.CJKAnalyzer            | 10              | 10411 msec    |
|                                                       | 100             | 5149 msec     |
|                                                       | 1000            | 3955 msec     |
+-------------------------------------------------------+-----------------+---------------+
| org.apache.lucene.analysis.standard.StandardAnalyzer  | 10              | 9255 msec     |
|                                                       | 100             | 4186 msec     |
|                                                       | 1000            | 3362 msec     |
+-------------------------------------------------------+-----------------+---------------+
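For reference, a single measurement in the tables above can be driven purely through the system properties that BaseIndexer reads; the property values below are illustrative, not the exact ones used for the tables. The same settings can equally be passed on the command line, e.g. java -Danalyzer=org.apache.lucene.analysis.cjk.CJKAnalyzer -Dmax.buffered.docs=100 indexer.BookIndexer.

package indexer;

/**
 * A minimal sketch (not part of the original gist) of one benchmark run,
 * configured through the system properties BaseIndexer reads.
 */
public class BenchmarkRun {
    public static void main(String[] args) {
        // Read by BaseIndexer.getAnalyzer() and getIndexWriter() respectively.
        System.setProperty("analyzer", "org.apache.lucene.analysis.cjk.CJKAnalyzer");
        System.setProperty("max.buffered.docs", "100");
        BookIndexer.main(new String[0]);
    }
}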
package indexer;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.StringTokenizer;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

/**
 * Indexer for postal code data (individual postal codes assigned to
 * high-volume business offices).
 * http://www.post.japanpost.jp/zipcode/dl/jigyosyo/index-zip.html
 */
public class PostIndexer extends BaseIndexer {

    private static final String CHARSET = "UTF-8";
    private static final String SEPARATOR = ",";
    private static final String QUOTE = "\"";

    public static final String F_CODE = "番号";
    public static final String F_KANA = "カナ";
    public static final String F_NAME = "名称";
    public static final String F_ADDR = "住所";
    public static final String F_CONTENT = "コンテンツ";
    public static final String F_INDZIP = "個別郵便番号";
    public static final String F_ZIP = "郵便番号";
    public static final String F_POST = "郵便局名";

    private String dataFile;
    private int count = 0;
    private BufferedReader reader;
    private String line;

    public static void main(String[] args) {
        PostIndexer indexer = new PostIndexer("resource/jigyosyo.csv", "post-index");
        indexer.makeIndex();
        System.out.println(indexer.count + " postal code records indexed.");
    }

    private PostIndexer(String dataFile, String indexDir) {
        this.dataFile = dataFile;
        this.indexDir = indexDir;
    }

    @Override
    protected void begin() {
        // Open the CSV file before indexing starts.
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(dataFile), CHARSET));
        } catch (IOException e) {
            throw new IndexerException(e);
        }
    }

    @Override
    protected void end() {
        try {
            reader.close();
        } catch (IOException e) {
            throw new IndexerException(e);
        }
    }

    @Override
    protected boolean hasNext() {
        try {
            line = reader.readLine();
            if (line != null) {
                count++;
                return true;
            }
            return false;
        } catch (IOException e) {
            throw new IndexerException(e);
        }
    }

    @Override
    protected Object next() {
        return line;
    }

    @Override
    protected Document createDocument(Object record) {
        // Split the CSV line; the columns follow the jigyosyo.csv layout.
        StringTokenizer tokenizer = new StringTokenizer((String) record, SEPARATOR);
        String code = unquote(tokenizer);
        String kana = unquote(tokenizer);
        String name = unquote(tokenizer);
        String addr1 = unquote(tokenizer);
        String addr2 = unquote(tokenizer);
        String addr3 = unquote(tokenizer);
        String addr4 = unquote(tokenizer);
        String indZip = unquote(tokenizer);
        String zip = unquote(tokenizer);
        String post = unquote(tokenizer);
        String addr = addr1 + addr2 + addr3 + addr4;
        String content = name + " " + addr;
        Document doc = new Document();
        doc.add(new Field(F_CODE, code, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_KANA, kana, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(F_NAME, name, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(F_ADDR, addr, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(F_CONTENT, content, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(F_INDZIP, indZip, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_ZIP, zip, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_POST, post, Field.Store.YES, Field.Index.NOT_ANALYZED));
        return doc;
    }

    /** Strips the surrounding double quotes from a CSV token, if present. */
    private String unquote(StringTokenizer tokenizer) {
        String quoted = tokenizer.nextToken();
        if (quoted.startsWith(QUOTE) && quoted.endsWith(QUOTE)) {
            return quoted.substring(1, quoted.length() - 1);
        }
        return quoted;
    }
}
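One caveat on the CSV parsing: StringTokenizer silently drops empty fields between consecutive commas, which would shift every later column. This should be harmless as long as empty string fields in jigyosyo.csv appear quoted as "" (still a real token), but for general CSV input String.split(",", -1) is safer. A minimal sketch of the difference, not part of the original gist, using made-up data:

package indexer;

import java.util.StringTokenizer;

/**
 * A minimal sketch (not part of the original gist): StringTokenizer skips
 * the empty third field, while String.split with a negative limit keeps it.
 */
public class CsvSplitDemo {
    public static void main(String[] args) {
        String line = "\"0600000\",\"サッポロシ\",,\"北海道\"";
        System.out.println(new StringTokenizer(line, ",").countTokens()); // prints 3
        System.out.println(line.split(",", -1).length);                   // prints 4
    }
}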