Created: June 23, 2012 15:48
Lucene入門 (Lucene Introduction), Chapter 3 index creation program - Lucene 3.6 version
package indexer;

import java.io.File;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Base Indexer class.
 */
public abstract class BaseIndexer {

    protected static final String DEFAULT_ANALYZER =
            "org.apache.lucene.analysis.ja.JapaneseAnalyzer";

    protected static final String PROP_ANALYZER = "analyzer";
    protected static final String PROP_MERGE_FACTOR = "merge.factor";
    protected static final String PROP_MAX_MERGE_DOCS = "max.merge.docs";
    protected static final String PROP_MAX_BUFFERED_DOCS = "max.buffered.docs";

    protected Analyzer analyzer;
    protected String indexDir;

    /** Returns the Analyzer, instantiating it lazily. */
    protected Analyzer getAnalyzer() {
        try {
            if (analyzer == null) {
                // Load the Analyzer class named by the "analyzer" system property
                // (JapaneseAnalyzer by default) and instantiate it reflectively
                // through its (Version) constructor.
                String ana = System.getProperty(PROP_ANALYZER, DEFAULT_ANALYZER);
                Class<? extends Analyzer> clazz =
                        getClass().getClassLoader().loadClass(ana).asSubclass(Analyzer.class);
                Constructor<? extends Analyzer> constructor = clazz.getConstructor(Version.class);
                analyzer = constructor.newInstance(Version.LUCENE_36);
                System.out.println("* Analyzer : " + ana);
            }
            return analyzer;
        } catch (Exception e) {
            throw new IndexerException(e);
        }
    }

    /** Returns the Directory in which the index is stored. */
    protected Directory getDirectory() {
        try {
            return FSDirectory.open(new File(indexDir));
        } catch (IOException e) {
            throw new IndexerException(e);
        }
    }

    /** Returns an IndexWriter. */
    protected IndexWriter getIndexWriter() {
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, getAnalyzer());
        // Policy settings: apply MergeFactor, MaxMergeDocs and MaxBufferedDocs
        // when they are given as system properties.
        LogMergePolicy policy = new LogDocMergePolicy();
        String mergeFactor = System.getProperty(PROP_MERGE_FACTOR);
        if (mergeFactor != null) {
            policy.setMergeFactor(Integer.parseInt(mergeFactor));
        }
        String maxMergeDocs = System.getProperty(PROP_MAX_MERGE_DOCS);
        if (maxMergeDocs != null) {
            policy.setMaxMergeDocs(Integer.parseInt(maxMergeDocs));
        }
        String maxBufferedDocs = System.getProperty(PROP_MAX_BUFFERED_DOCS);
        if (maxBufferedDocs != null) {
            config.setMaxBufferedDocs(Integer.parseInt(maxBufferedDocs));
        }
        config.setMergePolicy(policy);
        System.out.println("* Merge factor : " + policy.getMergeFactor());
        System.out.println("* Max merge docs : " + policy.getMaxMergeDocs());
        System.out.println("* Max buffered docs : " + config.getMaxBufferedDocs());
        try {
            return new IndexWriter(getDirectory(), config);
        } catch (IOException e) {
            throw new IndexerException(e);
        }
    }

    /** Builds the index. */
    protected void makeIndex() {
        long s = System.currentTimeMillis();
        System.out.println("* Start time : " + new Date(s));
        IndexWriter writer = getIndexWriter();
        begin();
        try {
            // Register every target document with the index.
            while (hasNext()) {
                Object record = next();
                writer.addDocument(createDocument(record));
            }
        } catch (IOException e) {
            throw new IndexerException(e);
        } finally {
            try {
                writer.close();
            } catch (IOException e) {
                // ignore failures while closing
            }
            end();
        }
        long e = System.currentTimeMillis();
        System.out.println("* End time : " + new Date(e) + " (elapsed " + (e - s) + " msec)");
    }

    /** Hook invoked before indexing starts; subclasses may override. */
    protected void begin() {}

    /** Hook invoked after indexing finishes; subclasses may override. */
    protected void end() {}

    /** Whether a next document exists. Implemented by subclasses. */
    protected abstract boolean hasNext();

    /** Returns the next document. Implemented by subclasses. */
    protected abstract Object next();

    /** Converts a record into a Lucene Document. Implemented by subclasses. */
    protected abstract Document createDocument(Object record);

    public static class IndexerException extends RuntimeException {
        public IndexerException(Exception e) {
            super(e);
        }
    }
}
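BaseIndexer follows the template-method pattern: makeIndex() drives the indexing loop, while a concrete subclass supplies only the record iteration (hasNext()/next()), the conversion to a Lucene Document (createDocument()), and the optional begin()/end() hooks. Below is a minimal sketch of a subclass, not part of the original gist; the two-line in-memory data set, the "text" field name, and the "lines-index" directory are made up purely to show the contract, and it assumes the default analyzer's jar is on the classpath.

package indexer;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

/**
 * A minimal sketch (not part of the original gist) of the BaseIndexer
 * subclass contract.
 */
public class LinesIndexer extends BaseIndexer {

    private final String[] lines = { "hello", "world" };
    private int pos = 0;

    public LinesIndexer(String indexDir) {
        this.indexDir = indexDir;
    }

    @Override
    protected boolean hasNext() {
        return pos < lines.length;
    }

    @Override
    protected Object next() {
        // Hand the raw record to makeIndex(); createDocument() converts it.
        return lines[pos++];
    }

    @Override
    protected Document createDocument(Object record) {
        Document doc = new Document();
        doc.add(new Field("text", (String) record, Field.Store.YES, Field.Index.ANALYZED));
        return doc;
    }

    public static void main(String[] args) {
        new LinesIndexer("lines-index").makeIndex();
    }
}

Because getAnalyzer() and getIndexWriter() read their settings from system properties, the same subclass can be re-run with a different Analyzer or buffer size without any code change.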
package indexer;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.util.NumericUtils;

/**
 * Indexer for the sample book data.
 * The data can be downloaded from the support page of the book "Lucene入門":
 * http://gihyo.jp/book/2006/4-7741-2780-9/support
 */
public class BookIndexer extends BaseIndexer {

    public static final String F_PUBLISHER = "publisher";
    public static final String F_CATEGORY = "category";
    public static final String F_TITLE = "title";
    public static final String F_AUTHOR = "author";
    public static final String F_PAGES = "pages";
    public static final String F_ISBN = "isbn";
    public static final String F_DATE = "date";
    public static final String F_PRICE = "price";
    public static final String F_SUMMARY = "summary";

    private String dataDir;
    private String[] xmlFiles;
    private int num = 0;
    private int count = 0;

    public static void main(String[] args) {
        BookIndexer indexer = new BookIndexer("resource/book-data", "book-index");
        indexer.makeIndex();
        System.out.println(indexer.count + " book records indexed.");
    }

    private BookIndexer(String dataDir, String indexDir) {
        this.dataDir = dataDir;
        this.indexDir = indexDir;
        this.xmlFiles = new File(dataDir).list();
    }

    @Override
    protected boolean hasNext() {
        // Skip over any files that are not XML.
        while (num < xmlFiles.length) {
            if (!xmlFiles[num].endsWith(".xml")) {
                num++;
            } else {
                count++;
                return true;
            }
        }
        return false;
    }

    @Override
    protected Object next() {
        String file = dataDir + File.separator + xmlFiles[num++];
        return createBookInfo(file);
    }

    /** Reads an XML file and builds a BookInfo object from it. */
    private BookInfo createBookInfo(String path) {
        BookInfo bookInfo = new BookInfo();
        // Use StAX for XML parsing.
        XMLInputFactory factory = XMLInputFactory.newFactory();
        XMLStreamReader reader = null;
        BufferedInputStream is = null;
        try {
            is = new BufferedInputStream(new FileInputStream(path));
            reader = factory.createXMLStreamReader(is);
            String tagName = null;
            while (reader.hasNext()) {
                int eventType = reader.next();
                if (eventType == XMLStreamReader.START_ELEMENT) {
                    tagName = reader.getName().getLocalPart();
                } else if (eventType == XMLStreamReader.END_ELEMENT) {
                    tagName = null;
                } else if (eventType == XMLStreamReader.CHARACTERS) {
                    if (tagName != null) {
                        // Map the element name to the matching BookInfo property.
                        bookInfo.setProperty(tagName, reader.getText());
                    }
                }
            }
        } catch (IOException e) {
            throw new IndexerException(e);
        } catch (XMLStreamException e) {
            throw new IndexerException(e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (XMLStreamException e) {
                    // ignore failures while closing
                }
            }
            if (is != null) {
                try {
                    is.close();
                } catch (IOException e) {
                    // ignore failures while closing
                }
            }
        }
        return bookInfo;
    }

    @Override
    protected Document createDocument(Object record) {
        BookInfo bookInfo = (BookInfo) record;
        Document doc = new Document();
        doc.add(new Field(F_PUBLISHER, bookInfo.getPublisher(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_CATEGORY, bookInfo.getCategory(), Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(F_TITLE, bookInfo.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(F_AUTHOR, bookInfo.getAuthor(), Field.Store.YES, Field.Index.ANALYZED));
        // Numeric values are indexed in prefix-coded form so they order correctly.
        doc.add(new Field(F_PAGES, NumericUtils.longToPrefixCoded(bookInfo.getPages()), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_ISBN, bookInfo.getIsbn(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_DATE, bookInfo.getDate(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_PRICE, NumericUtils.longToPrefixCoded(bookInfo.getPrice()), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_SUMMARY, bookInfo.getSummary(), Field.Store.YES, Field.Index.ANALYZED));
        return doc;
    }

    /** POJO holding one book record. */
    public static class BookInfo {
        private String publisher;
        private String category;
        private String title;
        private String author;
        private int pages;
        private String isbn;
        private String date;
        private int price;
        private String summary;

        public void setProperty(String name, String value) {
            if (name.equals(F_PUBLISHER)) {
                this.setPublisher(value);
            } else if (name.equals(F_CATEGORY)) {
                this.setCategory(value);
            } else if (name.equals(F_TITLE)) {
                this.setTitle(value);
            } else if (name.equals(F_AUTHOR)) {
                this.setAuthor(value);
            } else if (name.equals(F_PAGES)) {
                this.setPages(Integer.parseInt(value));
            } else if (name.equals(F_ISBN)) {
                this.setIsbn(value);
            } else if (name.equals(F_DATE)) {
                this.setDate(value);
            } else if (name.equals(F_PRICE)) {
                this.setPrice(Integer.parseInt(value));
            } else if (name.equals(F_SUMMARY)) {
                this.setSummary(value);
            }
        }

        public String getPublisher() {
            return publisher;
        }
        public void setPublisher(String publisher) {
            this.publisher = publisher;
        }
        public String getCategory() {
            return category;
        }
        public void setCategory(String category) {
            this.category = category;
        }
        public String getTitle() {
            return title;
        }
        public void setTitle(String title) {
            this.title = title;
        }
        public String getAuthor() {
            return author;
        }
        public void setAuthor(String author) {
            this.author = author;
        }
        public int getPages() {
            return pages;
        }
        public void setPages(int pages) {
            this.pages = pages;
        }
        public String getIsbn() {
            return isbn;
        }
        public void setIsbn(String isbn) {
            this.isbn = isbn;
        }
        public String getDate() {
            return date;
        }
        public void setDate(String date) {
            this.date = date;
        }
        public int getPrice() {
            return price;
        }
        public void setPrice(int price) {
            this.price = price;
        }
        public String getSummary() {
            return summary;
        }
        public void setSummary(String summary) {
            this.summary = summary;
        }
    }
}
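One thing to keep in mind when reading the book index back: the pages and price fields are stored in Lucene's prefix-coded form, so the stored strings have to be decoded with NumericUtils.prefixCodedToLong(). A minimal sketch, not part of the original gist, assuming the "book-index" directory built by BookIndexer.main() above:

package indexer;

import java.io.File;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.NumericUtils;

/**
 * A minimal sketch (not part of the original gist): decode the prefix-coded
 * numeric fields stored by BookIndexer.createDocument().
 */
public class ReadBookIndex {
    public static void main(String[] args) throws Exception {
        IndexReader reader = IndexReader.open(FSDirectory.open(new File("book-index")));
        Document doc = reader.document(0); // first document, for illustration
        System.out.println("title : " + doc.get(BookIndexer.F_TITLE));
        System.out.println("pages : " + NumericUtils.prefixCodedToLong(doc.get(BookIndexer.F_PAGES)));
        System.out.println("price : " + NumericUtils.prefixCodedToLong(doc.get(BookIndexer.F_PRICE)));
        reader.close();
    }
}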
http://mocobeta-backup.tumblr.com/post/25753926431/lucene-3-6

*** Measuring index creation time ***
- Varied the Analyzer class
- Varied MaxBufferedDocs
- MergeFactor and MaxMergeDocs left at their default values
(Note: measured inside a VM, and each figure is a single run, not an average over repeated runs.)

[Measurement environment]
CPU  : 2 vCPU cores (host CPU: Intel Core i5-2300)
RAM  : 4GB
OS   : (VM) CentOS 6 (64-bit)
Java : 1.6.0

==============================================================
BookIndexer
Number of records: 237
==============================================================
+-------------------------------------------------------+-----------------+---------------+
| Analyzer class                                        | MaxBufferedDocs | Indexing time |
+-------------------------------------------------------+-----------------+---------------+
| org.apache.lucene.analysis.ja.JapaneseAnalyzer        | 10              | 2587 msec     |
|                                                       | 100             | 2011 msec     |
+-------------------------------------------------------+-----------------+---------------+
| org.apache.lucene.analysis.gosen.GosenAnalyzer        | 10              | 1816 msec     |
|                                                       | 100             | 1381 msec     |
+-------------------------------------------------------+-----------------+---------------+
| org.apache.lucene.analysis.cjk.CJKAnalyzer            | 10              | 1870 msec     |
|                                                       | 100             | 1128 msec     |
+-------------------------------------------------------+-----------------+---------------+
| org.apache.lucene.analysis.standard.StandardAnalyzer  | 10              | 1679 msec     |
|                                                       | 100             | 934 msec      |
+-------------------------------------------------------+-----------------+---------------+

==============================================================
PostIndexer
Number of records: 21733
==============================================================
+-------------------------------------------------------+-----------------+---------------+
| Analyzer class                                        | MaxBufferedDocs | Indexing time |
+-------------------------------------------------------+-----------------+---------------+
| org.apache.lucene.analysis.ja.JapaneseAnalyzer        | 10              | 12608 msec    |
|                                                       | 100             | 8006 msec     |
|                                                       | 1000            | 7182 msec     |
+-------------------------------------------------------+-----------------+---------------+
| org.apache.lucene.analysis.gosen.GosenAnalyzer        | 10              | 10091 msec    |
|                                                       | 100             | 5720 msec     |
|                                                       | 1000            | 4983 msec     |
+-------------------------------------------------------+-----------------+---------------+
| org.apache.lucene.analysis.cjk.CJKAnalyzer            | 10              | 10411 msec    |
|                                                       | 100             | 5149 msec     |
|                                                       | 1000            | 3955 msec     |
+-------------------------------------------------------+-----------------+---------------+
| org.apache.lucene.analysis.standard.StandardAnalyzer  | 10              | 9255 msec     |
|                                                       | 100             | 4186 msec     |
|                                                       | 1000            | 3362 msec     |
+-------------------------------------------------------+-----------------+---------------+
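For reference, a single measurement in the tables above can be driven purely through the system properties that BaseIndexer reads; the property values below are illustrative, not the exact ones used for the tables. The same settings can equally be passed on the command line, e.g. java -Danalyzer=org.apache.lucene.analysis.cjk.CJKAnalyzer -Dmax.buffered.docs=100 indexer.BookIndexer.

package indexer;

/**
 * A minimal sketch (not part of the original gist) of one benchmark run,
 * configured through the system properties BaseIndexer reads.
 */
public class BenchmarkRun {
    public static void main(String[] args) {
        // Read by BaseIndexer.getAnalyzer() and getIndexWriter() respectively.
        System.setProperty("analyzer", "org.apache.lucene.analysis.cjk.CJKAnalyzer");
        System.setProperty("max.buffered.docs", "100");
        BookIndexer.main(new String[0]);
    }
}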
package indexer;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.StringTokenizer;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

/**
 * Indexer for postal code data (individual postal codes assigned to
 * high-volume business offices).
 * http://www.post.japanpost.jp/zipcode/dl/jigyosyo/index-zip.html
 */
public class PostIndexer extends BaseIndexer {

    private static final String CHARSET = "UTF-8";
    private static final String SEPARATOR = ",";
    private static final String QUOTE = "\"";

    public static final String F_CODE = "番号";
    public static final String F_KANA = "カナ";
    public static final String F_NAME = "名称";
    public static final String F_ADDR = "住所";
    public static final String F_CONTENT = "コンテンツ";
    public static final String F_INDZIP = "個別郵便番号";
    public static final String F_ZIP = "郵便番号";
    public static final String F_POST = "郵便局名";

    private String dataFile;
    private int count = 0;
    private BufferedReader reader;
    private String line;

    public static void main(String[] args) {
        PostIndexer indexer = new PostIndexer("resource/jigyosyo.csv", "post-index");
        indexer.makeIndex();
        System.out.println(indexer.count + " postal code records indexed.");
    }

    private PostIndexer(String dataFile, String indexDir) {
        this.dataFile = dataFile;
        this.indexDir = indexDir;
    }

    @Override
    protected void begin() {
        // Open the CSV file before indexing starts.
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(dataFile), CHARSET));
        } catch (IOException e) {
            throw new IndexerException(e);
        }
    }

    @Override
    protected void end() {
        try {
            reader.close();
        } catch (IOException e) {
            throw new IndexerException(e);
        }
    }

    @Override
    protected boolean hasNext() {
        try {
            line = reader.readLine();
            if (line != null) {
                count++;
                return true;
            }
            return false;
        } catch (IOException e) {
            throw new IndexerException(e);
        }
    }

    @Override
    protected Object next() {
        return line;
    }

    @Override
    protected Document createDocument(Object record) {
        // Split the CSV line; the columns follow the jigyosyo.csv layout.
        StringTokenizer tokenizer = new StringTokenizer((String) record, SEPARATOR);
        String code = unquote(tokenizer);
        String kana = unquote(tokenizer);
        String name = unquote(tokenizer);
        String addr1 = unquote(tokenizer);
        String addr2 = unquote(tokenizer);
        String addr3 = unquote(tokenizer);
        String addr4 = unquote(tokenizer);
        String indZip = unquote(tokenizer);
        String zip = unquote(tokenizer);
        String post = unquote(tokenizer);
        String addr = addr1 + addr2 + addr3 + addr4;
        String content = name + " " + addr;
        Document doc = new Document();
        doc.add(new Field(F_CODE, code, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_KANA, kana, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(F_NAME, name, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(F_ADDR, addr, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(F_CONTENT, content, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field(F_INDZIP, indZip, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_ZIP, zip, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(F_POST, post, Field.Store.YES, Field.Index.NOT_ANALYZED));
        return doc;
    }

    /** Strips the surrounding double quotes from a CSV token, if present. */
    private String unquote(StringTokenizer tokenizer) {
        String quoted = tokenizer.nextToken();
        if (quoted.startsWith(QUOTE) && quoted.endsWith(QUOTE)) {
            return quoted.substring(1, quoted.length() - 1);
        }
        return quoted;
    }
}
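One caveat on the CSV parsing: StringTokenizer silently drops empty fields between consecutive commas, which would shift every later column. This should be harmless as long as empty string fields in jigyosyo.csv appear quoted as "" (still a real token), but for general CSV input String.split(",", -1) is safer. A minimal sketch of the difference, not part of the original gist, using made-up data:

package indexer;

import java.util.StringTokenizer;

/**
 * A minimal sketch (not part of the original gist): StringTokenizer skips
 * the empty third field, while String.split with a negative limit keeps it.
 */
public class CsvSplitDemo {
    public static void main(String[] args) {
        String line = "\"0600000\",\"サッポロシ\",,\"北海道\"";
        System.out.println(new StringTokenizer(line, ",").countTokens()); // prints 3
        System.out.println(line.split(",", -1).length);                   // prints 4
    }
}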