mocobeta · August 2, 2012 17:38
diff --git a/gistfile1.txt b/gistfile1.txt
 schema.xml は schema version 1.5 に合わせて修正された版を、以下から頂きました。
 http://johtani.jugem.jp/?eid=44 

 --text_ja の analyzer の箇所だけを抜粋--

 <analyzer type="index">
    <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ja.txt"/>
    <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
    <filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms-ja.txt" ignoreCase="true" expand="true" 
            tokenizerFactory="solr.JapaneseTokenizerFactory"/>
    <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="pos-deny.txt" enablePositionIncrements="true"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords-ja.txt"/>
    <filter class="solr.LowerCaseFilterFactory"/>
 </analyzer>

 -------------------------------------

diff --git a/SolrbookAnalyzer.java b/SolrbookAnalyzer.java
 package test.solrbook;

 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileReader;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;

 import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute;
 import org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute;
 import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
 import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.ResourceLoader;
 import org.apache.lucene.util.Version;
 import org.apache.solr.analysis.JapaneseKatakanaStemFilterFactory;
 import org.apache.solr.analysis.JapanesePartOfSpeechStopFilterFactory;
 import org.apache.solr.analysis.JapaneseTokenizerFactory;
 import org.apache.solr.analysis.LowerCaseFilterFactory;
 import org.apache.solr.analysis.MappingCharFilterFactory;
 import org.apache.solr.analysis.StopFilterFactory;
 import org.apache.solr.analysis.SynonymFilterFactory;

 /**
  * Stand-alone re-creation of the index-time analyzer of the {@code text_ja}
  * field type from chapter 2 of the book "Apache Solr入門".
  *
  * <p>The chain is assembled by hand from the same Solr factories, with the same
  * arguments, that the schema.xml excerpt declares: one char filter, the Japanese
  * tokenizer, and five token filters. It is intended to behave the same as that
  * analyzer and runs on Lucene/Solr 4.0 alpha.
  */
 public class SolrbookAnalyzer {

 	/** Loader handed to the factories so they can read their configuration files. */
 	private final ResourceLoader resourceLoader = new MyResourceLoader();

 	/**
 	 * Analyzes one fixed sample sentence and prints the resulting tokens.
 	 *
 	 * @param args ignored
 	 */
 	public static void main(String[] args) {
 		String content = "ｿｰﾗｰは検索ｴﾝｼﾞﾝです。";
 		SolrbookAnalyzer analyzer = new SolrbookAnalyzer();
 		analyzer.analyze(new StringReader(content));
 	}

 	/**
 	 * Runs {@code reader} through the whole chain and prints one tab-separated
 	 * line per token to standard output: term, base form, inflection form,
 	 * reading, pronunciation, part of speech.
 	 *
 	 * <p>An {@link IOException} raised while consuming or closing the stream is
 	 * reported with {@code printStackTrace()} and otherwise ignored.
 	 *
 	 * @param reader source of the text to analyze; it is wrapped by the stream
 	 *               that this method closes when it is done
 	 */
 	private void analyze(Reader reader) {
 		CharStream charStream = charFilter(reader);
 		TokenStream tokenizer = tokenizer(charStream);
 		TokenStream stream = filter(tokenizer);
 		try {
 			// The attribute instances are shared along the chain and stay the same
 			// for every token, so they are looked up once instead of per token.
 			CharTermAttribute charAtt = stream.getAttribute(CharTermAttribute.class);
 			BaseFormAttribute bfAtt = stream.getAttribute(BaseFormAttribute.class);
 			InflectionAttribute infAtt = stream.getAttribute(InflectionAttribute.class);
 			ReadingAttribute readAtt = stream.getAttribute(ReadingAttribute.class);
 			PartOfSpeechAttribute posAtt = stream.getAttribute(PartOfSpeechAttribute.class);

 			// TokenStream consumer workflow: reset(), incrementToken() until it
 			// returns false, end(), and finally close().
 			stream.reset();
 			while (stream.incrementToken()) {
 				System.out.println(
 						charAtt + "\t" + // token
 						bfAtt.getBaseForm() + "\t" + // base form
 						infAtt.getInflectionForm() + "\t" + // inflection form
 						readAtt.getReading() + "\t" + // reading
 						readAtt.getPronunciation() + "\t" + // pronunciation
 						posAtt.getPartOfSpeech() // part of speech
 						);
 			}
 			stream.end();
 		} catch (IOException e) {
 			e.printStackTrace();
 		} finally {
 			try {
 				stream.close();
 			} catch (IOException e) {
 				e.printStackTrace();
 			}
 		}
 	}

 	/**
 	 * Stand-in for the {@code <charFilter>} element: wraps {@code reader} with
 	 * solr.MappingCharFilterFactory configured with {@code mapping-ja.txt}.
 	 *
 	 * @param reader raw input text
 	 * @return the character stream produced by the mapping char filter
 	 */
 	private CharStream charFilter(Reader reader) {
 		// solr.MappingCharFilterFactory
 		MappingCharFilterFactory factory = new MappingCharFilterFactory();
 		Map<String, String> args = new HashMap<String, String>();
 		args.put("mapping", "mapping-ja.txt");
 		factory.init(args);
 		factory.inform(resourceLoader);
 		CharStream stream = factory.create(CharReader.get(reader));
 		return stream;
 	}

 	/**
 	 * Stand-in for the {@code <tokenizer>} element: solr.JapaneseTokenizerFactory
 	 * with {@code mode="search"}.
 	 *
 	 * @param charStream output of {@link #charFilter(Reader)}
 	 * @return the tokenizer reading from {@code charStream}
 	 */
 	private TokenStream tokenizer(CharStream charStream) {
 		// solr.JapaneseTokenizerFactory
 		JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory();
 		Map<String, String> args = new HashMap<String, String>();
 		args.put("mode", "search");
 		factory.init(args);
 		factory.inform(resourceLoader);

 		TokenStream stream = factory.create(charStream);
 		return stream;
 	}

 	/**
 	 * Stand-in for the {@code <filter>} elements: stacks the five token filters
 	 * on top of {@code tokenizer} in the order they appear in the schema.
 	 *
 	 * @param tokenizer the token source to decorate
 	 * @return the outermost filter of the chain
 	 */
 	private TokenStream filter(TokenStream tokenizer) {
 		TokenStream stream = null;

 		// solr.JapaneseKatakanaStemFilterFactory
 		JapaneseKatakanaStemFilterFactory factory1 = new JapaneseKatakanaStemFilterFactory();
 		Map<String, String> args1 = new HashMap<String, String>();
 		args1.put("minimumLength", "4");
 		factory1.init(args1);
 		stream = factory1.create(tokenizer);

 		// solr.SynonymFilterFactory
 		SynonymFilterFactory factory2 = new SynonymFilterFactory();
 		Map<String, String> args2 = new HashMap<String, String>();
 		args2.put("synonyms", "synonyms-ja.txt");
 		args2.put("ignoreCase", "true");
 		args2.put("expand", "true");
 		// Fully qualified because MyResourceLoader.newInstance does not expand
 		// the "solr." shorthand used in schema.xml.
 		args2.put("tokenizerFactory", "org.apache.solr.analysis.JapaneseTokenizerFactory");
 		factory2.setLuceneMatchVersion(Version.LUCENE_40);
 		factory2.init(args2);
 		factory2.inform(resourceLoader);
 		stream = factory2.create(stream);

 		// solr.JapanesePartOfSpeechStopFilterFactory
 		JapanesePartOfSpeechStopFilterFactory factory3 = new JapanesePartOfSpeechStopFilterFactory();
 		Map<String, String> args3 = new HashMap<String, String>();
 		args3.put("tags", "pos-deny.txt");
 		args3.put("enablePositionIncrements", "true");
 		factory3.setLuceneMatchVersion(Version.LUCENE_40);
 		factory3.init(args3);
 		factory3.inform(resourceLoader);
 		stream = factory3.create(stream);

 		// solr.StopFilterFactory
 		StopFilterFactory factory4 = new StopFilterFactory();
 		Map<String, String> args4 = new HashMap<String, String>();
 		args4.put("ignoreCase", "true");
 		args4.put("words", "stopwords-ja.txt");
 		factory4.setLuceneMatchVersion(Version.LUCENE_40);
 		factory4.init(args4);
 		factory4.inform(resourceLoader);
 		stream = factory4.create(stream);

 		// solr.LowerCaseFilterFactory
 		LowerCaseFilterFactory factory5 = new LowerCaseFilterFactory();
 		factory5.setLuceneMatchVersion(Version.LUCENE_40);
 		stream = factory5.create(stream);

 		return stream;
 	}

 	/**
 	 * Minimal stand-in for Solr's resource loader: resolves every resource name
 	 * against the {@link #confdir} directory on the local file system.
 	 */
 	class MyResourceLoader implements ResourceLoader {
 		// Directory holding the configuration files. Put mapping-ja.txt,
 		// pos-deny.txt, stopwords-ja.txt and synonyms-ja.txt here; the book's
 		// samples (downloaded from its support site) are used unchanged.
 		private static final String confdir = "solrbook";

 		/**
 		 * Opens {@code resource} inside {@link #confdir}.
 		 *
 		 * @param resource file name relative to the configuration directory
 		 * @return a new stream the caller must close
 		 * @throws IOException if the file cannot be opened
 		 */
 		@Override
 		public InputStream openResource(String resource) throws IOException {
 			return new FileInputStream(new File(confdir, resource));
 		}

 		/**
 		 * Reads {@code resource} as UTF-8 text and returns its data lines.
 		 * Empty lines and lines starting with {@code #} are skipped.
 		 *
 		 * @param resource file name relative to the configuration directory
 		 * @return the remaining lines in file order
 		 * @throws IOException if the file cannot be opened or read
 		 */
 		@Override
 		public List<String> getLines(String resource) throws IOException {
 			List<String> lines = new ArrayList<String>();
 			// Decode explicitly as UTF-8: FileReader would use the platform default
 			// charset and garble the Japanese entries on non-UTF-8 systems.
 			BufferedReader reader = new BufferedReader(
 					new java.io.InputStreamReader(openResource(resource), "UTF-8"));
 			try {
 				String line;
 				while ((line = reader.readLine()) != null) {
 					if (line.length() == 0 || line.startsWith("#")) {
 						continue;
 					}
 					lines.add(line);
 				}
 			} finally {
 				// Release the file handle even when readLine() fails.
 				reader.close();
 			}
 			return lines;
 		}

 		/**
 		 * Instantiates {@code cname} through its public no-argument constructor.
 		 *
 		 * <p>This is a deliberately simple implementation: {@code subpackages} is
 		 * ignored, so {@code cname} must be a fully qualified class name.
 		 *
 		 * @param cname        fully qualified name of the class to instantiate
 		 * @param expectedType type the new instance must be assignable to
 		 * @param subpackages  ignored
 		 * @return a new instance of {@code cname}, never {@code null}
 		 * @throws IllegalArgumentException if the class cannot be loaded, is not a
 		 *         subtype of {@code expectedType}, or cannot be instantiated
 		 */
 		@Override
 		public <T> T newInstance(String cname, Class<T> expectedType,
 				String... subpackages) {
 			try {
 				ClassLoader classLoader = MyResourceLoader.class.getClassLoader();
 				Class<? extends T> clazz = Class.forName(cname, true, classLoader).asSubclass(expectedType);
 				return clazz.newInstance();
 			} catch (Exception e) {
 				// Fail here with the cause attached rather than returning null and
 				// provoking an unexplained NullPointerException in the caller.
 				throw new IllegalArgumentException(
 						"Cannot instantiate " + cname + " as " + expectedType.getName(), e);
 			}
 		}

 	}
 }
diff --git a/実行結果 b/実行結果
 solr	null	null	null	null	null
 ソーラ	null	null	null	null	null
 検索	null	null	ケンサク	ケンサク	名詞-サ変接続
 エンジン	null	null	エンジン	エンジン	名詞-一般

 （実際にSolrを起動して、管理画面から「Field Analysis」にかけた時と同じ出力となります。）
	schema.xml は schema version 1.5 に合わせて修正された版を、以下から頂きました。
	http://johtani.jugem.jp/?eid=44

	--text_ja の analyzer の箇所だけを抜粋--

	<analyzer type="index">
	<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ja.txt"/>
	<tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
	<filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
	<filter class="solr.SynonymFilterFactory" synonyms="synonyms-ja.txt" ignoreCase="true" expand="true"
	tokenizerFactory="solr.JapaneseTokenizerFactory"/>
	<filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="pos-deny.txt" enablePositionIncrements="true"/>
	<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords-ja.txt"/>
	<filter class="solr.LowerCaseFilterFactory"/>
	</analyzer>

	-------------------------------------
	package test.solrbook;

	import java.io.BufferedReader;
	import java.io.File;
	import java.io.FileInputStream;
	import java.io.FileReader;
	import java.io.IOException;
	import java.io.InputStream;
	import java.io.Reader;
	import java.io.StringReader;
	import java.util.ArrayList;
	import java.util.HashMap;
	import java.util.List;
	import java.util.Map;

	import org.apache.lucene.analysis.CharReader;
	import org.apache.lucene.analysis.CharStream;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute;
	import org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute;
	import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
	import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute;
	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
	import org.apache.lucene.analysis.util.ResourceLoader;
	import org.apache.lucene.util.Version;
	import org.apache.solr.analysis.JapaneseKatakanaStemFilterFactory;
	import org.apache.solr.analysis.JapanesePartOfSpeechStopFilterFactory;
	import org.apache.solr.analysis.JapaneseTokenizerFactory;
	import org.apache.solr.analysis.LowerCaseFilterFactory;
	import org.apache.solr.analysis.MappingCharFilterFactory;
	import org.apache.solr.analysis.StopFilterFactory;
	import org.apache.solr.analysis.SynonymFilterFactory;

	/**
	 * Stand-alone re-creation of the index-time analyzer of the {@code text_ja}
	 * field type from chapter 2 of the book "Apache Solr入門".
	 *
	 * <p>The chain is assembled by hand from the same Solr factories, with the same
	 * arguments, that the schema.xml excerpt declares: one char filter, the Japanese
	 * tokenizer, and five token filters. It is intended to behave the same as that
	 * analyzer and runs on Lucene/Solr 4.0 alpha.
	 */
	public class SolrbookAnalyzer {

		/** Loader handed to the factories so they can read their configuration files. */
		private final ResourceLoader resourceLoader = new MyResourceLoader();

		/**
		 * Analyzes one fixed sample sentence and prints the resulting tokens.
		 *
		 * @param args ignored
		 */
		public static void main(String[] args) {
			String content = "ｿｰﾗｰは検索ｴﾝｼﾞﾝです。";
			SolrbookAnalyzer analyzer = new SolrbookAnalyzer();
			analyzer.analyze(new StringReader(content));
		}

		/**
		 * Runs {@code reader} through the whole chain and prints one tab-separated
		 * line per token to standard output: term, base form, inflection form,
		 * reading, pronunciation, part of speech.
		 *
		 * <p>An {@link IOException} raised while consuming or closing the stream is
		 * reported with {@code printStackTrace()} and otherwise ignored.
		 *
		 * @param reader source of the text to analyze; it is wrapped by the stream
		 *               that this method closes when it is done
		 */
		private void analyze(Reader reader) {
			CharStream charStream = charFilter(reader);
			TokenStream tokenizer = tokenizer(charStream);
			TokenStream stream = filter(tokenizer);
			try {
				// The attribute instances are shared along the chain and stay the same
				// for every token, so they are looked up once instead of per token.
				CharTermAttribute charAtt = stream.getAttribute(CharTermAttribute.class);
				BaseFormAttribute bfAtt = stream.getAttribute(BaseFormAttribute.class);
				InflectionAttribute infAtt = stream.getAttribute(InflectionAttribute.class);
				ReadingAttribute readAtt = stream.getAttribute(ReadingAttribute.class);
				PartOfSpeechAttribute posAtt = stream.getAttribute(PartOfSpeechAttribute.class);

				// TokenStream consumer workflow: reset(), incrementToken() until it
				// returns false, end(), and finally close().
				stream.reset();
				while (stream.incrementToken()) {
					System.out.println(
							charAtt + "\t" + // token
							bfAtt.getBaseForm() + "\t" + // base form
							infAtt.getInflectionForm() + "\t" + // inflection form
							readAtt.getReading() + "\t" + // reading
							readAtt.getPronunciation() + "\t" + // pronunciation
							posAtt.getPartOfSpeech() // part of speech
							);
				}
				stream.end();
			} catch (IOException e) {
				e.printStackTrace();
			} finally {
				try {
					stream.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}

		/**
		 * Stand-in for the {@code <charFilter>} element: wraps {@code reader} with
		 * solr.MappingCharFilterFactory configured with {@code mapping-ja.txt}.
		 *
		 * @param reader raw input text
		 * @return the character stream produced by the mapping char filter
		 */
		private CharStream charFilter(Reader reader) {
			// solr.MappingCharFilterFactory
			MappingCharFilterFactory factory = new MappingCharFilterFactory();
			Map<String, String> args = new HashMap<String, String>();
			args.put("mapping", "mapping-ja.txt");
			factory.init(args);
			factory.inform(resourceLoader);
			CharStream stream = factory.create(CharReader.get(reader));
			return stream;
		}

		/**
		 * Stand-in for the {@code <tokenizer>} element: solr.JapaneseTokenizerFactory
		 * with {@code mode="search"}.
		 *
		 * @param charStream output of {@link #charFilter(Reader)}
		 * @return the tokenizer reading from {@code charStream}
		 */
		private TokenStream tokenizer(CharStream charStream) {
			// solr.JapaneseTokenizerFactory
			JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory();
			Map<String, String> args = new HashMap<String, String>();
			args.put("mode", "search");
			factory.init(args);
			factory.inform(resourceLoader);

			TokenStream stream = factory.create(charStream);
			return stream;
		}

		/**
		 * Stand-in for the {@code <filter>} elements: stacks the five token filters
		 * on top of {@code tokenizer} in the order they appear in the schema.
		 *
		 * @param tokenizer the token source to decorate
		 * @return the outermost filter of the chain
		 */
		private TokenStream filter(TokenStream tokenizer) {
			TokenStream stream = null;

			// solr.JapaneseKatakanaStemFilterFactory
			JapaneseKatakanaStemFilterFactory factory1 = new JapaneseKatakanaStemFilterFactory();
			Map<String, String> args1 = new HashMap<String, String>();
			args1.put("minimumLength", "4");
			factory1.init(args1);
			stream = factory1.create(tokenizer);

			// solr.SynonymFilterFactory
			SynonymFilterFactory factory2 = new SynonymFilterFactory();
			Map<String, String> args2 = new HashMap<String, String>();
			args2.put("synonyms", "synonyms-ja.txt");
			args2.put("ignoreCase", "true");
			args2.put("expand", "true");
			// Fully qualified because MyResourceLoader.newInstance does not expand
			// the "solr." shorthand used in schema.xml.
			args2.put("tokenizerFactory", "org.apache.solr.analysis.JapaneseTokenizerFactory");
			factory2.setLuceneMatchVersion(Version.LUCENE_40);
			factory2.init(args2);
			factory2.inform(resourceLoader);
			stream = factory2.create(stream);

			// solr.JapanesePartOfSpeechStopFilterFactory
			JapanesePartOfSpeechStopFilterFactory factory3 = new JapanesePartOfSpeechStopFilterFactory();
			Map<String, String> args3 = new HashMap<String, String>();
			args3.put("tags", "pos-deny.txt");
			args3.put("enablePositionIncrements", "true");
			factory3.setLuceneMatchVersion(Version.LUCENE_40);
			factory3.init(args3);
			factory3.inform(resourceLoader);
			stream = factory3.create(stream);

			// solr.StopFilterFactory
			StopFilterFactory factory4 = new StopFilterFactory();
			Map<String, String> args4 = new HashMap<String, String>();
			args4.put("ignoreCase", "true");
			args4.put("words", "stopwords-ja.txt");
			factory4.setLuceneMatchVersion(Version.LUCENE_40);
			factory4.init(args4);
			factory4.inform(resourceLoader);
			stream = factory4.create(stream);

			// solr.LowerCaseFilterFactory
			LowerCaseFilterFactory factory5 = new LowerCaseFilterFactory();
			factory5.setLuceneMatchVersion(Version.LUCENE_40);
			stream = factory5.create(stream);

			return stream;
		}

		/**
		 * Minimal stand-in for Solr's resource loader: resolves every resource name
		 * against the {@link #confdir} directory on the local file system.
		 */
		class MyResourceLoader implements ResourceLoader {
			// Directory holding the configuration files. Put mapping-ja.txt,
			// pos-deny.txt, stopwords-ja.txt and synonyms-ja.txt here; the book's
			// samples (downloaded from its support site) are used unchanged.
			private static final String confdir = "solrbook";

			/**
			 * Opens {@code resource} inside {@link #confdir}.
			 *
			 * @param resource file name relative to the configuration directory
			 * @return a new stream the caller must close
			 * @throws IOException if the file cannot be opened
			 */
			@Override
			public InputStream openResource(String resource) throws IOException {
				return new FileInputStream(new File(confdir, resource));
			}

			/**
			 * Reads {@code resource} as UTF-8 text and returns its data lines.
			 * Empty lines and lines starting with {@code #} are skipped.
			 *
			 * @param resource file name relative to the configuration directory
			 * @return the remaining lines in file order
			 * @throws IOException if the file cannot be opened or read
			 */
			@Override
			public List<String> getLines(String resource) throws IOException {
				List<String> lines = new ArrayList<String>();
				// Decode explicitly as UTF-8: FileReader would use the platform default
				// charset and garble the Japanese entries on non-UTF-8 systems.
				BufferedReader reader = new BufferedReader(
						new java.io.InputStreamReader(openResource(resource), "UTF-8"));
				try {
					String line;
					while ((line = reader.readLine()) != null) {
						if (line.length() == 0 || line.startsWith("#")) {
							continue;
						}
						lines.add(line);
					}
				} finally {
					// Release the file handle even when readLine() fails.
					reader.close();
				}
				return lines;
			}

			/**
			 * Instantiates {@code cname} through its public no-argument constructor.
			 *
			 * <p>This is a deliberately simple implementation: {@code subpackages} is
			 * ignored, so {@code cname} must be a fully qualified class name.
			 *
			 * @param cname        fully qualified name of the class to instantiate
			 * @param expectedType type the new instance must be assignable to
			 * @param subpackages  ignored
			 * @return a new instance of {@code cname}, never {@code null}
			 * @throws IllegalArgumentException if the class cannot be loaded, is not a
			 *         subtype of {@code expectedType}, or cannot be instantiated
			 */
			@Override
			public <T> T newInstance(String cname, Class<T> expectedType,
					String... subpackages) {
				try {
					ClassLoader classLoader = MyResourceLoader.class.getClassLoader();
					Class<? extends T> clazz = Class.forName(cname, true, classLoader).asSubclass(expectedType);
					return clazz.newInstance();
				} catch (Exception e) {
					// Fail here with the cause attached rather than returning null and
					// provoking an unexplained NullPointerException in the caller.
					throw new IllegalArgumentException(
							"Cannot instantiate " + cname + " as " + expectedType.getName(), e);
				}
			}

		}
	}
	solr	null	null	null	null	null
	ソーラ	null	null	null	null	null
	検索	null	null	ケンサク	ケンサク	名詞-サ変接続
	エンジン	null	null	エンジン	エンジン	名詞-一般

	（実際にSolrを起動して、管理画面から「Field Analysis」にかけた時と同じ出力となります。）