Created
August 2, 2012 17:38
-
-
Save mocobeta/3238964 to your computer and use it in GitHub Desktop.
Solr カスタムanalyzerもどきを作る
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
schema.xml は schema version 1.5 に合わせて修正された版を、以下から頂きました。
http://johtani.jugem.jp/?eid=44
--text_ja の analyzer の箇所だけを抜粋--
<analyzer type="index"> | |
<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ja.txt"/> | |
<tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/> | |
<filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/> | |
<filter class="solr.SynonymFilterFactory" synonyms="synonyms-ja.txt" ignoreCase="true" expand="true" | |
tokenizerFactory="solr.JapaneseTokenizerFactory"/> | |
<filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="pos-deny.txt" enablePositionIncrements="true"/> | |
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords-ja.txt"/> | |
<filter class="solr.LowerCaseFilterFactory"/> | |
</analyzer> | |
------------------------------------- | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package test.solrbook; | |
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.util.Version;
import org.apache.solr.analysis.JapaneseKatakanaStemFilterFactory;
import org.apache.solr.analysis.JapanesePartOfSpeechStopFilterFactory;
import org.apache.solr.analysis.JapaneseTokenizerFactory;
import org.apache.solr.analysis.LowerCaseFilterFactory;
import org.apache.solr.analysis.MappingCharFilterFactory;
import org.apache.solr.analysis.StopFilterFactory;
import org.apache.solr.analysis.SynonymFilterFactory;
/** | |
* 「Apache Solr入門」2章 | |
* text_ja型のanalyzerと同等の動きをする(はずの)クラス | |
* Lucene/Solr 4.0 alphaで動作します。 | |
*/ | |
public class SolrbookAnalyzer { | |
private ResourceLoader resourceLoader = new MyResourceLoader(); | |
public static void main(String[] args) { | |
String content = "ソーラーは検索エンジンです。"; | |
SolrbookAnalyzer analyzer = new SolrbookAnalyzer(); | |
analyzer.analyze(new StringReader(content)); | |
} | |
private void analyze(Reader reader) { | |
CharStream charStream = charFilter(reader); | |
TokenStream tokenizer = tokenizer(charStream); | |
TokenStream stream = filter(tokenizer); | |
try { | |
while(stream.incrementToken()) { | |
CharTermAttribute charAtt = stream.getAttribute(CharTermAttribute.class); | |
BaseFormAttribute bfAtt = stream.getAttribute(BaseFormAttribute.class); | |
InflectionAttribute infAtt = stream.getAttribute(InflectionAttribute.class); | |
ReadingAttribute readAtt = stream.getAttribute(ReadingAttribute.class); | |
PartOfSpeechAttribute posAtt = stream.getAttribute(PartOfSpeechAttribute.class); | |
System.out.println( | |
charAtt + "\t" + // トークン | |
bfAtt.getBaseForm() + "\t" + // 基本形 | |
infAtt.getInflectionForm() + "\t" + // 活用形 | |
readAtt.getReading() + "\t" + // 読み | |
readAtt.getPronunciation() + "\t" + // 発音 | |
posAtt.getPartOfSpeech() // 品詞 | |
); | |
} | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
} | |
/** charFilter もどき */ | |
private CharStream charFilter(Reader reader) { | |
// solr.MappingCharFilterFactory | |
MappingCharFilterFactory factory = new MappingCharFilterFactory(); | |
Map<String, String> args = new HashMap<String, String>(); | |
args.put("mapping", "mapping-ja.txt"); | |
factory.init(args); | |
factory.inform(resourceLoader); | |
CharStream stream = factory.create(CharReader.get(reader)); | |
return stream; | |
} | |
/** tokenizer もどき */ | |
private TokenStream tokenizer(CharStream charStream) { | |
// solr.JapaneseTokenizerFactory | |
JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(); | |
Map<String, String> args = new HashMap<String, String>(); | |
args.put("mode", "search"); | |
factory.init(args); | |
factory.inform(resourceLoader); | |
TokenStream stream = factory.create(charStream); | |
return stream; | |
} | |
/** filter もどき */ | |
private TokenStream filter(TokenStream tokenizer) { | |
TokenStream stream = null; | |
// solr.JapaneseKatakanaStemFilterFactory | |
JapaneseKatakanaStemFilterFactory factory1 = new JapaneseKatakanaStemFilterFactory(); | |
Map<String, String> args1 = new HashMap<String, String>(); | |
args1.put("minimumLength", "4"); | |
factory1.init(args1); | |
stream = factory1.create(tokenizer); | |
// solr.SynonymFilterFactory | |
SynonymFilterFactory factory2 = new SynonymFilterFactory(); | |
Map<String, String> args2 = new HashMap<String, String>(); | |
args2.put("synonyms", "synonyms-ja.txt"); | |
args2.put("ignoreCase", "true"); | |
args2.put("expand", "true"); | |
args2.put("tokenizerFactory", "org.apache.solr.analysis.JapaneseTokenizerFactory"); | |
factory2.setLuceneMatchVersion(Version.LUCENE_40); | |
factory2.init(args2); | |
factory2.inform(resourceLoader); | |
stream = factory2.create(stream); | |
// solr.JapanesePartOfSpeechStopFilterFactory | |
JapanesePartOfSpeechStopFilterFactory factory3 = new JapanesePartOfSpeechStopFilterFactory(); | |
Map<String, String> args3 = new HashMap<String, String>(); | |
args3.put("tags", "pos-deny.txt"); | |
args3.put("enablePositionIncrements", "true"); | |
factory3.setLuceneMatchVersion(Version.LUCENE_40); | |
factory3.init(args3); | |
factory3.inform(resourceLoader); | |
stream = factory3.create(stream); | |
// solr.StopFilterFactory | |
StopFilterFactory factory4 = new StopFilterFactory(); | |
Map<String, String> args4 = new HashMap<String, String>(); | |
args4.put("ignoreCase", "true"); | |
args4.put("words", "stopwords-ja.txt"); | |
factory4.setLuceneMatchVersion(Version.LUCENE_40); | |
factory4.init(args4); | |
factory4.inform(resourceLoader); | |
stream = factory4.create(stream); | |
// solr.LowerCaseFilterFactory | |
LowerCaseFilterFactory factory5 = new LowerCaseFilterFactory(); | |
factory5.setLuceneMatchVersion(Version.LUCENE_40); | |
stream = factory5.create(stream); | |
return stream; | |
} | |
/** リソースローダーもどき */ | |
class MyResourceLoader implements ResourceLoader { | |
// 設定ファイルを置くディレクトリ | |
// ここに、mapping-ja.txt, pos-deny.txt, stopwords-ja.txt, synonyms-ja.txt を置いておく | |
// 本のサンプル(サポートサイトからダウンロード)をそのまま使用 | |
private static final String confdir = "solrbook"; | |
@Override | |
public InputStream openResource(String resource) throws IOException { | |
return new FileInputStream(new File(confdir, resource)); | |
} | |
@Override | |
public List<String> getLines(String resource) throws IOException { | |
List<String> lines = new ArrayList<String>(); | |
File file = new File(confdir, resource); | |
BufferedReader reader = new BufferedReader(new FileReader(file)); | |
String line = null; | |
while ((line = reader.readLine()) != null) { | |
if (line.startsWith("#")) continue; | |
if (line.length() == 0) continue; | |
lines.add(line); | |
} | |
reader.close(); | |
return lines; | |
} | |
@Override | |
public <T> T newInstance(String cname, Class<T> expectedType, | |
String... subpackages) { | |
// !! とりあえずなにかnewするだけのダメコード | |
T obj = null; | |
try { | |
ClassLoader classLoader = MyResourceLoader.class.getClassLoader(); | |
Class<? extends T> clazz = Class.forName(cname, true, classLoader).asSubclass(expectedType); | |
obj = clazz.newInstance(); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
return obj; | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
solr null null null null null | |
ソーラ null null null null null | |
検索 null null ケンサク ケンサク 名詞-サ変接続 | |
エンジン null null エンジン エンジン 名詞-一般 | |
(実際にSolrを起動して、管理画面から「Field Analysis」にかけた時と同じ出力となります。) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment