Last active
December 11, 2017 03:25
-
-
Save hkurokawa/925d8ce831f35a0883e7fcb286e0ead9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.lucene.analysis.ja.JapaneseAnalyzer | |
import org.apache.lucene.analysis.ja.JapaneseTokenizer | |
import org.apache.lucene.analysis.ja.dict.UserDictionary | |
import org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute | |
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute | |
import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute | |
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute | |
import org.junit.Test | |
import java.io.StringReader | |
/**
 * Exploratory test that feeds sample phrases to [tokenize] so the resulting
 * Kuromoji segmentation can be inspected by eye.
 *
 * The test makes no assertions; it only triggers the printing done by [tokenize].
 */
class KuromojiAnalysisTest {
    @Test
    fun testKuromojiAnalysis() {
        // Original author's note: removing either "ヴァンドーム" or "ヴァンドーム青山"
        // from the dictionary gives the same result.
        // NOTE(review): neither entry appears in the sample dictionary below, so this
        // note may refer to a different dictionary — confirm or remove.
        //
        // Sample user dictionary, currently not passed to tokenize():
        // val dict = """
        //     |ももクロフリル,ももクロフリル,モモクロフリル,カスタム名詞
        //     |ももクロ,ももクロ,モモクロ,カスタム名詞
        //     |フリル,フリル,フリル,カスタム名詞
        // """.trimMargin()
        val samples = listOf(
            "関ジャニ 応答セヨ", // author's recorded segmentation: 応答/セ/ヨ
            "関ジャニ 応答セヨ " // same phrase with a trailing space: 応答/セヨ
        )
        for (sample in samples) {
            tokenize(sample)
        }
    }
}
/**
 * Analyzes [s] with a [JapaneseAnalyzer] in [JapaneseTokenizer.Mode.SEARCH] mode,
 * using the analyzer's default stop set and stop tags, and prints one line per
 * emitted token to standard output.
 *
 * Each printed line contains the token text, base form, reading and part of
 * speech, in that order, separated by `"\t|\t"`. Attribute values that are
 * absent are printed as `null` by the string template.
 *
 * @param s the text to analyze.
 * @param userDict optional user dictionary content, read through
 *   [UserDictionary.open]; when `null`, no user dictionary is supplied.
 */
fun tokenize(s: String, userDict: String? = null) {
    val dict = userDict?.let { UserDictionary.open(StringReader(it)) }
    val mode = JapaneseTokenizer.Mode.SEARCH
    val stopSet = JapaneseAnalyzer.getDefaultStopSet()
    val stopTags = JapaneseAnalyzer.getDefaultStopTags()

    JapaneseAnalyzer(dict, mode, stopSet, stopTags).use { analyzer ->
        analyzer.tokenStream("", StringReader(s)).use { tokenStream ->
            // Attributes must be obtained before reset(); they are updated in
            // place on every incrementToken() call.
            val baseAttr = tokenStream.addAttribute(BaseFormAttribute::class.java)
            val charAttr = tokenStream.addAttribute(CharTermAttribute::class.java)
            val posAttr = tokenStream.addAttribute(PartOfSpeechAttribute::class.java)
            val readAttr = tokenStream.addAttribute(ReadingAttribute::class.java)

            tokenStream.reset()
            while (tokenStream.incrementToken()) {
                val text = charAttr.toString() // token text
                val baseForm = baseAttr.baseForm // base (dictionary) form
                val reading = readAttr.reading // reading
                val partOfSpeech = posAttr.partOfSpeech // part of speech
                println("$text\t|\t$baseForm\t|\t$reading\t|\t$partOfSpeech")
            }
            // The TokenStream consumer workflow requires end() after the last
            // token and before close(); the original skipped this step.
            tokenStream.end()
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment