Skip to content

Instantly share code, notes, and snippets.

@hkurokawa
Last active December 11, 2017 03:25
Show Gist options
  • Save hkurokawa/925d8ce831f35a0883e7fcb286e0ead9 to your computer and use it in GitHub Desktop.
Save hkurokawa/925d8ce831f35a0883e7fcb286e0ead9 to your computer and use it in GitHub Desktop.
import org.apache.lucene.analysis.ja.JapaneseAnalyzer
import org.apache.lucene.analysis.ja.JapaneseTokenizer
import org.apache.lucene.analysis.ja.dict.UserDictionary
import org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute
import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.junit.Test
import java.io.StringReader
/**
 * Exploratory test that runs sample Japanese text through [tokenize] and prints the tokens.
 *
 * Nothing is asserted; the trailing comments on the inputs record the segmentation that was
 * observed for the last word.
 */
class KuromojiAnalysisTest {
    @Test
    fun testKuromojiAnalysis() {
        // Removing "ヴァンドーム" or "ヴァンドーム青山" from the dictionary gives the same result.
        // val dict = """
        // |ももクロフリル,ももクロフリル,モモクロフリル,カスタム名詞
        // |ももクロ,ももクロ,モモクロ,カスタム名詞
        // |フリル,フリル,フリル,カスタム名詞
        // """.trimMargin()

        // The two inputs differ only by a trailing space.
        val samples = listOf(
            "関ジャニ 応答セヨ", // -> 応答/セ/ヨ
            "関ジャニ 応答セヨ " // -> 応答/セヨ
        )
        for (sample in samples) {
            tokenize(sample)
        }
    }
}
/**
 * Analyzes [s] with a [JapaneseAnalyzer] in [JapaneseTokenizer.Mode.SEARCH] mode and prints one
 * line per emitted token to standard output.
 *
 * Each line holds the token text, base form, reading and part of speech, separated by `\t|\t`.
 * The analyzer is built with the default stop set and default stop tags of [JapaneseAnalyzer].
 * Both the analyzer and the token stream are closed before the function returns.
 *
 * @param s the text to tokenize.
 * @param userDict optional user dictionary content, handed to [UserDictionary.open] through a
 *   [StringReader]; when `null`, no user dictionary is used.
 */
fun tokenize(s: String, userDict: String? = null) {
    val dict = userDict?.let { UserDictionary.open(StringReader(it)) }
    val mode = JapaneseTokenizer.Mode.SEARCH
    val stopSet = JapaneseAnalyzer.getDefaultStopSet()
    val stopTags = JapaneseAnalyzer.getDefaultStopTags()
    JapaneseAnalyzer(dict, mode, stopSet, stopTags).use { analyzer ->
        analyzer.tokenStream("", StringReader(s)).use { tokenStream ->
            val baseAttr = tokenStream.addAttribute(BaseFormAttribute::class.java)
            val charAttr = tokenStream.addAttribute(CharTermAttribute::class.java)
            val posAttr = tokenStream.addAttribute(PartOfSpeechAttribute::class.java)
            val readAttr = tokenStream.addAttribute(ReadingAttribute::class.java)
            // Consumer workflow: reset() -> incrementToken() until false -> end() -> close().
            tokenStream.reset()
            while (tokenStream.incrementToken()) {
                val text = charAttr.toString() // token text (surface form)
                // NOTE(review): baseForm/reading come from Java and may be null, which prints as "null" — confirm.
                val baseForm = baseAttr.baseForm // base (dictionary) form
                val reading = readAttr.reading // reading
                val partOfSpeech = posAttr.partOfSpeech // part of speech
                println("$text\t|\t$baseForm\t|\t$reading\t|\t$partOfSpeech")
            }
            // Signal end-of-stream before use {} closes the stream.
            tokenStream.end()
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment