hkurokawa · December 11, 2017 03:25
diff --git a/KuromojiAnalysisTest.kt b/KuromojiAnalysisTest.kt
 import org.apache.lucene.analysis.ja.JapaneseAnalyzer
 import org.apache.lucene.analysis.ja.JapaneseTokenizer
 import org.apache.lucene.analysis.ja.dict.UserDictionary
 import org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute
 import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute
 import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
 import org.junit.Test
 import java.io.StringReader

 /**
  * Exploratory test that feeds sample inputs to [tokenize] and prints the resulting tokens.
  * It makes no assertions; the segmentations observed for each input are noted in comments.
  */
 class KuromojiAnalysisTest {
   @Test
   fun testKuromojiAnalysis() {
     // Original note: removing either "ヴァンドーム" or "ヴァンドーム青山" from the dictionary
     // gives the same result.
     // Disabled user dictionary sample, kept for reference:
 //    val dict = """
 //    |ももクロフリル,ももクロフリル,モモクロフリル,カスタム名詞
 //    |ももクロ,ももクロ,モモクロ,カスタム名詞
 //    |フリル,フリル,フリル,カスタム名詞
 //    """.trimMargin()
     val inputs = listOf(
       "関ジャニ 応答セヨ", // -> 応答/セ/ヨ
       "関ジャニ 応答セヨ " // -> 応答/セヨ (same text with a trailing space)
     )
     for (input in inputs) {
       tokenize(input)
     }
   }
 }

 /**
  * Analyzes [s] with a [JapaneseAnalyzer] in SEARCH mode, configured with the analyzer's
  * default stop set and stop tags, and prints one line per emitted token to standard output.
  *
  * Each line contains the token text, base form, reading and part of speech, separated
  * by a tab, a pipe and a tab.
  *
  * @param s the text to analyze.
  * @param userDict optional user-dictionary source handed to [UserDictionary.open]; when
  *   `null`, `null` is passed to the analyzer as its user dictionary.
  */
 fun tokenize(s: String, userDict: String? = null) {
   val dict = userDict?.let { UserDictionary.open(StringReader(it)) }
   val mode = JapaneseTokenizer.Mode.SEARCH
   val stopSet = JapaneseAnalyzer.getDefaultStopSet()
   val stopTags = JapaneseAnalyzer.getDefaultStopTags()

   JapaneseAnalyzer(dict, mode, stopSet, stopTags).use { analyzer ->
     analyzer.tokenStream("", StringReader(s)).use { tokenStream ->
       val baseAttr = tokenStream.addAttribute(BaseFormAttribute::class.java)
       val charAttr = tokenStream.addAttribute(CharTermAttribute::class.java)
       val posAttr = tokenStream.addAttribute(PartOfSpeechAttribute::class.java)
       val readAttr = tokenStream.addAttribute(ReadingAttribute::class.java)

       tokenStream.reset()
       while (tokenStream.incrementToken()) {
         val text = charAttr.toString()          // token text
         val baseForm = baseAttr.baseForm        // base (dictionary) form
         val reading = readAttr.reading          // reading
         val partOfSpeech = posAttr.partOfSpeech // part of speech

         println("$text\t|\t$baseForm\t|\t$reading\t|\t$partOfSpeech")
       }
       // The TokenStream consumer workflow requires end() after the last
       // incrementToken() and before close(), which `use` performs.
       tokenStream.end()
     }
   }
 }
	import org.apache.lucene.analysis.ja.JapaneseAnalyzer
	import org.apache.lucene.analysis.ja.JapaneseTokenizer
	import org.apache.lucene.analysis.ja.dict.UserDictionary
	import org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute
	import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute
	import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute
	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
	import org.junit.Test
	import java.io.StringReader

	/**
	 * Exploratory test that feeds sample inputs to [tokenize] and prints the resulting tokens.
	 * It makes no assertions; the segmentations observed for each input are noted in comments.
	 */
	class KuromojiAnalysisTest {
	  @Test
	  fun testKuromojiAnalysis() {
	    // Original note: removing either "ヴァンドーム" or "ヴァンドーム青山" from the dictionary
	    // gives the same result.
	    // Disabled user dictionary sample, kept for reference:
	    // val dict = """
	    // |ももクロフリル,ももクロフリル,モモクロフリル,カスタム名詞
	    // |ももクロ,ももクロ,モモクロ,カスタム名詞
	    // |フリル,フリル,フリル,カスタム名詞
	    // """.trimMargin()
	    // NOTE(review): the other copy of this test in this file has a space after "関ジャニ"
	    // in both inputs; confirm which form is intended.
	    val inputs = listOf(
	      "関ジャニ応答セヨ", // -> 応答/セ/ヨ
	      "関ジャニ応答セヨ " // -> 応答/セヨ (same text with a trailing space)
	    )
	    for (input in inputs) {
	      tokenize(input)
	    }
	  }
	}

	/**
	 * Analyzes [s] with a [JapaneseAnalyzer] in SEARCH mode, configured with the analyzer's
	 * default stop set and stop tags, and prints one line per emitted token to standard output.
	 *
	 * Each line contains the token text, base form, reading and part of speech, separated
	 * by a tab, a pipe and a tab.
	 *
	 * @param s the text to analyze.
	 * @param userDict optional user-dictionary source handed to [UserDictionary.open]; when
	 *   `null`, `null` is passed to the analyzer as its user dictionary.
	 */
	fun tokenize(s: String, userDict: String? = null) {
	  val dict = userDict?.let { UserDictionary.open(StringReader(it)) }
	  val mode = JapaneseTokenizer.Mode.SEARCH
	  val stopSet = JapaneseAnalyzer.getDefaultStopSet()
	  val stopTags = JapaneseAnalyzer.getDefaultStopTags()

	  JapaneseAnalyzer(dict, mode, stopSet, stopTags).use { analyzer ->
	    analyzer.tokenStream("", StringReader(s)).use { tokenStream ->
	      val baseAttr = tokenStream.addAttribute(BaseFormAttribute::class.java)
	      val charAttr = tokenStream.addAttribute(CharTermAttribute::class.java)
	      val posAttr = tokenStream.addAttribute(PartOfSpeechAttribute::class.java)
	      val readAttr = tokenStream.addAttribute(ReadingAttribute::class.java)

	      tokenStream.reset()
	      while (tokenStream.incrementToken()) {
	        val text = charAttr.toString()          // token text
	        val baseForm = baseAttr.baseForm        // base (dictionary) form
	        val reading = readAttr.reading          // reading
	        val partOfSpeech = posAttr.partOfSpeech // part of speech

	        // Plain "|" separators: "\|" is not a valid escape in a Kotlin string literal.
	        println("$text\t|\t$baseForm\t|\t$reading\t|\t$partOfSpeech")
	      }
	      // The TokenStream consumer workflow requires end() after the last
	      // incrementToken() and before close(), which `use` performs.
	      tokenStream.end()
	    }
	  }
	}