Skip to content

Instantly share code, notes, and snippets.

@yamakk
Created December 6, 2011 11:56
Show Gist options
  • Save yamakk/1437942 to your computer and use it in GitHub Desktop.
Save yamakk/1437942 to your computer and use it in GitHub Desktop.
mymecab.scala
import java.text.Normalizer
import scala.collection.mutable.ArrayBuffer
import org.chasen.mecab.{MeCab, Tagger, Node}
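/*
Build and run (the MeCab Java binding must be on the classpath and the
native "MeCab" library must be loadable via java.library.path):
  scalac mymecab.scala
  scala MyMeCabTest
Expected output:
  ArrayBuffer(サイト, リンクフリー, リンクフリー)
*/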
/**
 * Small wrapper around the MeCab Java binding that extracts noun surfaces
 * from a piece of text.
 *
 * Constructing an instance loads the native "MeCab" library and creates a
 * [[Tagger]] that is reused for every call to [[parse]].
 */
class MyMeCab {
  // The native library has to be loaded before the Tagger is instantiated.
  System.loadLibrary("MeCab")
  val tagger = new Tagger

  /**
   * Normalizes `s`, runs it through the tagger and collects the surface form
   * of every node whose feature string starts with the fields
   * "名詞" (noun) followed by "一般" (general) or "固有名詞" (proper noun).
   *
   * @param s text to analyse
   * @return matching surfaces, in the order the tagger produced them
   *         (duplicates are kept)
   */
  def parse(s: String): ArrayBuffer[String] = {
    val terms = new ArrayBuffer[String]
    // parseToNode yields the head of a linked list; the end is signalled by null.
    var node: Node = tagger.parseToNode(normalize(s))
    while (node != null) {
      // The feature string is comma-separated. Matching on the array shape
      // (instead of indexing ft(1) directly) safely skips nodes whose feature
      // has fewer than two fields.
      node.getFeature().split(",") match {
        case Array("名詞", "一般" | "固有名詞", _*) =>
          terms += node.getSurface()
        case _ => // not a general/proper noun: ignore
      }
      node = node.getNext()
    }
    terms
  }

  /**
   * Canonicalizes text before tagging: Unicode NFKC normalization,
   * lowercasing, and mapping of tilde/dash look-alikes to ASCII "-".
   *
   * NFKC is applied first so that compatibility forms (for example a
   * fullwidth tilde, which NFKC turns into "~") are also caught by the
   * replacements below, and so that letters produced by NFKC are lowercased.
   *
   * @param s raw input text
   * @return the normalized text
   */
  def normalize(s: String): String = {
    val composed = Normalizer.normalize(s, Normalizer.Form.NFKC)
    // Locale.ROOT keeps lowercasing independent of the JVM's default locale.
    val lowered = composed.toLowerCase(java.util.Locale.ROOT)
    lowered
      .replace("~", "-").replace("〜", "-")
      .replace("−", "-").replace("‐", "-").replace("─", "-")
  }
}
/**
 * Command-line smoke test: parses one fixed Japanese sentence with
 * `MyMeCab` and prints the extracted terms to standard output.
 */
object MyMeCabTest {
  // Explicit `: Unit =` replaces the deprecated procedure syntax `def main(...) { ... }`.
  def main(args: Array[String]): Unit = {
    val m = new MyMeCab
    println(m.parse("当サイトはリンクフリーですが、リンクフリーご一報いただけると幸いです。"))
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment