nobusue · November 6, 2011 01:08 · nobusue · Nov 6, 2011
diff --git a/bayes.groovy b/bayes.groovy
 /*
 * ベイジアンフィルタのサンプル(Groovyバージョン)
 * 元ネタ) 機械学習 はじめよう 第3回 ベイジアンフィルタを実装してみよう
 * http://gihyo.jp/dev/serial/01/machine-learning/0003
 *
 * 分かち書きにはGomokuを利用
 * https://github.com/sile/gomoku
 * gomoku-0.0.4.jarをダウンロードし、~/.groovy/lib か <GROOVY_HOME>/lib にコピーしておく
 * 
 * 学習ソースとしてWikipediaのテキストをJsoupで取得
 */
 import net.reduls.gomoku.Tagger
 import net.reduls.gomoku.Morpheme

 @Grab(group='org.jsoup', module='jsoup', version='1.6.1')
 import org.jsoup.*

 // URL of the page whose category is to be estimated (first CLI argument).
 def url
 if (!args || !args[0]) {
   println "Usage: groovy bayes.groovy <url>"
   return
 }
 url = args[0]
 try {
   // Parsed only to validate the argument; the resulting URL object is discarded.
   url.toURL()
 } catch (e) {
   println "Invalid URL: ${url}"
   return
 }

 // Classifier state. Assigned without `def` so the values live in the script
 // binding, where the methods of this script can read and update them.
 // vocabularies: set of every word seen during training.
 vocabularies = [] as Set
 // wordcount: category -> (word -> count); missing entries default to 0.
 wordcount = [:].withDefault { category -> [:].withDefault { word -> 0 } }
 // catcount: category -> counter; missing entries default to 0.
 catcount = [:].withDefault { category -> 0 }

 /**
  * Tokenizes a sentence with Gomoku and returns the lower-cased surface forms
  * of the morphemes whose feature string matches "名詞" (noun).
  *
  * @param sentence text to tokenize
  * @return list of lower-cased surface strings, in the order Tagger.parse
  *         produced the matching morphemes
  */
 def getwords(sentence){
   def morphemes = Tagger.parse(sentence)
   // To accept more parts of speech, widen the pattern, e.g.
   // /名詞|動詞|形容詞|形容動詞|感動詞|副詞|連体詞/
   def nouns = morphemes.findAll{ it.feature =~ /名詞/ }
   // Locale.ROOT keeps lower-casing independent of the JVM default locale.
   return nouns.collect{ it.surface.toLowerCase(Locale.ROOT) }
 }

 /**
  * Records one occurrence of a word in a category and adds the word to the
  * vocabulary set.
  *
  * @param word the observed word
  * @param cat  the category the word was observed in
  */
 def wordcountup(word, cat){
   def perCategory = wordcount[cat]
   perCategory[word] = perCategory[word] + 1
   vocabularies << word
 }

 /**
  * Increments the counter kept for a category in catcount.
  *
  * @param cat the category whose counter is incremented by one
  */
 def catcountup(cat){
   catcount[cat] = catcount[cat] + 1
 }

 /**
  * Trains the classifier with one document of a known category.
  *
  * The document is tokenized with getwords; for every returned word the
  * per-category word count and the category counter are incremented.
  *
  * @param doc text of the training document
  * @param cat category the document belongs to
  */
 def train(doc, cat){
   // Declared with `def` so the word list stays local instead of leaking
   // into the script binding.
   def words = getwords(doc)
   words.each{ w ->
     wordcountup(w, cat)
     // NOTE(review): the category counter grows once per word, so priorprob
     // ends up proportional to word count rather than document count.
     // Confirm this is intended before moving the call out of the loop.
     catcountup(cat)
   }
 }

 /**
  * Returns the share of the given category's counter in the sum of all
  * category counters.
  *
  * @param cat category to look up
  * @return catcount[cat] divided by the sum of all values in catcount
  */
 def priorprob(cat){
   // Read the category first, exactly as the one-line form did: the lookup
   // may add a default entry before the values are summed.
   def forCategory = catcount[cat]
   def overall = catcount.values().sum()
   return forCategory / overall
 }

 /**
  * Returns how many times a word has been counted for a category.
  *
  * @param word word to look up
  * @param cat  category to look in
  * @return the recorded count, or 0 if the word has no entry for the category
  */
 def incategory(word, cat){
   def counts = wordcount[cat]
   // containsKey is checked first so that an unseen word is not stored in
   // the map as a side effect of reading it.
   return counts.containsKey(word) ? counts[word] : 0
 }

 /**
  * Returns the add-one smoothed probability of a word within a category:
  * (count of word in cat + 1) / (total word count in cat + vocabulary size).
  *
  * @param word word to evaluate
  * @param cat  category to evaluate the word in
  * @return the smoothed probability
  */
 def wordprob(word, cat){
   // sum(0) yields 0 for a category without any counted words; a plain
   // sum() returns null there and the addition below would fail.
   def totalInCategory = wordcount[cat].values().sum(0)
   return (incategory(word, cat) + 1.0) / (totalInCategory + vocabularies.size())
 }

 /**
  * Computes the log-score of a word list for a category: the log of
  * priorprob(cat) plus the log of wordprob(w, cat) for every word.
  *
  * @param word list of words to score
  * @param cat  category to score against
  * @return the accumulated log-score
  */
 def score(word, cat){
   // Local accumulator: an undeclared `score` variable would be stored in
   // the script binding under the same name as this method.
   def total = Math.log( priorprob(cat) )
   word.each{ w ->
     total += Math.log(wordprob(w, cat))
   }
   return total
 }

 /**
  * Estimates the category of a document.
  *
  * The document is tokenized with getwords and scored against every category
  * present in catcount; the category with the highest score wins. On equal
  * scores the category visited first is kept.
  *
  * @param doc text to classify
  * @return the best-scoring category, or '' if catcount has no entries
  */
 def classifier(doc){
   def best = ''
   // Scores are floating-point log values, so start from negative infinity
   // rather than an integer bound.
   def max = Double.NEGATIVE_INFINITY
   // Declared with `def` so the word list does not leak into the binding.
   def words = getwords(doc)

   catcount.each{ catkey, catval ->
     def prob = score(words, catkey)
     if(prob > max) {
       max = prob
       best = catkey
     }
   }
   return best
 }

 /**
  * Returns the text of the Japanese Wikipedia article for a name, using a
  * file cache.
  *
  * If "<tmp>/<name>Exp.txt" exists its content is returned. Otherwise the
  * article is fetched with Jsoup, its text is written to that file and
  * returned.
  *
  * @param name article name; also used as the cache file prefix
  * @param tmp  directory holding the cache files
  * @return the article text
  */
 def getOrRestore(name, tmp){
   def file = new File("${tmp}/${name}Exp.txt")
   if(file.exists()){
     return file.text
   }
   def exp = Jsoup.connect("http://ja.wikipedia.org/wiki/${URLEncoder.encode(name,'UTF-8')}").get().text()
   // Create the cache directory on first use; writing into a missing
   // directory fails with FileNotFoundException.
   file.parentFile?.mkdirs()
   file.withWriter{ it.print exp }
   return exp
 }

 // Cache directory for the downloaded training texts: <user.home>/tmp.
 def home = System.getProperty('user.home')
 def tmp = home + '/tmp'

 // Train one category per Wikipedia article, in this order.
 ['Python', 'Ruby', 'Groovy'].each { category ->
   train(getOrRestore(category, tmp), category)
 }

 // Fetch the target page and print the estimated category.
 def words = Jsoup.connect(url).get().text()
 println "${url} => 推定カテゴリ: ${classifier(words)}"
	/*
	* ベイジアンフィルタのサンプル(Groovyバージョン)
	* 元ネタ) 機械学習 はじめよう 第3回 ベイジアンフィルタを実装してみよう
	* http://gihyo.jp/dev/serial/01/machine-learning/0003
	*
	* 分かち書きにはGomokuを利用
	* https://github.com/sile/gomoku
	* gomoku-0.0.4.jarをダウンロードし、~/.groovy/lib か <GROOVY_HOME>/lib にコピーしておく
	*
	* 学習ソースとしてWikipediaのテキストをJsoupで取得
	*/
	import net.reduls.gomoku.Tagger
	import net.reduls.gomoku.Morpheme

	@Grab(group='org.jsoup', module='jsoup', version='1.6.1')
	import org.jsoup.*

	// URL of the page whose category is to be estimated (first CLI argument).
	def url
	if (!args || !args[0]) {
		println "Usage: groovy bayes.groovy <url>"
		return
	}
	url = args[0]
	try {
		// Parsed only to validate the argument; the resulting URL object is discarded.
		url.toURL()
	} catch (e) {
		println "Invalid URL: ${url}"
		return
	}

	// Classifier state. Assigned without `def` so the values live in the script
	// binding, where the methods of this script can read and update them.
	// vocabularies: set of every word seen during training.
	vocabularies = [] as Set
	// wordcount: category -> (word -> count); missing entries default to 0.
	wordcount = [:].withDefault { category -> [:].withDefault { word -> 0 } }
	// catcount: category -> counter; missing entries default to 0.
	catcount = [:].withDefault { category -> 0 }

	/**
	 * Tokenizes a sentence with Gomoku and returns the lower-cased surface forms
	 * of the morphemes whose feature string matches "名詞" (noun).
	 *
	 * @param sentence text to tokenize
	 * @return list of lower-cased surface strings, in the order Tagger.parse
	 *         produced the matching morphemes
	 */
	def getwords(sentence){
		def morphemes = Tagger.parse(sentence)
		// To accept more parts of speech, widen the pattern, e.g.
		// /名詞|動詞|形容詞|形容動詞|感動詞|副詞|連体詞/
		def nouns = morphemes.findAll{ it.feature =~ /名詞/ }
		// Locale.ROOT keeps lower-casing independent of the JVM default locale.
		return nouns.collect{ it.surface.toLowerCase(Locale.ROOT) }
	}

	/**
	 * Records one occurrence of a word in a category and adds the word to the
	 * vocabulary set.
	 *
	 * @param word the observed word
	 * @param cat  the category the word was observed in
	 */
	def wordcountup(word, cat){
		def perCategory = wordcount[cat]
		perCategory[word] = perCategory[word] + 1
		vocabularies << word
	}

	/**
	 * Increments the counter kept for a category in catcount.
	 *
	 * @param cat the category whose counter is incremented by one
	 */
	def catcountup(cat){
		catcount[cat] = catcount[cat] + 1
	}

	/**
	 * Trains the classifier with one document of a known category.
	 *
	 * The document is tokenized with getwords; for every returned word the
	 * per-category word count and the category counter are incremented.
	 *
	 * @param doc text of the training document
	 * @param cat category the document belongs to
	 */
	def train(doc, cat){
		// Declared with `def` so the word list stays local instead of leaking
		// into the script binding.
		def words = getwords(doc)
		words.each{ w ->
			wordcountup(w, cat)
			// NOTE(review): the category counter grows once per word, so priorprob
			// ends up proportional to word count rather than document count.
			// Confirm this is intended before moving the call out of the loop.
			catcountup(cat)
		}
	}

	/**
	 * Returns the share of the given category's counter in the sum of all
	 * category counters.
	 *
	 * @param cat category to look up
	 * @return catcount[cat] divided by the sum of all values in catcount
	 */
	def priorprob(cat){
		// Read the category first, exactly as the one-line form did: the lookup
		// may add a default entry before the values are summed.
		def forCategory = catcount[cat]
		def overall = catcount.values().sum()
		return forCategory / overall
	}

	/**
	 * Returns how many times a word has been counted for a category.
	 *
	 * @param word word to look up
	 * @param cat  category to look in
	 * @return the recorded count, or 0 if the word has no entry for the category
	 */
	def incategory(word, cat){
		def counts = wordcount[cat]
		// containsKey is checked first so that an unseen word is not stored in
		// the map as a side effect of reading it.
		return counts.containsKey(word) ? counts[word] : 0
	}

	/**
	 * Returns the add-one smoothed probability of a word within a category:
	 * (count of word in cat + 1) / (total word count in cat + vocabulary size).
	 *
	 * @param word word to evaluate
	 * @param cat  category to evaluate the word in
	 * @return the smoothed probability
	 */
	def wordprob(word, cat){
		// sum(0) yields 0 for a category without any counted words; a plain
		// sum() returns null there and the addition below would fail.
		def totalInCategory = wordcount[cat].values().sum(0)
		return (incategory(word, cat) + 1.0) / (totalInCategory + vocabularies.size())
	}

	/**
	 * Computes the log-score of a word list for a category: the log of
	 * priorprob(cat) plus the log of wordprob(w, cat) for every word.
	 *
	 * @param word list of words to score
	 * @param cat  category to score against
	 * @return the accumulated log-score
	 */
	def score(word, cat){
		// Local accumulator: an undeclared `score` variable would be stored in
		// the script binding under the same name as this method.
		def total = Math.log( priorprob(cat) )
		word.each{ w ->
			total += Math.log(wordprob(w, cat))
		}
		return total
	}

	/**
	 * Estimates the category of a document.
	 *
	 * The document is tokenized with getwords and scored against every category
	 * present in catcount; the category with the highest score wins. On equal
	 * scores the category visited first is kept.
	 *
	 * @param doc text to classify
	 * @return the best-scoring category, or '' if catcount has no entries
	 */
	def classifier(doc){
		def best = ''
		// Scores are floating-point log values, so start from negative infinity
		// rather than an integer bound.
		def max = Double.NEGATIVE_INFINITY
		// Declared with `def` so the word list does not leak into the binding.
		def words = getwords(doc)

		catcount.each{ catkey, catval ->
			def prob = score(words, catkey)
			if(prob > max) {
				max = prob
				best = catkey
			}
		}
		return best
	}

	/**
	 * Returns the text of the Japanese Wikipedia article for a name, using a
	 * file cache.
	 *
	 * If "<tmp>/<name>Exp.txt" exists its content is returned. Otherwise the
	 * article is fetched with Jsoup, its text is written to that file and
	 * returned.
	 *
	 * @param name article name; also used as the cache file prefix
	 * @param tmp  directory holding the cache files
	 * @return the article text
	 */
	def getOrRestore(name, tmp){
		def file = new File("${tmp}/${name}Exp.txt")
		if(file.exists()){
			return file.text
		}
		def exp = Jsoup.connect("http://ja.wikipedia.org/wiki/${URLEncoder.encode(name,'UTF-8')}").get().text()
		// Create the cache directory on first use; writing into a missing
		// directory fails with FileNotFoundException.
		file.parentFile?.mkdirs()
		file.withWriter{ it.print exp }
		return exp
	}

	// Cache directory for the downloaded training texts: <user.home>/tmp.
	def home = System.getProperty('user.home')
	def tmp = home + '/tmp'

	// Train one category per Wikipedia article, in this order.
	['Python', 'Ruby', 'Groovy'].each { category ->
		train(getOrRestore(category, tmp), category)
	}

	// Fetch the target page and print the estimated category.
	def words = Jsoup.connect(url).get().text()
	println "${url} => 推定カテゴリ: ${classifier(words)}"