Created
November 6, 2011 01:08
-
-
Save nobusue/1342303 to your computer and use it in GitHub Desktop.
指定したURLの内容からジャンルをベイズ推定: groovy bayes.groovy <url>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * ベイジアンフィルタのサンプル(Groovyバージョン)
 * 元ネタ) 機械学習 はじめよう 第3回 ベイジアンフィルタを実装してみよう
 *   http://gihyo.jp/dev/serial/01/machine-learning/0003
 *
 * 分かち書きにはGomokuを利用
 *   https://github.com/sile/gomoku
 *   gomoku-0.0.4.jarをダウンロードし、~/.groovy/lib か <GROOVY_HOME>/lib にコピーしておく
 *
 * 学習ソースとしてWikipediaのテキストをJsoupで取得
 */
import net.reduls.gomoku.Tagger | |
import net.reduls.gomoku.Morpheme | |
@Grab(group='org.jsoup', module='jsoup', version='1.6.1') | |
import org.jsoup.* | |
def url //ジャンル判定対象 | |
if(args && args[0]) { | |
try{ | |
url = args[0] | |
url.toURL() | |
}catch(e){ | |
println "Invalid URL: ${url}" | |
return | |
} | |
} else { | |
println "Usage: groovy bayes.groovy <url>" | |
return | |
} | |
vocabularies = [] as Set | |
wordcount = [:].withDefault{ [:].withDefault{0} } | |
catcount = [:].withDefault{0} | |
def getwords(sentence){ | |
def result = Tagger.parse(sentence) | |
//def words = result*.find{ it.feature =~ /名詞|動詞|形容詞|形容動詞|感動詞|副詞|連体詞/ }.surface | |
def words = result*.find{ it.feature =~ /名詞/ }.surface | |
return words.collect{ it.toLowerCase() } | |
} | |
def wordcountup(word, cat){ | |
wordcount[cat][word] += 1 | |
vocabularies << word | |
} | |
def catcountup(cat){ | |
catcount[cat] += 1 | |
} | |
def train(doc, cat){ | |
word = getwords(doc) | |
word.each{ w -> | |
wordcountup(w, cat) | |
catcountup(cat) | |
} | |
} | |
def priorprob(cat){ | |
return catcount[cat] / catcount.values().sum() | |
} | |
def incategory(word, cat){ | |
if( wordcount[cat].containsKey(word) ) { | |
//println "Hit word=${word}, cat=${cat}, count=${wordcount[cat][word]}" | |
return wordcount[cat][word] | |
} | |
else return 0 | |
} | |
def wordprob(word, cat){ | |
def prob = (incategory(word, cat) + 1.0) / | |
(wordcount[cat].values().sum() + vocabularies.size()) | |
return prob | |
} | |
def score(word, cat){ | |
score = Math.log( priorprob(cat) ) | |
word.each{ w -> | |
score += Math.log(wordprob(w, cat)) | |
} | |
return score | |
} | |
def classifier(doc){ | |
def best = '' | |
def max = Integer.MIN_VALUE | |
word = getwords(doc) | |
catcount.each{ catkey,catval -> | |
def prob = score(word, catkey) | |
//println "${catkey}=>${prob}: ${doc}" | |
if(prob > max) { | |
max = prob | |
best = catkey | |
} | |
} | |
return best | |
} | |
def getOrRestore(name, tmp){ | |
def file = new File("${tmp}/${name}Exp.txt") | |
if(file.exists()){ | |
return file.text | |
} else { | |
def exp = Jsoup.connect("http://ja.wikipedia.org/wiki/${URLEncoder.encode(name,'UTF-8')}").get().text() | |
file.withWriter{ it.print exp } | |
return exp | |
} | |
} | |
def home = System.getProperty('user.home') | |
def tmp = home + '/tmp' | |
def pythonExp = getOrRestore('Python', tmp) | |
train(pythonExp, 'Python') | |
def rubyExp = getOrRestore('Ruby', tmp) | |
train(rubyExp, 'Ruby') | |
def groovyExp = getOrRestore('Groovy', tmp) | |
train(groovyExp, 'Groovy') | |
def words = Jsoup.connect(url).get().text() | |
println "${url} => 推定カテゴリ: ${classifier(words)}" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
学習ソースはWikipediaからスクレイピングして ~/tmp/ 以下にキャッシュしています。
このサンプルではGroovy/Ruby/Pythonの分類に対応しています。