Skip to content

Instantly share code, notes, and snippets.

@phelrine
Created November 17, 2011 15:17
Show Gist options
  • Save phelrine/1373375 to your computer and use it in GitHub Desktop.
Save phelrine/1373375 to your computer and use it in GitHub Desktop.
単純ベイズ分類
# -*- coding: utf-8 -*-
require 'MeCab'
require 'json'
require 'readline'
# Naive Bayes classifier that guesses which user wrote a piece of text.
#
# Training records are tokenized with MeCab; per-user word counts are then
# used to score new text in log space.
class NaiveBayesClassifier
  # Leading feature fields that mark a token as a usable word
  # (noun, adjective, verb). Presumably the part-of-speech column of the
  # MeCab dictionary in use -- confirm against the installed dictionary.
  FEATURES = ["名詞", "形容詞", "動詞"].freeze

  # Builds the model from training records.
  #
  # train - enumerable of records read through the "text" and "user" keys;
  #         records whose "text" is nil/false are skipped.
  def initialize(train)
    @users = {}                 # user => { word => occurrences }
    @users_count = Hash.new(0)  # user => total number of words counted
    @total_count = 0            # words counted across all users
    @vocabulary = {}            # distinct training words (used as a set)
    @mecab = MeCab::Tagger.new
    train.each do |tweet|
      text = tweet["text"]
      next unless text
      parse(text).each { |word| count(tweet["user"], word) }
    end
  end

  # Tokenizes +s+ and returns the surfaces of the tokens whose first feature
  # field is listed in FEATURES, dropping any surface that contains "@".
  def parse(s)
    words = []
    node = @mecab.parseToNode(s)
    while node
      kind = node.feature.force_encoding("UTF-8").split(",").first
      if FEATURES.include?(kind)
        surface = node.surface.force_encoding("UTF-8")
        words << surface unless surface.include?("@")
      end
      node = node.next
    end
    words
  end

  # Records one occurrence of +word+ for +user+.
  # Returns the user's word-count hash.
  def count(user, word)
    @total_count += 1
    @users_count[user] += 1
    @vocabulary[word] = true
    counts = (@users[user] ||= Hash.new(0))
    counts[word] += 1
    counts
  end

  # Returns the user with the highest posterior score for +input+,
  # or nil when the classifier has seen no training words at all.
  def classify(input)
    return nil if @users.empty?

    # Tokenize once; the result is the same for every candidate user.
    words = parse(input)
    @users.keys.max_by { |user| posterior(user, words) }
  end

  # Log-space score of +user+ for +words+: log prior plus log likelihood.
  def posterior(user, words)
    Math.log(prior(user)) + likelihood(words, user)
  end

  # Sum of the log word probabilities of +words+ under +user+.
  def likelihood(words, user)
    words.reduce(0) { |sum, word| sum + Math.log(word_prob(word, user)) }
  end

  # Share of all counted training words that belong to +user+.
  def prior(user)
    @users_count[user].to_f / @total_count
  end

  # Add-one (Laplace) smoothed probability of +word+ for a trained +user+.
  #
  # Smoothing keeps unseen words at a non-zero probability (so Math.log
  # stays finite) while still ranking them below words seen once.
  def word_prob(word, user)
    # fetch avoids inserting the word into the default-valued count hash.
    seen = @users[user].fetch(word, 0)
    (seen + 1).to_f / (@users_count[user] + @vocabulary.size)
  end
end
# Command-line entry point: train on the JSON file named by the first
# argument, then classify each line typed at the prompt until end of input.
path = ARGV[0]
abort "usage: #{$PROGRAM_NAME} TRAINING_JSON" unless path
# File.read closes the file when done and, unlike Kernel#open, never treats
# a leading "|" in the argument as a command to run. JSON text is UTF-8.
nbc = NaiveBayesClassifier.new(JSON.parse(File.read(path, encoding: "UTF-8")))
# Readline.readline returns nil at end of input, which ends the loop;
# the second argument (true) adds each line to the readline history.
while (buf = Readline.readline("> ", true))
  puts nbc.classify(buf)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment