Skip to content

Instantly share code, notes, and snippets.

@ybenjo
Created March 30, 2012 09:51
Show Gist options
  • Save ybenjo/2250443 to your computer and use it in GitHub Desktop.
Save ybenjo/2250443 to your computer and use it in GitHub Desktop.
hmm
# -*- coding: utf-8 -*-
# Practice of Viterbi Algorithm
# Data from
# http://www.chokkan.org/lectures/2011nlp/03nlp.pdf
class HMM
def initialize
@all_x = [ ]
@all_y = [ ]
@n_x_y = Hash.new{0.0}
@n_y_y = Hash.new{0.0}
@n_y = Hash.new{0}
@n_prev_y = Hash.new{0}
end
# format: [[word, label], ..., ]
# example: [["I", "S"], ["use", "V"], ...]
def parse_sequence(labeled_sequence)
labeled_sequence.each_with_index do |e, i|
x, y = e.map{|x|x.to_sym}
@all_x.push x
@all_y.push y
@n_x_y[x => y] += 1
if i > 0
prev_y = labeled_sequence[i - 1].last.to_sym
@n_y_y[prev_y => y] += 1
@n_prev_y[prev_y] += 1
end
@n_y[y] += 1
end
end
def train(data)
data.each do |labeled_sequence|
parse_sequence(labeled_sequence)
end
@all_x.uniq!
@all_y.uniq!
@p_x_y = Hash.new{0.0}
@p_y_y = Hash.new{0.0}
@all_y.each do |y_from|
@all_x.each do |x|
@p_x_y[x => y_from] = @n_x_y[x => y_from] / @n_y[y_from]
end
@all_y.each do |y_to|
@p_y_y[y_from => y_to] = @n_y_y[y_from => y_to] / @n_prev_y[y_from]
end
end
end
# For tests.
def calc_predefined_data
@all_x = [:brown, :promises, :free]
@all_y = [:noun, :verb, :adj]
@p_x_y = {
{:brown => :noun} => 0.3,
{:brown => :verb} => 0.2,
{:brown => :adj} => 0.5,
{:promises => :noun} => 0.3,
{:promises => :verb} => 0.4,
{:promises => :adj} => 0.1,
{:free => :noun} => 0.4,
{:free => :verb} => 0.4,
{:free => :adj} => 0.4
}
@p_y_y = {
{:noun => :noun} => 0.4,
{:verb => :noun} => 0.7,
{:adj => :noun} => 0.5,
{:noun => :verb} => 0.5,
{:verb => :verb} => 0.1,
{:adj => :verb} => 0.2,
{:noun => :adj} => 0.1,
{:verb => :adj} => 0.2,
{:adj => :adj} => 0.3
}
end
def viterbi(query)
unlabeled_sequence = query.map{|e|e.to_sym}
@score = { }
@max_score_label = { }
# 先頭のスコアを初期化
@all_y.each do |y|
x = unlabeled_sequence.first
@score[y => 0] = @p_x_y[x => y]
end
1.upto(unlabeled_sequence.size - 1) do |i|
x = unlabeled_sequence[i]
@all_y.each do |y_to|
tmp = { }
@all_y.each do |y_from|
# 本来なら logsumexp とか使う
tmp[y_from] = @score[y_from => i - 1] * @p_y_y[y_from => y_to] * @p_x_y[x => y_to]
end
label, value = tmp.to_a.sort{|y, x|x[1] <=> y[1]}.first
@score[y_to => i] = value
@max_score_label[y_to => i] = label
end
end
return_sequence = [ ]
lim = unlabeled_sequence.size - 1
key, final_value = @score.select{|k, v|k.values.include?(lim)}.to_a.sort{|y, x| x[1] <=> y[1]}.first
label = key.to_a.first.first
return_sequence.push label
next_label = @max_score_label[label => lim]
(lim - 1).downto(0) do |i|
return_sequence.push next_label
next_label = @max_score_label[next_label => i]
end
return return_sequence.reverse, final_value
end
end
h = HMM.new
train_data = [
[["ビール", "晴れ"], ["ビール", "晴れ"], ["熱燗", "曇"], ["お湯割り", "雪"]],
[["ビール", "晴れ"], ["ビール", "晴れ"], ["熱燗", "曇"], ["お湯割り", "曇"], ["熱燗", "曇"]],
[["ビール", "晴れ"], ["熱燗", "曇"], ["熱燗", "雪"], ["お湯割り", "雪"], ["お湯割り", "雪"], ["お湯割り", "雪"]]
]
h.train(train_data)
p h.viterbi(["ビール", "ビール", "熱燗", "熱燗", "お湯割り", "ビール"])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment