ybenjo · March 30, 2012 09:51
diff --git a/hmm.rb b/hmm.rb
 # -*- coding: utf-8 -*-
 # Practice of Viterbi Algorithm
 # Data from
 # http://www.chokkan.org/lectures/2011nlp/03nlp.pdf

 class HMM
  def initialize
    @all_x = [ ]
    @all_y = [ ]
    @n_x_y = Hash.new{0.0}
    @n_y_y = Hash.new{0.0}
    @n_y = Hash.new{0}
    @n_prev_y = Hash.new{0}
  end

  # format: [[word, label], ..., ]
  # example: [["I", "S"], ["use", "V"], ...]
  def parse_sequence(labeled_sequence)
    labeled_sequence.each_with_index do |e, i|
      x, y = e.map{|x|x.to_sym}
      @all_x.push x
      @all_y.push y
      @n_x_y[x => y] += 1
      if i > 0
        prev_y = labeled_sequence[i - 1].last.to_sym
        @n_y_y[prev_y => y] += 1
        @n_prev_y[prev_y] += 1
      end
      @n_y[y] += 1
    end
  end

  def train(data)
    data.each do |labeled_sequence|
      parse_sequence(labeled_sequence)
    end
    @all_x.uniq!
    @all_y.uniq!

    @p_x_y = Hash.new{0.0}
    @p_y_y = Hash.new{0.0}
    @all_y.each do |y_from|
      @all_x.each do |x|
        @p_x_y[x => y_from] = @n_x_y[x => y_from]  / @n_y[y_from]
      end
      
      @all_y.each do |y_to|
        @p_y_y[y_from => y_to] = @n_y_y[y_from => y_to]  / @n_prev_y[y_from]
      end
    end
  end

  # For tests.
  def calc_predefined_data
    @all_x = [:brown, :promises, :free]
    @all_y = [:noun, :verb, :adj]
    
    @p_x_y = {
      {:brown => :noun} => 0.3,
      {:brown => :verb} => 0.2,
      {:brown => :adj} => 0.5,
      {:promises => :noun} => 0.3,
      {:promises => :verb} => 0.4,
      {:promises => :adj} => 0.1,
      {:free => :noun} => 0.4,
      {:free => :verb} => 0.4,
      {:free => :adj} => 0.4
    }
    
    @p_y_y = {
      {:noun => :noun} => 0.4,
      {:verb => :noun} => 0.7,
      {:adj => :noun} => 0.5,
      {:noun => :verb} => 0.5,
      {:verb => :verb} => 0.1,
      {:adj => :verb} => 0.2,
      {:noun => :adj} => 0.1,
      {:verb => :adj} => 0.2,
      {:adj => :adj} => 0.3
    }
  end
  
  def viterbi(query)
    unlabeled_sequence = query.map{|e|e.to_sym}
    @score = { }
    @max_score_label = { }
    
    #　先頭のスコアを初期化
    @all_y.each do |y|
      x = unlabeled_sequence.first
      @score[y => 0] = @p_x_y[x => y]
    end
    
    1.upto(unlabeled_sequence.size - 1) do |i|
      x = unlabeled_sequence[i]
      @all_y.each do |y_to|
        tmp = { }
        @all_y.each do |y_from|
          # 本来なら logsumexp とか使う
          tmp[y_from] = @score[y_from => i - 1] * @p_y_y[y_from => y_to] * @p_x_y[x => y_to]
        end
        label, value = tmp.to_a.sort{|y, x|x[1] <=> y[1]}.first
        @score[y_to => i] = value
        @max_score_label[y_to => i] = label
      end
    end

    return_sequence = [ ]
    lim = unlabeled_sequence.size - 1
    key, final_value = @score.select{|k, v|k.values.include?(lim)}.to_a.sort{|y, x| x[1] <=> y[1]}.first
    label = key.to_a.first.first
    return_sequence.push label
    next_label = @max_score_label[label => lim]
    
    (lim - 1).downto(0) do |i|
      return_sequence.push next_label
      next_label = @max_score_label[next_label => i]
    end
    return return_sequence.reverse, final_value
  end
 end

 h = HMM.new

 train_data = [
              [["ビール", "晴れ"], ["ビール", "晴れ"], ["熱燗", "曇"], ["お湯割り", "雪"]],
              [["ビール", "晴れ"], ["ビール", "晴れ"], ["熱燗", "曇"], ["お湯割り", "曇"], ["熱燗", "曇"]],
              [["ビール", "晴れ"], ["熱燗", "曇"], ["熱燗", "雪"], ["お湯割り", "雪"], ["お湯割り", "雪"], ["お湯割り", "雪"]]
             ]

 h.train(train_data)
 p h.viterbi(["ビール", "ビール", "熱燗", "熱燗", "お湯割り", "ビール"])
	# -- coding: utf-8 --
	# Practice of Viterbi Algorithm
	# Data from
	# http://www.chokkan.org/lectures/2011nlp/03nlp.pdf

	class HMM
	def initialize
	@all_x = [ ]
	@all_y = [ ]
	@n_x_y = Hash.new{0.0}
	@n_y_y = Hash.new{0.0}
	@n_y = Hash.new{0}
	@n_prev_y = Hash.new{0}
	end

	# format: [[word, label], ..., ]
	# example: [["I", "S"], ["use", "V"], ...]
	def parse_sequence(labeled_sequence)
	labeled_sequence.each_with_index do \|e, i\|
	x, y = e.map{\|x\|x.to_sym}
	@all_x.push x
	@all_y.push y
	@n_x_y[x => y] += 1
	if i > 0
	prev_y = labeled_sequence[i - 1].last.to_sym
	@n_y_y[prev_y => y] += 1
	@n_prev_y[prev_y] += 1
	end
	@n_y[y] += 1
	end
	end

	def train(data)
	data.each do \|labeled_sequence\|
	parse_sequence(labeled_sequence)
	end
	@all_x.uniq!
	@all_y.uniq!

	@p_x_y = Hash.new{0.0}
	@p_y_y = Hash.new{0.0}
	@all_y.each do \|y_from\|
	@all_x.each do \|x\|
	@p_x_y[x => y_from] = @n_x_y[x => y_from] / @n_y[y_from]
	end

	@all_y.each do \|y_to\|
	@p_y_y[y_from => y_to] = @n_y_y[y_from => y_to] / @n_prev_y[y_from]
	end
	end
	end

	# For tests.
	def calc_predefined_data
	@all_x = [:brown, :promises, :free]
	@all_y = [:noun, :verb, :adj]

	@p_x_y = {
	{:brown => :noun} => 0.3,
	{:brown => :verb} => 0.2,
	{:brown => :adj} => 0.5,
	{:promises => :noun} => 0.3,
	{:promises => :verb} => 0.4,
	{:promises => :adj} => 0.1,
	{:free => :noun} => 0.4,
	{:free => :verb} => 0.4,
	{:free => :adj} => 0.4
	}

	@p_y_y = {
	{:noun => :noun} => 0.4,
	{:verb => :noun} => 0.7,
	{:adj => :noun} => 0.5,
	{:noun => :verb} => 0.5,
	{:verb => :verb} => 0.1,
	{:adj => :verb} => 0.2,
	{:noun => :adj} => 0.1,
	{:verb => :adj} => 0.2,
	{:adj => :adj} => 0.3
	}
	end

	def viterbi(query)
	unlabeled_sequence = query.map{\|e\|e.to_sym}
	@score = { }
	@max_score_label = { }

	#　先頭のスコアを初期化
	@all_y.each do \|y\|
	x = unlabeled_sequence.first
	@score[y => 0] = @p_x_y[x => y]
	end

	1.upto(unlabeled_sequence.size - 1) do \|i\|
	x = unlabeled_sequence[i]
	@all_y.each do \|y_to\|
	tmp = { }
	@all_y.each do \|y_from\|
	# 本来なら logsumexp とか使う
	tmp[y_from] = @score[y_from => i - 1] * @p_y_y[y_from => y_to] * @p_x_y[x => y_to]
	end
	label, value = tmp.to_a.sort{\|y, x\|x[1] <=> y[1]}.first
	@score[y_to => i] = value
	@max_score_label[y_to => i] = label
	end
	end

	return_sequence = [ ]
	lim = unlabeled_sequence.size - 1
	key, final_value = @score.select{\|k, v\|k.values.include?(lim)}.to_a.sort{\|y, x\| x[1] <=> y[1]}.first
	label = key.to_a.first.first
	return_sequence.push label
	next_label = @max_score_label[label => lim]

	(lim - 1).downto(0) do \|i\|
	return_sequence.push next_label
	next_label = @max_score_label[next_label => i]
	end
	return return_sequence.reverse, final_value
	end
	end

	h = HMM.new

	train_data = [
	[["ビール", "晴れ"], ["ビール", "晴れ"], ["熱燗", "曇"], ["お湯割り", "雪"]],
	[["ビール", "晴れ"], ["ビール", "晴れ"], ["熱燗", "曇"], ["お湯割り", "曇"], ["熱燗", "曇"]],
	[["ビール", "晴れ"], ["熱燗", "曇"], ["熱燗", "雪"], ["お湯割り", "雪"], ["お湯割り", "雪"], ["お湯割り", "雪"]]
	]

	h.train(train_data)
	p h.viterbi(["ビール", "ビール", "熱燗", "熱燗", "お湯割り", "ビール"])