Created March 26, 2013 08:59
| require "gnuplot" | |
| require "narray" | |
def draw_chart(x, y)
  Gnuplot.open do |gp|
    Gnuplot::Plot.new(gp) do |plot|
      y.each do |name, value|
        if x.size == value.size
          plot.data << Gnuplot::DataSet.new([x, value]) do |ds|
            ds.with = "lines"
            ds.title = name
          end
        end
      end
    end
  end
end
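# Bucket the values in prob_array[0] into a 100-bin histogram covering
# roughly [-5, 5) in steps of 0.1, then plot it.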
def make_hist(prob_array)
  hist = Array.new(100, 0)
  prob_array[0].each do |prob|
    hist[(prob * 10).to_i + 50] += 1
  end
  draw_chart(NArray[0..hist.size - 1].to_a, {"Normal Distribution!!!!" => hist})
end
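# Approximate a standard normal sample as the sum of twelve uniform [0, 1)
# variates minus 6 (Irwin-Hall / central-limit approximation).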
def nrand
  12.times.inject(0) { |a, i| a + rand } - 6.0
end
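# Pure greedy selection on a 10-armed bandit: always pick the arm with the
# highest current value estimate. Returns [reward per trial, fraction of
# optimal-arm selections per trial].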
def greedy(try, reward_array, exp)
  q_t_a = Array.new(10, 0.0)
  r_sum = Array.new(10, 0.0)
  result = 0.0
  choices = Array.new(10, 0)
  path = []
  # The truly optimal arm for this problem instance
  exp_reward_array = []
  (0..9).each do |i|
    exp_reward_array << reward_array[i][exp]
  end
  opt_choice = NVector[exp_reward_array].flatten.sort_index[-1]
  opt_select = 0
  opt_path = []
  try.times do |t|
    choice = NVector[q_t_a].flatten.sort_index[-1]
    opt_select += 1 if choice == opt_choice
    choices[choice] += 1
    reward = nrand + reward_array[choice][exp]
    r_sum[choice] += reward
    result += reward
    q_t_a[choice] = r_sum[choice] / choices[choice]
    path << reward
    # t is zero-based, so divide by t + 1 to avoid 0/0 on the first trial
    opt_path << opt_select.to_f / (t + 1)
  end
  [path, opt_path]
end
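# Epsilon-greedy selection: with probability epsilon pick a random arm,
# otherwise pick the arm with the highest estimate. Value estimates start at
# 5.0 (optimistic initial values), which encourages early exploration.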
def epsilon_greedy(try, reward_array, exp, epsilon)
  q_t_a = Array.new(10, 5.0)
  r_sum = Array.new(10, 0.0)
  result = 0.0
  choices = Array.new(10, 0)
  path = []
  # The truly optimal arm for this problem instance
  exp_reward_array = []
  (0..9).each do |i|
    exp_reward_array << reward_array[i][exp]
  end
  opt_choice = NVector[exp_reward_array].flatten.sort_index[-1]
  opt_select = 0
  opt_path = []
  try.times do |t|
    choice =
      if rand <= epsilon
        rand(10)
      else
        NVector[q_t_a].flatten.sort_index[-1]
      end
    opt_select += 1 if choice == opt_choice
    choices[choice] += 1
    reward = nrand + reward_array[choice][exp]
    r_sum[choice] += reward
    result += reward
    q_t_a[choice] = r_sum[choice] / choices[choice]
    path << reward
    opt_path << opt_select.to_f / (t + 1)
  end
  [path, opt_path]
end
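# Sample an index from a discrete probability distribution (an array of
# probabilities). Falls back to a uniformly random index if the drawn random
# number is not covered by the given probabilities.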
def get_occurrence_id(distribution)
  occur_id = rand(distribution.size)
  prob_sum = 0.0
  r = rand
  distribution.each_with_index do |d, i|
    if prob_sum <= r && r < prob_sum + d
      occur_id = i
      break
    end
    prob_sum += d
  end
  occur_id
end
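# Softmax (Boltzmann) selection: each arm a is chosen with probability
#   P(a) = exp(Q(a) / tau) / sum_b exp(Q(b) / tau)
# where tau is the temperature; small tau approaches greedy selection,
# large tau approaches uniform random selection.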
def softmax(try, reward_array, exp, tau)
  q_t_a = Array.new(10, 0.0)
  r_sum = Array.new(10, 0.0)
  result = 0.0
  choices = Array.new(10, 0)
  path = []
  # The truly optimal arm for this problem instance
  exp_reward_array = []
  (0..9).each do |i|
    exp_reward_array << reward_array[i][exp]
  end
  opt_choice = NVector[exp_reward_array].flatten.sort_index[-1]
  opt_select = 0
  opt_path = []
  choice_prob = []
  try.times do |t|
    # Selection probability of each action (Boltzmann distribution)
    denominator = q_t_a.inject(0) { |sum, qta| sum + Math.exp(qta / tau) }
    if denominator.infinite?
      # If the denominator overflows, zero out all probabilities so that
      # get_occurrence_id falls back to a uniform random choice
      q_t_a.each_with_index do |qta, i|
        choice_prob[i] = 0
      end
    else
      q_t_a.each_with_index do |qta, i|
        numerator = Math.exp(qta / tau)
        choice_prob[i] = numerator / denominator
      end
    end
    # Choose an action according to the selection probabilities
    choice = get_occurrence_id(choice_prob)
    opt_select += 1 if choice == opt_choice
    choices[choice] += 1
    reward = nrand + reward_array[choice][exp]
    r_sum[choice] += reward
    result += reward
    q_t_a[choice] = r_sum[choice] / choices[choice]
    path << reward
    opt_path << opt_select.to_f / (t + 1)
  end
  [path, opt_path]
end
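# Compare greedy (epsilon = 0) with epsilon-greedy (epsilon = 0.01, 0.1) over
# 2000 independent 10-armed bandit problems of 1000 trials each, then plot the
# averaged reward and optimal-selection curves.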
def greedy_experiment
  try = 1000
  exp = 2000
  # Generate the bandit problems: 10 arms, each with `exp` true mean rewards
  reward_array = []
  10.times do
    reward_array << exp.times.inject([]) { |a, r| a << nrand }
  end
  # Per-trial reward, accumulated over experiments (averaged below)
  greedy_path = ep_greedy_path = ep2_greedy_path = NVector.float(try)
  # Per-trial fraction of optimal-arm selections
  greedy_opt_path = ep_greedy_opt_path = ep2_greedy_opt_path = NVector.float(try)
  # Run the experiments
  exp.times do |i|
    greedy_result = epsilon_greedy(try, reward_array, i, 0.0)
    greedy_path += NVector.to_na(greedy_result[0])
    greedy_opt_path += NVector.to_na(greedy_result[1])
    ep_greedy_result = epsilon_greedy(try, reward_array, i, 0.01)
    ep_greedy_path += NVector.to_na(ep_greedy_result[0])
    ep_greedy_opt_path += NVector.to_na(ep_greedy_result[1])
    ep2_greedy_result = epsilon_greedy(try, reward_array, i, 0.1)
    ep2_greedy_path += NVector.to_na(ep2_greedy_result[0])
    ep2_greedy_opt_path += NVector.to_na(ep2_greedy_result[1])
  end
  greedy_path = (greedy_path / exp).to_a
  greedy_opt_path = (greedy_opt_path / exp).to_a
  ep_greedy_path = (ep_greedy_path / exp).to_a
  ep_greedy_opt_path = (ep_greedy_opt_path / exp).to_a
  ep2_greedy_path = (ep2_greedy_path / exp).to_a
  ep2_greedy_opt_path = (ep2_greedy_opt_path / exp).to_a
  draw_chart(NArray[0..try - 1].to_a, {"greedy" => greedy_path,
                                       "epsilon(=0.01) greedy" => ep_greedy_path,
                                       "epsilon(=0.1) greedy" => ep2_greedy_path})
  draw_chart(NArray[0..try - 1].to_a, {"greedy" => greedy_opt_path,
                                       "epsilon(=0.01) greedy" => ep_greedy_opt_path,
                                       "epsilon(=0.1) greedy" => ep2_greedy_opt_path})
end
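# Compare softmax selection at temperatures tau = 0.1, 0.5 and 1 under the
# same 2000-problem / 1000-trial setup.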
def softmax_experiment
  try = 1000
  exp = 2000
  # Generate the bandit problems: 10 arms, each with `exp` true mean rewards
  reward_array = []
  10.times do
    reward_array << exp.times.inject([]) { |a, r| a << nrand }
  end
  softmax_path_a = softmax_path_b = softmax_path_c = NVector.float(try)
  softmax_opt_path_a = softmax_opt_path_b = softmax_opt_path_c = NVector.float(try)
  # Run the experiments
  exp.times do |i|
    softmax_result_a = softmax(try, reward_array, i, 0.1)
    softmax_path_a += NVector.to_na(softmax_result_a[0])
    softmax_opt_path_a += NVector.to_na(softmax_result_a[1])
    softmax_result_b = softmax(try, reward_array, i, 0.5)
    softmax_path_b += NVector.to_na(softmax_result_b[0])
    softmax_opt_path_b += NVector.to_na(softmax_result_b[1])
    softmax_result_c = softmax(try, reward_array, i, 1)
    softmax_path_c += NVector.to_na(softmax_result_c[0])
    softmax_opt_path_c += NVector.to_na(softmax_result_c[1])
  end
  softmax_path_a = (softmax_path_a / exp).to_a
  softmax_opt_path_a = (softmax_opt_path_a / exp).to_a
  softmax_path_b = (softmax_path_b / exp).to_a
  softmax_opt_path_b = (softmax_opt_path_b / exp).to_a
  softmax_path_c = (softmax_path_c / exp).to_a
  softmax_opt_path_c = (softmax_opt_path_c / exp).to_a
  draw_chart(NArray[0..try - 1].to_a, {"softmax(tau=0.1)" => softmax_path_a,
                                       "softmax(tau=0.5)" => softmax_path_b,
                                       "softmax(tau=1)" => softmax_path_c})
  draw_chart(NArray[0..try - 1].to_a, {"softmax(tau=0.1)" => softmax_opt_path_a,
                                       "softmax(tau=0.5)" => softmax_opt_path_b,
                                       "softmax(tau=1)" => softmax_opt_path_c})
end
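# Run greedy, epsilon-greedy and softmax on the same set of problems and plot
# all six average-reward curves together.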
def greedy_softmax_experiment
  try = 1000
  exp = 2000
  # Generate the bandit problems: 10 arms, each with `exp` true mean rewards
  reward_array = []
  10.times do
    reward_array << exp.times.inject([]) { |a, r| a << nrand }
  end
  greedy_path = ep_greedy_path = ep2_greedy_path = NVector.float(try)
  greedy_opt_path = ep_greedy_opt_path = ep2_greedy_opt_path = NVector.float(try)
  softmax_path_a = softmax_path_b = softmax_path_c = NVector.float(try)
  softmax_opt_path_a = softmax_opt_path_b = softmax_opt_path_c = NVector.float(try)
  # Run the experiments
  exp.times do |i|
    greedy_result = epsilon_greedy(try, reward_array, i, 0.0)
    greedy_path += NVector.to_na(greedy_result[0])
    greedy_opt_path += NVector.to_na(greedy_result[1])
    ep_greedy_result = epsilon_greedy(try, reward_array, i, 0.01)
    ep_greedy_path += NVector.to_na(ep_greedy_result[0])
    ep_greedy_opt_path += NVector.to_na(ep_greedy_result[1])
    ep2_greedy_result = epsilon_greedy(try, reward_array, i, 0.1)
    ep2_greedy_path += NVector.to_na(ep2_greedy_result[0])
    ep2_greedy_opt_path += NVector.to_na(ep2_greedy_result[1])
    softmax_result_a = softmax(try, reward_array, i, 0.1)
    softmax_path_a += NVector.to_na(softmax_result_a[0])
    softmax_opt_path_a += NVector.to_na(softmax_result_a[1])
    softmax_result_b = softmax(try, reward_array, i, 0.5)
    softmax_path_b += NVector.to_na(softmax_result_b[0])
    softmax_opt_path_b += NVector.to_na(softmax_result_b[1])
    softmax_result_c = softmax(try, reward_array, i, 1)
    softmax_path_c += NVector.to_na(softmax_result_c[0])
    softmax_opt_path_c += NVector.to_na(softmax_result_c[1])
  end
  greedy_path = (greedy_path / exp).to_a
  greedy_opt_path = (greedy_opt_path / exp).to_a
  ep_greedy_path = (ep_greedy_path / exp).to_a
  ep_greedy_opt_path = (ep_greedy_opt_path / exp).to_a
  ep2_greedy_path = (ep2_greedy_path / exp).to_a
  ep2_greedy_opt_path = (ep2_greedy_opt_path / exp).to_a
  softmax_path_a = (softmax_path_a / exp).to_a
  softmax_opt_path_a = (softmax_opt_path_a / exp).to_a
  softmax_path_b = (softmax_path_b / exp).to_a
  softmax_opt_path_b = (softmax_opt_path_b / exp).to_a
  softmax_path_c = (softmax_path_c / exp).to_a
  softmax_opt_path_c = (softmax_opt_path_c / exp).to_a
  draw_chart(NArray[0..try - 1].to_a, {"greedy" => greedy_path,
                                       "epsilon(=0.01) greedy" => ep_greedy_path,
                                       "epsilon(=0.1) greedy" => ep2_greedy_path,
                                       "softmax(tau=0.1)" => softmax_path_a,
                                       "softmax(tau=0.5)" => softmax_path_b,
                                       "softmax(tau=1)" => softmax_path_c})
end
greedy_softmax_experiment
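# The individual comparisons defined above can be run the same way, e.g. by
# replacing the call above with:
#   greedy_experiment
#   softmax_experiment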