require "gnuplot" | |
require "narray" | |
# Plot each named series in y against x as lines in a single gnuplot chart.
def draw_chart(x, y)
  Gnuplot.open do |gp|
    Gnuplot::Plot.new(gp) do |plot|
      y.each do |name, value|
        if x.size == value.size
          plot.data << Gnuplot::DataSet.new([x, value]) do |ds|
            ds.with = "lines"
            ds.title = name
          end
        end
      end
    end
  end
end
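# Usage sketch for draw_chart with hypothetical data (not part of the
# original experiments):
# draw_chart((0..9).to_a, {"linear" => (0..9).to_a,
#                          "squared" => (0..9).map { |v| v * v }})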
# Bin the values in prob_array[0] into a 100-bucket histogram
# (bucket width 0.1, centered on bucket 50) and plot it.
def make_hist(prob_array)
  hist = Array.new(100, 0)
  prob_array[0].each do |prob|
    index = (prob * 10).to_i + 50
    hist[index] += 1 if index.between?(0, hist.size - 1)
  end
  draw_chart(NArray[0..hist.size - 1].to_a, {"Normal Distribution" => hist})
end
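# Usage sketch (illustrative): histogram 10,000 samples of nrand, defined
# just below, to eyeball the approximately normal shape.
# make_hist([10_000.times.map { nrand }])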
# Approximate a standard normal sample as the sum of 12 uniform variates
# minus 6 (the Irwin-Hall approximation).
def nrand
  12.times.inject(0) { |a, i| a + rand } - 6.0
end
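# Sanity check (illustrative, not run by the script): for a large sample the
# mean should sit near 0.0 and the variance near 1.0.
# samples = 100_000.times.map { nrand }
# mean = samples.inject(:+) / samples.size                                # ~ 0.0
# var  = samples.inject(0.0) { |s, x| s + (x - mean)**2 } / samples.size  # ~ 1.0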
# Pure greedy selection: always take the arm with the highest estimate.
# (Kept for reference; the experiments below call epsilon_greedy(..., 0.0).)
def greedy(try, reward_array, exp)
  q_t_a = Array.new(10, 0.0)   # action-value estimates
  r_sum = Array.new(10, 0.0)   # cumulative reward per arm
  result = 0.0
  choices = Array.new(10, 0)   # times each arm was chosen
  path = []
  # The truly optimal arm for this run
  exp_reward_array = []
  (0..9).each do |i|
    exp_reward_array << reward_array[i][exp]
  end
  opt_choice = NVector[exp_reward_array].flatten.sort_index[-1]
  opt_select = 0
  opt_path = []
  try.times do |t|
    choice = NVector[q_t_a].flatten.sort_index[-1]
    opt_select += 1 if choice == opt_choice
    choices[choice] += 1
    reward = nrand + reward_array[choice][exp]
    r_sum[choice] += reward
    result += reward
    q_t_a[choice] = r_sum[choice] / choices[choice]
    path << reward
    opt_path << opt_select.to_f / (t + 1)
  end
  [path, opt_path]
end
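# Note: the sample-average update above is equivalent to the incremental form
# Q(a) <- Q(a) + (r - Q(a)) / n(a), which drops r_sum entirely. A minimal
# sketch (update_q is a hypothetical helper, not used below):
# def update_q(q, n, reward)
#   q + (reward - q) / n
# end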
def epsilon_greedy(try, reward_array, exp, epsilon)
  # Optimistic initial values (5.0) encourage early exploration
  q_t_a = Array.new(10, 5.0)
  r_sum = Array.new(10, 0.0)
  result = 0.0
  choices = Array.new(10, 0)
  path = []
  # The truly optimal arm for this run
  exp_reward_array = []
  (0..9).each do |i|
    exp_reward_array << reward_array[i][exp]
  end
  opt_choice = NVector[exp_reward_array].flatten.sort_index[-1]
  opt_select = 0
  opt_path = []
  try.times do |t|
    choice = if rand <= epsilon
               rand(10)                               # explore: random arm
             else
               NVector[q_t_a].flatten.sort_index[-1]  # exploit: greedy arm
             end
    opt_select += 1 if choice == opt_choice
    choices[choice] += 1
    reward = nrand + reward_array[choice][exp]
    r_sum[choice] += reward
    result += reward
    q_t_a[choice] = r_sum[choice] / choices[choice]
    path << reward
    opt_path << opt_select.to_f / (t + 1)
  end
  [path, opt_path]
end
# Sample an index from a discrete probability distribution; falls back to a
# uniformly random index if the probabilities do not cover r (e.g. all zero).
def get_occurrence_id(distribution)
  occur_id = rand(distribution.size)
  prob_sum = 0.0
  r = rand
  distribution.each_with_index do |d, i|
    if prob_sum <= r && r < prob_sum + d
      occur_id = i
      break
    end
    prob_sum += d
  end
  occur_id
end
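# Example (illustrative): sampling from the weights [0.5, 0.3, 0.2] should
# return index 0 roughly half the time.
# counts = Hash.new(0)
# 10_000.times { counts[get_occurrence_id([0.5, 0.3, 0.2])] += 1 }
# p counts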
def softmax(try, reward_array, exp, tau)
  q_t_a = Array.new(10, 0.0)
  r_sum = Array.new(10, 0.0)
  result = 0.0
  choices = Array.new(10, 0)
  path = []
  # The truly optimal arm for this run
  exp_reward_array = []
  (0..9).each do |i|
    exp_reward_array << reward_array[i][exp]
  end
  opt_choice = NVector[exp_reward_array].flatten.sort_index[-1]
  opt_select = 0
  opt_path = []
  choice_prob = []
  try.times do |t|
    # Selection probability of each action (Boltzmann distribution):
    # P(a) = exp(Q(a) / tau) / sum_b exp(Q(b) / tau)
    denominator = q_t_a.inject(0) { |sum, qta| sum + Math.exp(qta / tau) }
    if denominator.infinite?
      # Overflow: zero out the probabilities so get_occurrence_id falls
      # back to a uniformly random arm
      q_t_a.each_with_index do |qta, i|
        choice_prob[i] = 0
      end
    else
      q_t_a.each_with_index do |qta, i|
        numerator = Math.exp(qta / tau)
        choice_prob[i] = numerator / denominator
      end
    end
    # Choose an action according to the selection probabilities
    choice = get_occurrence_id(choice_prob)
    opt_select += 1 if choice == opt_choice
    choices[choice] += 1
    reward = nrand + reward_array[choice][exp]
    r_sum[choice] += reward
    result += reward
    q_t_a[choice] = r_sum[choice] / choices[choice]
    path << reward
    opt_path << opt_select.to_f / (t + 1)
  end
  [path, opt_path]
end
def greedy_experiment
  try = 1000
  exp = 2000
  # Build the testbed: true mean rewards for 10 arms across exp runs
  reward_array = []
  10.times do
    reward_array << exp.times.map { nrand }
  end
  # Average reward at each step
  greedy_path = NVector.float(try)
  ep_greedy_path = NVector.float(try)
  ep2_greedy_path = NVector.float(try)
  # Fraction of optimal choices at each step
  greedy_opt_path = NVector.float(try)
  ep_greedy_opt_path = NVector.float(try)
  ep2_greedy_opt_path = NVector.float(try)
  # Run the experiments
  exp.times do |i|
    greedy_result = epsilon_greedy(try, reward_array, i, 0.0)
    greedy_path += NVector.to_na(greedy_result[0])
    greedy_opt_path += NVector.to_na(greedy_result[1])
    ep_greedy_result = epsilon_greedy(try, reward_array, i, 0.01)
    ep_greedy_path += NVector.to_na(ep_greedy_result[0])
    ep_greedy_opt_path += NVector.to_na(ep_greedy_result[1])
    ep2_greedy_result = epsilon_greedy(try, reward_array, i, 0.1)
    ep2_greedy_path += NVector.to_na(ep2_greedy_result[0])
    ep2_greedy_opt_path += NVector.to_na(ep2_greedy_result[1])
  end
  greedy_path = (greedy_path / exp).to_a
  greedy_opt_path = (greedy_opt_path / exp).to_a
  ep_greedy_path = (ep_greedy_path / exp).to_a
  ep_greedy_opt_path = (ep_greedy_opt_path / exp).to_a
  ep2_greedy_path = (ep2_greedy_path / exp).to_a
  ep2_greedy_opt_path = (ep2_greedy_opt_path / exp).to_a
  draw_chart(NArray[0..try - 1].to_a, {"greedy" => greedy_path,
                                       "epsilon(=0.01) greedy" => ep_greedy_path,
                                       "epsilon(=0.1) greedy" => ep2_greedy_path})
  draw_chart(NArray[0..try - 1].to_a, {"greedy" => greedy_opt_path,
                                       "epsilon(=0.01) greedy" => ep_greedy_opt_path,
                                       "epsilon(=0.1) greedy" => ep2_greedy_opt_path})
end
def softmax_experiment
  try = 1000
  exp = 2000
  # Build the testbed: true mean rewards for 10 arms across exp runs
  reward_array = []
  10.times do
    reward_array << exp.times.map { nrand }
  end
  softmax_path_a = NVector.float(try)
  softmax_path_b = NVector.float(try)
  softmax_path_c = NVector.float(try)
  softmax_opt_path_a = NVector.float(try)
  softmax_opt_path_b = NVector.float(try)
  softmax_opt_path_c = NVector.float(try)
  # Run the experiments
  exp.times do |i|
    softmax_result_a = softmax(try, reward_array, i, 0.1)
    softmax_path_a += NVector.to_na(softmax_result_a[0])
    softmax_opt_path_a += NVector.to_na(softmax_result_a[1])
    softmax_result_b = softmax(try, reward_array, i, 0.5)
    softmax_path_b += NVector.to_na(softmax_result_b[0])
    softmax_opt_path_b += NVector.to_na(softmax_result_b[1])
    softmax_result_c = softmax(try, reward_array, i, 1)
    softmax_path_c += NVector.to_na(softmax_result_c[0])
    softmax_opt_path_c += NVector.to_na(softmax_result_c[1])
  end
  softmax_path_a = (softmax_path_a / exp).to_a
  softmax_opt_path_a = (softmax_opt_path_a / exp).to_a
  softmax_path_b = (softmax_path_b / exp).to_a
  softmax_opt_path_b = (softmax_opt_path_b / exp).to_a
  softmax_path_c = (softmax_path_c / exp).to_a
  softmax_opt_path_c = (softmax_opt_path_c / exp).to_a
  draw_chart(NArray[0..try - 1].to_a, {"softmax(tau=0.1)" => softmax_path_a,
                                       "softmax(tau=0.5)" => softmax_path_b,
                                       "softmax(tau=1)" => softmax_path_c})
  draw_chart(NArray[0..try - 1].to_a, {"softmax(tau=0.1)" => softmax_opt_path_a,
                                       "softmax(tau=0.5)" => softmax_opt_path_b,
                                       "softmax(tau=1)" => softmax_opt_path_c})
end
def greedy_softmax_experiment
  try = 1000
  exp = 2000
  # Build the testbed: true mean rewards for 10 arms across exp runs
  reward_array = []
  10.times do
    reward_array << exp.times.map { nrand }
  end
  greedy_path = NVector.float(try)
  ep_greedy_path = NVector.float(try)
  ep2_greedy_path = NVector.float(try)
  greedy_opt_path = NVector.float(try)
  ep_greedy_opt_path = NVector.float(try)
  ep2_greedy_opt_path = NVector.float(try)
  softmax_path_a = NVector.float(try)
  softmax_path_b = NVector.float(try)
  softmax_path_c = NVector.float(try)
  softmax_opt_path_a = NVector.float(try)
  softmax_opt_path_b = NVector.float(try)
  softmax_opt_path_c = NVector.float(try)
  # Run the experiments
  exp.times do |i|
    greedy_result = epsilon_greedy(try, reward_array, i, 0.0)
    greedy_path += NVector.to_na(greedy_result[0])
    greedy_opt_path += NVector.to_na(greedy_result[1])
    ep_greedy_result = epsilon_greedy(try, reward_array, i, 0.01)
    ep_greedy_path += NVector.to_na(ep_greedy_result[0])
    ep_greedy_opt_path += NVector.to_na(ep_greedy_result[1])
    ep2_greedy_result = epsilon_greedy(try, reward_array, i, 0.1)
    ep2_greedy_path += NVector.to_na(ep2_greedy_result[0])
    ep2_greedy_opt_path += NVector.to_na(ep2_greedy_result[1])
    softmax_result_a = softmax(try, reward_array, i, 0.1)
    softmax_path_a += NVector.to_na(softmax_result_a[0])
    softmax_opt_path_a += NVector.to_na(softmax_result_a[1])
    softmax_result_b = softmax(try, reward_array, i, 0.5)
    softmax_path_b += NVector.to_na(softmax_result_b[0])
    softmax_opt_path_b += NVector.to_na(softmax_result_b[1])
    softmax_result_c = softmax(try, reward_array, i, 1)
    softmax_path_c += NVector.to_na(softmax_result_c[0])
    softmax_opt_path_c += NVector.to_na(softmax_result_c[1])
  end
  greedy_path = (greedy_path / exp).to_a
  greedy_opt_path = (greedy_opt_path / exp).to_a
  ep_greedy_path = (ep_greedy_path / exp).to_a
  ep_greedy_opt_path = (ep_greedy_opt_path / exp).to_a
  ep2_greedy_path = (ep2_greedy_path / exp).to_a
  ep2_greedy_opt_path = (ep2_greedy_opt_path / exp).to_a
  softmax_path_a = (softmax_path_a / exp).to_a
  softmax_opt_path_a = (softmax_opt_path_a / exp).to_a
  softmax_path_b = (softmax_path_b / exp).to_a
  softmax_opt_path_b = (softmax_opt_path_b / exp).to_a
  softmax_path_c = (softmax_path_c / exp).to_a
  softmax_opt_path_c = (softmax_opt_path_c / exp).to_a
  draw_chart(NArray[0..try - 1].to_a, {"greedy" => greedy_path,
                                       "epsilon(=0.01) greedy" => ep_greedy_path,
                                       "epsilon(=0.1) greedy" => ep2_greedy_path,
                                       "softmax(tau=0.1)" => softmax_path_a,
                                       "softmax(tau=0.5)" => softmax_path_b,
                                       "softmax(tau=1)" => softmax_path_c})
end
greedy_softmax_experiment
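# Only the combined experiment runs by default; the individual ones can be
# invoked instead:
# greedy_experiment
# softmax_experiment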