Q-learning
import gym
import numpy as np
import gym_tic_tac_toe
from math import floor

env = gym.make('tic_tac_toe-v0')

n_states = 3 ** 9  # number of states
n_actions = 9      # number of actions
eM = 1000          # number of episodes per evaluation block
def q_learning(M, options):
    # initialize the Q lookup table
    Q = np.zeros((n_states, n_actions))
    results = np.zeros(M)
    for m in range(M):
        # reseed per episode so each eM-episode block faces the same opponent moves
        np.random.seed(np.mod(m, eM))
        t = 1
        state = env.reset()
        state3 = state['board']
        done = False
        pstate = 0
        paction = 0
        while True:
            # observe the current state
            state10 = encode(state3)
            # build the policy for this state
            policy = np.zeros(n_actions)
            policy = select_policy(options, Q, state10, policy)
            # select and execute an action
            action, state, reward, done = action_train(t, state3, policy)
            state3 = state['board']
            ########################
            # Q-learning update:
            # update the Q value of the state/action pair from one step back
            if t > 1:
                if reward is None:
                    reward = 0
                Q[pstate][paction] = \
                    Q[pstate][paction] + options["alpha"] * (reward - Q[pstate][paction] + options["gamma"] * max(Q[state10]))
            # end of the game
            if done or env.move_generator() == []:
                if reward == 1:
                    fin = 2    # win
                elif reward == -1:
                    fin = 1    # lose
                elif reward == 0:
                    fin = 3    # draw
                else:
                    fin = None
                results[m] = fin
                break
            # remember the current state and action
            pstate = state10
            paction = action[1]  # action is [player, position]; only the position indexes Q
            t += 1
        # report win/draw/lose counts over the last eM episodes
        if m > 0 and np.mod(m, eM) == 0:
            partial_results = results[m - eM:m]
            print('%d) Win=%d/%d, Draw=%d/%d, Lose=%d/%d' % (m,
                  len(partial_results[partial_results == 2]), eM,
                  len(partial_results[partial_results == 3]), eM,
                  len(partial_results[partial_results == 1]), eM))
###############################################################################
###############################################################################
# index permutations for the 8 symmetries (rotations and reflections) of the board
convert = [[0, 1, 2, 3, 4, 5, 6, 7, 8],
           [2, 1, 0, 5, 4, 3, 8, 7, 6],
           [6, 3, 0, 7, 4, 1, 8, 5, 2],
           [0, 3, 6, 1, 4, 7, 2, 5, 8],
           [8, 7, 6, 5, 4, 3, 2, 1, 0],
           [6, 7, 8, 3, 4, 5, 0, 1, 2],
           [2, 5, 8, 1, 4, 7, 0, 3, 6],
           [8, 5, 2, 7, 4, 1, 6, 3, 0]
           ]

# powers of 3 used to read a board as a base-3 number
power = np.array([3 ** i for i in range(8, -1, -1)], dtype=np.float64)
def encode(state3):
    return encode2(encode1(state3))

def encode1(state3):
    # map board values -1/1/0 to 1/2/0
    ret = np.empty(len(state3))
    for n, i in enumerate(state3):
        if i == -1:
            ret[n] = 1
        elif i == 1:
            ret[n] = 2
        else:
            ret[n] = 0
    return ret

def encode2(state3):
    # canonicalize over the 8 symmetries, then read the board as a base-3 number
    cands = [sum(state3[convert[i]] * power) for i in range(len(convert))]
    return int(min(cands)) + 1
def select_policy(options, Q, state10, policy):
    if options['pmode'] == 0:
        # greedy
        q = Q[state10]
        v = max(q)
        a = np.where(q == v)[0][0]
        policy[a] = 1
    elif options['pmode'] == 1:
        # epsilon-greedy
        q = Q[state10]
        v = max(q)
        a = np.where(q == v)[0][0]
        policy = np.ones(n_actions) * options['epsilon'] / n_actions
        policy[a] = 1 - options['epsilon'] + options['epsilon'] / n_actions
    elif options['pmode'] == 2:
        # softmax (Boltzmann) with temperature tau
        policy = np.exp(Q[state10] / options['tau']) / \
            sum(np.exp(Q[state10] / options['tau']))
    return policy
def select_npc_action(step, state3, policy):
    a = None
    # the first move is always position 0
    if step == 1:
        return [1, 0]
    else:
        # sample an action from the policy, rejecting occupied cells
        while 1:
            random = np.random.rand()
            cprob = 0
            for a in range(n_actions):
                cprob += policy[a]
                if random < cprob:
                    break
            if state3[a] == 0:
                break
        return [1, a]
def select_enemy_action(state3, moves):
    reach = False
    # the 8 winning lines of the board
    pos = [[0, 1, 2], [3, 4, 5], [6, 7, 8], [0, 3, 6], [1, 4, 7], [2, 5, 8], [0, 4, 8], [2, 4, 6]]
    a = None
    for i in range(len(pos)):
        state_i = state3[pos[i]]
        val = sum(state_i)
        num = len(state_i[state_i == 0])
        # two enemy marks (encoded as 1) and one empty cell: complete the line
        if val == 2 and num == 1:
            idx = int(np.where(state_i == 0)[0][0])
            a = pos[i][idx]
            if [-1, a] in moves:
                reach = True
                break
    if not reach:
        # otherwise pick a random empty cell (cell 0 is always taken by the agent's first move)
        while 1:
            a = floor(np.random.rand() * 8) + 1
            if state3[a] == 0:
                break
    return [-1, a]
def action_train(t, state3, policy):
    # select and execute the agent's action
    npc_action = select_npc_action(t, state3, policy)
    state, reward, done, _ = env.step(npc_action)
    moves = env.move_generator()
    if done or moves == []:
        return npc_action, state, reward, done
    # opponent's turn
    state32 = encode1(state['board'])
    enemy_action = select_enemy_action(state32, moves)
    state, reward, done, _ = env.step(enemy_action)
    if not done and reward == 0:
        reward = None  # None marks a non-terminal step
    return npc_action, state, reward, done
if __name__ == '__main__':
    # epsilon-greedy
    options = {'pmode': 1, 'epsilon': 0.1, 'alpha': 1, 'gamma': 0.9}
    q_learning(10000, options)  # run training (episode count assumed)
The win rate went up a little.
- Win=593/1000, Draw=114/1000, Lose=292/1000
- Win=595/1000, Draw=112/1000, Lose=292/1000
- Win=570/1000, Draw=98/1000, Lose=331/1000
- Win=593/1000, Draw=101/1000, Lose=305/1000
- Win=553/1000, Draw=108/1000, Lose=338/1000
- Win=582/1000, Draw=89/1000, Lose=328/1000
- Win=600/1000, Draw=102/1000, Lose=297/1000
- Win=572/1000, Draw=105/1000, Lose=322/1000
- Win=597/1000, Draw=101/1000, Lose=301/1000
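For comparison, select_policy also implements a softmax (Boltzmann) policy as pmode 2. A minimal sketch of running it, where the temperature tau=0.1 and the episode count are illustrative values rather than settings from the run above:

# softmax (Boltzmann) exploration via the pmode 2 branch of select_policy
# tau and the episode count here are illustrative, not taken from the run above
options = {'pmode': 2, 'tau': 0.1, 'alpha': 1, 'gamma': 0.9}
q_learning(10000, options)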
Since it isn't improving, there's a good chance something is buggy...
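One place worth checking: in q_learning, the win/lose reward that arrives when the game ends is only folded into the update of the previous pair (pstate, paction), together with the bootstrap term gamma * max(Q[state10]); the Q value of the agent's final move is never updated with the terminal reward. A minimal sketch of a terminal update, reusing the existing variable names and inserted just before the break (an assumption about the fix, not verified against the environment):

# inside the "if done or env.move_generator() == []:" branch, before "break":
# credit the terminal reward directly to the agent's last move
if reward is None:
    reward = 0
a = action[1]  # board position of the agent's final move
Q[state10][a] = Q[state10][a] + options["alpha"] * (reward - Q[state10][a])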