Q-learning
import gym
import numpy as np
import gym_tic_tac_toe
from math import floor

env = gym.make('tic_tac_toe-v0')

n_states = 3 ** 9  # number of states
n_actions = 9      # number of actions
eM = 1000          # number of episodes per evaluation block
def q_learning(M, options):
    # initialize the Q lookup table
    Q = np.zeros((n_states, n_actions))
    results = np.zeros(M)
    for m in range(M):
        # reseed per episode so each eM-episode block faces the same opponent moves
        np.random.seed(np.mod(m, eM))
        t = 1
        state = env.reset()
        state3 = state['board']
        done = False
        pstate = 0
        paction = 0
        while True:
            # observe the current state
            state10 = encode(state3)
            # build the policy for this state
            policy = np.zeros(n_actions)
            policy = select_policy(options, Q, state10, policy)
            # select and execute an action
            action, state, reward, done = action_train(t, state3, policy)
            state3 = state['board']
            ########################
            # Q-learning update:
            # update the Q value of the state/action pair from one step back
            if t > 1:
                if reward is None:
                    reward = 0
                Q[pstate][paction] = \
                    Q[pstate][paction] + options["alpha"] * (reward - Q[pstate][paction] + options["gamma"] * max(Q[state10]))
            # end of the game
            if done or env.move_generator() == []:
                if reward == 1:
                    fin = 2    # win
                elif reward == -1:
                    fin = 1    # lose
                elif reward == 0:
                    fin = 3    # draw
                else:
                    fin = None
                results[m] = fin
                break
            # remember the current state and action
            pstate = state10
            paction = action[1]  # action is [player, position]; only the position indexes Q
            t += 1
        # report win/draw/lose counts over the last eM episodes
        if m > 0 and np.mod(m, eM) == 0:
            partial_results = results[m - eM:m]
            print('%d) Win=%d/%d, Draw=%d/%d, Lose=%d/%d' % (m,
                  len(partial_results[partial_results == 2]), eM,
                  len(partial_results[partial_results == 3]), eM,
                  len(partial_results[partial_results == 1]), eM))
###############################################################################
###############################################################################
# index permutations for the 8 symmetries (rotations and reflections) of the board
convert = [[0, 1, 2, 3, 4, 5, 6, 7, 8],
           [2, 1, 0, 5, 4, 3, 8, 7, 6],
           [6, 3, 0, 7, 4, 1, 8, 5, 2],
           [0, 3, 6, 1, 4, 7, 2, 5, 8],
           [8, 7, 6, 5, 4, 3, 2, 1, 0],
           [6, 7, 8, 3, 4, 5, 0, 1, 2],
           [2, 5, 8, 1, 4, 7, 0, 3, 6],
           [8, 5, 2, 7, 4, 1, 6, 3, 0]
           ]

# powers of 3 used to read a board as a base-3 number
power = np.array([3 ** i for i in range(8, -1, -1)], dtype=np.float64)
def encode(state3):
    return encode2(encode1(state3))

def encode1(state3):
    # map board values -1/1/0 to 1/2/0
    ret = np.empty(len(state3))
    for n, i in enumerate(state3):
        if i == -1:
            ret[n] = 1
        elif i == 1:
            ret[n] = 2
        else:
            ret[n] = 0
    return ret

def encode2(state3):
    # canonicalize over the 8 symmetries, then read the board as a base-3 number
    cands = [sum(state3[convert[i]] * power) for i in range(len(convert))]
    return int(min(cands)) + 1
def select_policy(options, Q, state10, policy):
    if options['pmode'] == 0:
        # greedy
        q = Q[state10]
        v = max(q)
        a = np.where(q == v)[0][0]
        policy[a] = 1
    elif options['pmode'] == 1:
        # epsilon-greedy
        q = Q[state10]
        v = max(q)
        a = np.where(q == v)[0][0]
        policy = np.ones(n_actions) * options['epsilon'] / n_actions
        policy[a] = 1 - options['epsilon'] + options['epsilon'] / n_actions
    elif options['pmode'] == 2:
        # softmax (Boltzmann) with temperature tau
        policy = np.exp(Q[state10] / options['tau']) / \
            sum(np.exp(Q[state10] / options['tau']))
    return policy
def select_npc_action(step, state3, policy):
    a = None
    # the first move is always position 0
    if step == 1:
        return [1, 0]
    else:
        # sample an action from the policy, rejecting occupied cells
        while 1:
            random = np.random.rand()
            cprob = 0
            for a in range(n_actions):
                cprob += policy[a]
                if random < cprob:
                    break
            if state3[a] == 0:
                break
        return [1, a]
def select_enemy_action(state3, moves):
    reach = False
    # the 8 winning lines of the board
    pos = [[0, 1, 2], [3, 4, 5], [6, 7, 8], [0, 3, 6], [1, 4, 7], [2, 5, 8], [0, 4, 8], [2, 4, 6]]
    a = None
    for i in range(len(pos)):
        state_i = state3[pos[i]]
        val = sum(state_i)
        num = len(state_i[state_i == 0])
        # two enemy marks (encoded as 1) and one empty cell: complete the line
        if val == 2 and num == 1:
            idx = int(np.where(state_i == 0)[0][0])
            a = pos[i][idx]
            if [-1, a] in moves:
                reach = True
                break
    if not reach:
        # otherwise pick a random empty cell (cell 0 is always taken by the agent's first move)
        while 1:
            a = floor(np.random.rand() * 8) + 1
            if state3[a] == 0:
                break
    return [-1, a]
def action_train(t, state3, policy):
    # select and execute the agent's action
    npc_action = select_npc_action(t, state3, policy)
    state, reward, done, _ = env.step(npc_action)
    moves = env.move_generator()
    if done or moves == []:
        return npc_action, state, reward, done
    # opponent's turn
    state32 = encode1(state['board'])
    enemy_action = select_enemy_action(state32, moves)
    state, reward, done, _ = env.step(enemy_action)
    if not done and reward == 0:
        reward = None  # None marks a non-terminal step
    return npc_action, state, reward, done
if __name__ == '__main__':
    # epsilon-greedy
    options = {'pmode': 1, 'epsilon': 0.1, 'alpha': 1, 'gamma': 0.9}
    q_learning(10000, options)  # run training (episode count assumed)
The win rate went up a little.
- Win=593/1000, Draw=114/1000, Lose=292/1000
- Win=595/1000, Draw=112/1000, Lose=292/1000
- Win=570/1000, Draw=98/1000, Lose=331/1000
- Win=593/1000, Draw=101/1000, Lose=305/1000
- Win=553/1000, Draw=108/1000, Lose=338/1000
- Win=582/1000, Draw=89/1000, Lose=328/1000
- Win=600/1000, Draw=102/1000, Lose=297/1000
- Win=572/1000, Draw=105/1000, Lose=322/1000
- Win=597/1000, Draw=101/1000, Lose=301/1000
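For comparison, select_policy also implements a softmax (Boltzmann) policy as pmode 2. A minimal sketch of running it, where the temperature tau=0.1 and the episode count are illustrative values rather than settings from the run above:

# softmax (Boltzmann) exploration via the pmode 2 branch of select_policy
# tau and the episode count here are illustrative, not taken from the run above
options = {'pmode': 2, 'tau': 0.1, 'alpha': 1, 'gamma': 0.9}
q_learning(10000, options)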
Since it isn't improving, there's a good chance something is buggy...
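One place worth checking: in q_learning, the win/lose reward that arrives when the game ends is only folded into the update of the previous pair (pstate, paction), together with the bootstrap term gamma * max(Q[state10]); the Q value of the agent's final move is never updated with the terminal reward. A minimal sketch of a terminal update, reusing the existing variable names and inserted just before the break (an assumption about the fix, not verified against the environment):

# inside the "if done or env.move_generator() == []:" branch, before "break":
# credit the terminal reward directly to the agent's last move
if reward is None:
    reward = 0
a = action[1]  # board position of the agent's final move
Q[state10][a] = Q[state10][a] + options["alpha"] * (reward - Q[state10][a])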