from collections import defaultdict
import math
import numpy as np
import gym

# Author: Andrea Sessa
# This is a modified version of https://gym.openai.com/evaluations/eval_g0U5ZP6YQyyA6mV5fDuKg
# It uses Q-Learning in place of Temporal Difference; parameters have been tuned to reach
# the maximum score in about 1000 fewer episodes.

FIFTY_DEGREES_IN_RADIANS = 0.87266
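# 50 degrees expressed in radians (50 * pi / 180 ~= 0.87266); used in encode_state
# below as the cutoff that splits the pole's angular velocity (theta_dot) into three bins.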

class QLearner:
    def __init__(self, env):
        self.env = env
        self.epsilon = 0.75       # initial exploration rate
        self.Q = []               # action-value table, allocated in learn()
        self.gamma = 0.95         # discount factor
        self.max_episodes = 4000
        self.max_step = 3000
        self.alpha = 0.5          # learning rate
        self.eps_decay = 0.99     # multiplicative decay applied to epsilon
    # Stolen from https://gym.openai.com/evaluations/eval_g0U5ZP6YQyyA6mV5fDuKg
    def encode_state(self, state):
        """
        Converts a raw continuous state into one of the discrete states
        (see https://webdocs.cs.ualberta.ca/~sutton/book/code/pole.c).

        Args:
            state (list): A raw state, i.e. a list of x, x_dot, theta and theta_dot.

        Returns:
            box (int): A discrete state.
        """
        x, x_dot, theta, theta_dot = state
        env = self.env
        x_limit, theta_limit = env.x_threshold, env.theta_threshold_radians
        half_theta_limit = theta_limit / 2
        one_twelfth_theta_limit = theta_limit / 12

        cart_in_limits = -x_limit < x < x_limit
        pole_in_limits = -theta_limit < theta < theta_limit
        if not cart_in_limits or not pole_in_limits:
            return 0

        box = (1 if x < -0.8 else
               2 if x < 0.8 else
               3)

        if x_dot < -0.5:
            pass
        elif x_dot < 0.5:
            box += 3
        else:
            box += 6

        if theta < -half_theta_limit:
            pass
        elif theta < -one_twelfth_theta_limit:
            box += 9
        elif theta < 0:
            box += 18
        elif theta < one_twelfth_theta_limit:
            box += 27
        elif theta < half_theta_limit:
            box += 36
        else:
            box += 45

        if theta_dot < -FIFTY_DEGREES_IN_RADIANS:
            pass
        elif theta_dot < FIFTY_DEGREES_IN_RADIANS:
            box += 54
        else:
            box += 108
        return box
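
    # The discretisation above yields 3 (x) * 3 (x_dot) * 6 (theta) * 3 (theta_dot)
    # = 162 "live" boxes, numbered 1..162, plus box 0 for any out-of-bounds
    # (terminal) state -- hence the 163 states allocated for Q in learn().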
    # Epsilon-greedy policy: with probability epsilon perform a random action,
    # otherwise pick the greedy action (argmax over the Q-values for this state).
    def epsilon_greedy(self, state, q_values, eps):
        a = np.argmax(q_values[state, :])
        if np.random.rand() < eps:
            a = np.random.randint(q_values.shape[1])
        return a
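
    # learn() runs tabular Q-learning: after each transition (s, a, r, s') the
    # action-value table is nudged towards the one-step bootstrap target,
    #     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)),
    # and epsilon is decayed whenever an episode's discounted return beats the
    # running average, so exploration shrinks as the policy improves.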
    def learn(self, render=True):
        ave_cumu_r = None
        n_s = 163
        n_a = self.env.action_space.n
        # Initialization of the action-value function
        self.Q = np.zeros(shape=(n_s, n_a))
        for e in range(self.max_episodes):
            cum_rw = 0
            # Should we show the pole?
            if render:
                self.env.render()
            s = self.encode_state(self.env.reset())
            for step in range(self.max_step):
                a = self.epsilon_greedy(s, self.Q, self.epsilon)
                sp, rw, done, _ = self.env.step(a)
                sp = self.encode_state(sp)
                # Q-Learning update rule
                self.Q[s, a] = self.Q[s, a] + self.alpha * (rw + self.gamma * max(self.Q[sp, :]) - self.Q[s, a])
                s = sp
                cum_rw = rw + self.gamma * cum_rw
                # Terminal state!
                if done:
                    # Exponential moving average of the discounted return
                    k = 0.01
                    if ave_cumu_r is None:
                        ave_cumu_r = cum_rw
                    else:
                        ave_cumu_r = k * cum_rw + (1 - k) * ave_cumu_r
                    # Decay epsilon only when this episode beat the running average
                    if cum_rw > ave_cumu_r:
                        self.epsilon *= self.eps_decay
                    print("Episode {} ended on step {} with average reward: {}".format(e, step, ave_cumu_r))
                    break
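
# Note: this script targets the 2016-era gym API -- env.monitor.start/close for
# recording and env.step() returning a 4-tuple. Newer gym/gymnasium releases use
# a different recording wrapper and step signature, so main() would need small
# adjustments there.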
def main():
    env = gym.make('CartPole-v0')
    env.monitor.start('/tmp/cartpole-experiment-1', force=True)
    learner = QLearner(env)
    learner.learn()
    env.monitor.close()

if __name__ == "__main__":
    main()