from collections import defaultdict
import math
import numpy as np
import gym

# Author: Andrea Sessa
# This is a modified version of https://gym.openai.com/evaluations/eval_g0U5ZP6YQyyA6mV5fDuKg
# It uses Q-Learning in place of Temporal Difference; parameters have been tuned to reach
# the maximum score in about 1000 fewer episodes.

FIFTY_DEGREES_IN_RADIANS = 0.87266
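# 50 degrees expressed in radians (50 * pi / 180 ~= 0.87266); used in encode_state
# below as the cutoff that splits the pole's angular velocity (theta_dot) into three bins.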

class QLearner:
    def __init__(self, env):
        self.env = env
        self.epsilon = 0.75       # initial exploration rate
        self.Q = []               # action-value table, allocated in learn()
        self.gamma = 0.95         # discount factor
        self.max_episodes = 4000
        self.max_step = 3000
        self.alpha = 0.5          # learning rate
        self.eps_decay = 0.99     # multiplicative decay applied to epsilon
    # Stolen from https://gym.openai.com/evaluations/eval_g0U5ZP6YQyyA6mV5fDuKg
    def encode_state(self, state):
        """
        Converts a raw continuous state into one of the discrete states
        (see https://webdocs.cs.ualberta.ca/~sutton/book/code/pole.c).

        Args:
            state (list): A raw state, i.e. a list of x, x_dot, theta and theta_dot.

        Returns:
            box (int): A discrete state.
        """
        x, x_dot, theta, theta_dot = state
        env = self.env
        x_limit, theta_limit = env.x_threshold, env.theta_threshold_radians
        half_theta_limit = theta_limit / 2
        one_twelfth_theta_limit = theta_limit / 12

        cart_in_limits = -x_limit < x < x_limit
        pole_in_limits = -theta_limit < theta < theta_limit
        if not cart_in_limits or not pole_in_limits:
            return 0

        box = (1 if x < -0.8 else
               2 if x < 0.8 else
               3)

        if x_dot < -0.5:
            pass
        elif x_dot < 0.5:
            box += 3
        else:
            box += 6

        if theta < -half_theta_limit:
            pass
        elif theta < -one_twelfth_theta_limit:
            box += 9
        elif theta < 0:
            box += 18
        elif theta < one_twelfth_theta_limit:
            box += 27
        elif theta < half_theta_limit:
            box += 36
        else:
            box += 45

        if theta_dot < -FIFTY_DEGREES_IN_RADIANS:
            pass
        elif theta_dot < FIFTY_DEGREES_IN_RADIANS:
            box += 54
        else:
            box += 108
        return box
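
    # The discretisation above yields 3 (x) * 3 (x_dot) * 6 (theta) * 3 (theta_dot)
    # = 162 "live" boxes, numbered 1..162, plus box 0 for any out-of-bounds
    # (terminal) state -- hence the 163 states allocated for Q in learn().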
    # Epsilon-greedy policy: with probability epsilon perform a random action,
    # otherwise pick the greedy action (argmax over the Q-values for this state).
    def epsilon_greedy(self, state, q_values, eps):
        a = np.argmax(q_values[state, :])
        if np.random.rand() < eps:
            a = np.random.randint(q_values.shape[1])
        return a
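
    # learn() runs tabular Q-learning: after each transition (s, a, r, s') the
    # action-value table is nudged towards the one-step bootstrap target,
    #     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)),
    # and epsilon is decayed whenever an episode's discounted return beats the
    # running average, so exploration shrinks as the policy improves.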
    def learn(self, render=True):
        ave_cumu_r = None
        n_s = 163
        n_a = self.env.action_space.n
        # Initialization of the action-value function
        self.Q = np.zeros(shape=(n_s, n_a))
        for e in range(self.max_episodes):
            cum_rw = 0
            # Should we show the pole?
            if render:
                self.env.render()
            s = self.encode_state(self.env.reset())
            for step in range(self.max_step):
                a = self.epsilon_greedy(s, self.Q, self.epsilon)
                sp, rw, done, _ = self.env.step(a)
                sp = self.encode_state(sp)
                # Q-Learning update rule
                self.Q[s, a] = self.Q[s, a] + self.alpha * (rw + self.gamma * max(self.Q[sp, :]) - self.Q[s, a])
                s = sp
                cum_rw = rw + self.gamma * cum_rw
                # Terminal state!
                if done:
                    # Exponential moving average of the discounted return
                    k = 0.01
                    if ave_cumu_r is None:
                        ave_cumu_r = cum_rw
                    else:
                        ave_cumu_r = k * cum_rw + (1 - k) * ave_cumu_r
                    # Decay epsilon only when this episode beat the running average
                    if cum_rw > ave_cumu_r:
                        self.epsilon *= self.eps_decay
                    print("Episode {} ended on step {} with average reward: {}".format(e, step, ave_cumu_r))
                    break
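
# Note: this script targets the 2016-era gym API -- env.monitor.start/close for
# recording and env.step() returning a 4-tuple. Newer gym/gymnasium releases use
# a different recording wrapper and step signature, so main() would need small
# adjustments there.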
def main():
    env = gym.make('CartPole-v0')
    env.monitor.start('/tmp/cartpole-experiment-1', force=True)
    learner = QLearner(env)
    learner.learn()
    env.monitor.close()

if __name__ == "__main__":
    main()