Cart Pole Balancing
"""
Combination of Sutton's approach dividing space into boxes with simple
TD-learning algorithm (see basic_rl.py somewhere on gym portal).
Some simulation paramteres are hardcoded and learner is not universal.
"""
from collections import defaultdict
import math

import numpy as np
import gym

FIFTY_DEGREES_IN_RADIANS = 0.87266


class CartPoleLearner:
    def __init__(self, env):
        self.env = env

    def encode_state(self, state):
        """
        Converts a raw continuous state into one of the discrete states
        (see https://webdocs.cs.ualberta.ca/~sutton/book/code/pole.c).

        Args:
            state (list): A raw state, i.e. a list of x, x_dot, theta and theta_dot.

        Returns:
            box (int): A discrete state.
        """
        x, x_dot, theta, theta_dot = state
        env = self.env
        x_limit, theta_limit = env.x_threshold, env.theta_threshold_radians
        half_theta_limit = theta_limit / 2
        one_twelveth_theta_limit = theta_limit / 12
        cart_in_limits = -x_limit < x < x_limit
        pole_in_limits = -theta_limit < theta < theta_limit
        if not cart_in_limits or not pole_in_limits:
            return 0
        box = (1 if x < -0.8 else
               2 if x < 0.8 else
               3)
        if x_dot < -0.5:
            pass
        elif x_dot < 0.5:
            box += 3
        else:
            box += 6
        if theta < -half_theta_limit:
            pass
        elif theta < -one_twelveth_theta_limit:
            box += 9
        elif theta < 0:
            box += 18
        elif theta < one_twelveth_theta_limit:
            box += 27
        elif theta < half_theta_limit:
            box += 36
        else:
            box += 45
        if theta_dot < -FIFTY_DEGREES_IN_RADIANS:
            pass
        elif theta_dot < FIFTY_DEGREES_IN_RADIANS:
            box += 54
        else:
            box += 108
        return box
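
    # The grid above yields 3 (x) * 3 (x_dot) * 6 (theta) * 3 (theta_dot) = 162
    # boxes; together with the out-of-limits state 0, that gives the 163
    # discrete states assumed by `n_s` in learn() below.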
    def learn(self):
        """
        Solves the pole-balancing task using a basic version of the TD
        (SARSA) algorithm.
        """
        def epsilon_greedy(state, q_values, eps):
            a = np.argmax(q_values[state, :])
            if np.random.rand() < eps:
                a = np.random.randint(q_values.shape[1])
            return a

        mean, std = 0, 1
        episodes = 5000
        max_step = 1000
        eps = 0.75
        alpha = 0.5
        beta = 0.0
        gamma = 0.95
        eps_decay = 0.995
        ave_cumu_r = None
        history = []
        env = self.env
        n_s = 163
        n_a = env.action_space.n
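        # Action-value table: one row per discrete box (163, including the
        # failure state) and one column per action (2 for CartPole-v0),
        # initialized with Gaussian noise (mean 0, std 1).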
        qs = mean + std * np.random.randn(n_s, n_a)

        for episode in range(episodes):
            cumu_r = 0
            curr_s = self.encode_state(env.reset())
            curr_a = epsilon_greedy(curr_s, qs, eps)
            for step in range(max_step):
                # env.render()
                raw_s, r, done, _ = env.step(curr_a)
                # core part
                next_s = self.encode_state(raw_s)
                next_a = epsilon_greedy(next_s, qs, eps)
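                # SARSA update:
                # Q(s, a) <- Q(s, a) + alpha * (r + gamma * Q(s', a') - Q(s, a))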
                delta = r + gamma*qs[next_s, next_a] - qs[curr_s, curr_a]
                qs[curr_s, curr_a] += alpha * delta
                curr_s, curr_a = next_s, next_a
                # track to decrease epsilon
                cumu_r = r + gamma * cumu_r
                if done:
                    kappa = 0.01
                    if ave_cumu_r is None:
                        ave_cumu_r = cumu_r
                    else:
                        ave_cumu_r = kappa*cumu_r + (1 - kappa)*ave_cumu_r
                    print("Episode {} ended on step {} with average cumulative "
                          "reward: {}".format(episode, step, ave_cumu_r))
                    if cumu_r > ave_cumu_r:
                        eps *= eps_decay
                    history.append(ave_cumu_r)
                    break


def main():
    env = gym.make('CartPole-v0')
    learner = CartPoleLearner(env)
    learner.learn()


if __name__ == '__main__':
    main()