@embiem
Created February 9, 2017 11:14
OpenAI Gym Q-Learner for the CartPole environment
'''
Very basic Q-Learner tailored to the cart-pole environment.
This implementation leaves a lot of room for improvement.
'''
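# Approach used below: observations are rounded and joined into a string key,
# and a table Q[state][action] is updated with
#     Q(s, a) <- Q(s, a) + alpha * (reward - Q(s, a))
# i.e. a running average of a hand-shaped reward; there is no discounted
# bootstrap from the next state (see the sketch after env.close() below).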
import math
import random

import gym
from gym import wrappers
import numpy as np

env = gym.make('CartPole-v0')
env = wrappers.Monitor(env, './tmp/cartpole-experiment-1')

no_episodes = 100           # number of training episodes
observation_accuracy = 1    # decimal places kept when rounding observations into a state
epsilon = 1.0               # exploration rate, decayed per episode in the loop below
alpha = 0.4                 # learning rate (also used as the epsilon decay constant)

Q = dict()                  # Q-table: Q[state_str][action] -> value
available_actions = [0, 1]  # CartPole-v0 actions: push cart left (0) or right (1)
def get_maxQ_action(state):
    maxQ = get_maxQ(state)
    maxQ_actions = []

    # collect all actions whose Q-value equals maxQ
    for action in Q[state].keys():
        if Q[state][action] == maxQ:
            maxQ_actions.append(action)

    # randomly choose one of the maxQ actions to break ties
    return random.choice(maxQ_actions)


def get_maxQ(state):
    # highest Q-value over all actions available in this state
    maxQ = float('-inf')
    for action in Q[state].keys():
        if Q[state][action] > maxQ:
            maxQ = Q[state][action]
    return maxQ
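
# Side note: the two helpers above are equivalent to a more compact form using
# the built-ins max() and random.choice(), e.g. (sketch, not used below):
#
#   def get_maxQ_action(state):
#       maxQ = max(Q[state].values())
#       return random.choice([a for a, q in Q[state].items() if q == maxQ])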

for i_episode in range(no_episodes):
    observation = env.reset()
    diff = 0.0
    # decay exploration exponentially with the episode index
    epsilon = math.exp(-1.0 * alpha * i_episode)
    is_done = False

    for t in range(200):
        env.render()

        # 1. Build state: round each observation component and join into a string key
        state = [round(x, observation_accuracy) for x in observation]
        state_str = ' '.join([str(x) for x in state])

        # 2. Create a zero-initialised Q entry for an unseen state
        if state_str not in Q:
            Q[state_str] = dict()
            for action in available_actions:
                Q[state_str][action] = 0.0

        # 3. Choose action: explore with probability epsilon, otherwise exploit
        if random.random() <= epsilon:
            action = random.choice(available_actions)
        else:
            action = get_maxQ_action(state_str)

        # 4. Do step & receive reward
        observation, reward, done, info = env.step(action)
        # the environment's reward is always 1.0, so shape a custom one instead:
        # positive when the absolute sum of the observation shrinks versus the previous step
        curr_diff = abs(np.sum(observation))
        reward = diff - curr_diff
        diff = curr_diff

        # 5. Learn: move Q(s, a) towards the shaped reward
        Q[state_str][action] += alpha * (reward - Q[state_str][action])

        # 6. Check if done with this episode
        if done:
            print("Episode {} finished after {} timesteps".format(i_episode + 1, t + 1))
            is_done = True
            break

    if not is_done:
        print("Episode {} COMPLETED".format(i_episode + 1))

env.close()
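
# One improvement the docstring alludes to is the standard Q-learning target,
# which bootstraps from the best action value of the next state instead of only
# averaging the immediate shaped reward. A minimal sketch of such an update,
# reusing Q, alpha and get_maxQ from above; the discount factor GAMMA and the
# next_state_str handling are assumptions, not part of the original gist.
GAMMA = 0.9  # assumed discount factor


def q_update(state_str, action, reward, next_state_str, done):
    '''Hypothetical replacement for step 5 ("learn") in the loop above.'''
    # next_state_str must already have a zero-initialised Q entry,
    # built the same way as state_str in step 2 of the loop.
    future = 0.0 if done else get_maxQ(next_state_str)
    target = reward + GAMMA * future
    Q[state_str][action] += alpha * (target - Q[state_str][action])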