Q-learning with state discretization on CartPole-v0 (gist by @tsu-nera, last active June 26, 2017)
import gym
import numpy as np
from qlearning_answer import QLearningAgent

env = gym.make("CartPole-v0")
n_actions = env.action_space.n


def build_state(features):
    """Concatenate the bin index of each feature into a single integer state id."""
    return int("".join(map(lambda feature: str(int(feature)), features)))


def to_bin(value, bins):
    """Return the index of the bin that value falls into."""
    return np.digitize(x=[value], bins=bins)[0]


# Bin edges per observation dimension. np.linspace(lo, hi, n) returns n edges,
# so np.digitize can produce n + 1 distinct indices for that dimension.
cart_position_bins = np.linspace(-2.4, 2.4, 2)
cart_velocity_bins = np.linspace(-2, 2, 10)
pole_angle_bins = np.linspace(-0.4, 0.4, 50)
pole_velocity_bins = np.linspace(-3.5, 3.5, 20)
def transform(observation):
    """Discretize a continuous observation into a single integer state id."""
    cart_pos, cart_vel, pole_angle, pole_vel = observation
    return build_state([
        to_bin(cart_pos, cart_position_bins),
        to_bin(cart_vel, cart_velocity_bins),
        to_bin(pole_angle, pole_angle_bins),
        to_bin(pole_vel, pole_velocity_bins),
    ])
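# Worked example (my own illustration, not part of the original gist): for the
# observation [0.0, 0.5, 0.1, -1.0], np.digitize yields bin indices 1, 6, 31
# and 7 respectively, so transform returns int("1" + "6" + "31" + "7") == 16317.
# Note that the string concatenation can collide: the index tuple (1, 6, 3, 17)
# would produce the same state id.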
def play_and_train(env, agent, t_max=10 ** 4):
    """Run one full episode, updating the agent at every step.
    Returns the total (undiscounted) reward of the episode."""
    total_reward = 0.0
    s = transform(env.reset())
    for t in range(t_max):
        a = agent.getAction(s)
        next_s, r, done, _ = env.step(a)
        next_s = transform(next_s)
        agent.update(s, a, next_s, r)
        s = next_s
        total_reward += r
        if done:
            break
    return total_reward


agent = QLearningAgent(alpha=0.1, epsilon=0.25, discount=0.99,
                       getLegalActions=lambda s: range(n_actions))

rewards = []
for i in range(50000):
    rewards.append(play_and_train(env, agent))
    agent.epsilon *= 0.9999  # slowly decay exploration as learning progresses
env.close()
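# A progress printout is handy during the 50,000-episode loop above. This is a
# sketch of my own (not in the original gist) that could go at the end of the
# loop body; CartPole-v0 counts as solved once the mean reward over 100
# consecutive episodes reaches 195:
#
#     if i % 100 == 0:
#         print("iter: {}, mean reward: {:.1f}, epsilon: {:.3f}".format(
#             i, np.mean(rewards[-100:]), agent.epsilon))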
# qlearningAgents.py
# ------------------
# based on http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html
import random
from collections import defaultdict


class QLearningAgent():
    """
    Q-Learning Agent
    Instance variables you have access to
      - self.epsilon  (exploration probability)
      - self.alpha    (learning rate)
      - self.discount (discount rate, aka gamma)
    Functions you should use
      - self.getLegalActions(state)
        which returns the legal actions for a state
      - self.getQValue(state, action)
        which returns Q(state,action)
      - self.setQValue(state, action, value)
        which sets Q(state,action) := value
    !!!Important!!!
    NOTE: please avoid using self._qValues directly to keep the code clean
    """
    def __init__(self, alpha, epsilon, discount, getLegalActions):
        """We initialize the agent and its Q-values here."""
        self.getLegalActions = getLegalActions
        # Nested defaultdict: any unseen (state, action) pair reads as 0.
        self._qValues = defaultdict(lambda: defaultdict(lambda: 0))
        self.alpha = alpha
        self.epsilon = epsilon
        self.discount = discount

    def getQValue(self, state, action):
        """Returns Q(state,action)."""
        return self._qValues[state][action]

    def setQValue(self, state, action, value):
        """Sets the Q-value for (state, action) to the given value."""
        self._qValues[state][action] = value
    #---------------------#start of your code#---------------------#
    def getValue(self, state):
        """
        Returns max_action Q(state,action),
        where the max is over legal actions.
        """
        possibleActions = self.getLegalActions(state)
        # If there are no legal actions, return 0.0
        if len(possibleActions) == 0:
            return 0.0
        best_q = None
        for a in possibleActions:
            v = self.getQValue(state, a)
            if best_q is None or v > best_q:
                best_q = v
        return best_q
    def getPolicy(self, state):
        """
        Compute the best action to take in a state.
        """
        possibleActions = self.getLegalActions(state)
        # If there are no legal actions, return None
        if len(possibleActions) == 0:
            return None
        best_q = None
        best_action = None
        for a in possibleActions:
            v = self.getQValue(state, a)
            if best_q is None or v > best_q:
                best_q = v
                best_action = a
        return best_action
    def getAction(self, state):
        """
        Compute the action to take in the current state, including exploration.
        With probability self.epsilon we take a random action;
        otherwise, the best policy action (self.getPolicy).
        """
        possibleActions = self.getLegalActions(state)
        # If there are no legal actions, return None
        if len(possibleActions) == 0:
            return None
        # Epsilon-greedy: explore with probability epsilon, exploit otherwise.
        if random.random() < self.epsilon:
            action = random.choice(possibleActions)
        else:
            action = self.getPolicy(state)
        return action
    def update(self, state, action, nextState, reward):
        """
        The Q-value update happens here.
        NOTE: You should never call this function directly;
        it will be called on your behalf.
        """
        gamma = self.discount
        learning_rate = self.alpha
        # The "target" value: the observed reward plus the discounted value of
        # the next state. (Terminal states are not special-cased here, since
        # done is not passed in.)
        reference_qvalue = reward + gamma * self.getValue(nextState)
        # Move the old estimate toward the target by a step of size alpha.
        updated_qvalue = (1 - learning_rate) * self.getQValue(state, action) \
            + learning_rate * reference_qvalue
        self.setQValue(state, action, updated_qvalue)
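    # In symbols, the update above is the standard tabular Q-learning rule:
    #     Q(s, a) <- (1 - alpha) * Q(s, a)
    #                + alpha * (r + gamma * max_{a'} Q(s', a'))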
#---------------------#end of your code#---------------------#
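As a quick sanity check of the agent class on its own, here is a tiny smoke test of my own (not part of the original gist): with exploration turned off, a single rewarded transition is enough to tip the greedy policy.

agent = QLearningAgent(alpha=0.5, epsilon=0.0, discount=0.9,
                       getLegalActions=lambda s: [0, 1])
agent.update("s0", 1, "s1", 1.0)   # action 1 earns reward 1
agent.update("s0", 0, "s1", 0.0)   # action 0 earns reward 0
assert agent.getAction("s0") == 1  # the greedy agent now prefers action 1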