CartPole-v0 Q-Learning https://gym.openai.com/evaluations/eval_XduLYAihRHyfU3R2t5dR2Q
import gym
import numpy as np
from qlearning_answer import QLearningAgent  # agent class defined in qlearningAgents.py below

env = gym.make("CartPole-v0")
n_actions = env.action_space.n


def build_state(features):
    """Concatenate the discretized features into a single integer state id."""
    return int("".join(map(lambda feature: str(int(feature)), features)))


def to_bin(value, bins):
    """Return the index of the bin that `value` falls into."""
    return np.digitize(x=[value], bins=bins)[0]


# Bin edges used to discretize each of the four observation components.
cart_position_bins = np.linspace(-2.4, 2.4, 2)
cart_velocity_bins = np.linspace(-2, 2, 10)
pole_angle_bins = np.linspace(-0.4, 0.4, 50)
pole_velocity_bins = np.linspace(-3.5, 3.5, 20)


def transform(observation):
    """Map a continuous CartPole observation to a single integer state."""
    cart_pos, cart_vel, pole_angle, pole_vel = observation
    return build_state([
        to_bin(cart_pos, cart_position_bins),
        to_bin(cart_vel, cart_velocity_bins),
        to_bin(pole_angle, pole_angle_bins),
        to_bin(pole_vel, pole_velocity_bins)
    ])


def play_and_train(env, agent, t_max=10 ** 4):
    """Run one episode, updating the agent after every step; return the total reward."""
    total_reward = 0.0
    s = env.reset()
    s = transform(s)
    for t in range(t_max):
        a = agent.getAction(s)
        next_s, r, done, _ = env.step(a)
        next_s = transform(next_s)
        agent.update(s, a, next_s, r)
        s = next_s
        total_reward += r
        if done:
            break
    return total_reward


agent = QLearningAgent(alpha=0.1, epsilon=0.25, discount=0.99,
                       getLegalActions=lambda s: range(n_actions))

rewards = []
for i in range(50000):
    rewards.append(play_and_train(env, agent))
    agent.epsilon *= 0.9999  # slowly anneal exploration
env.close()
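The transform helper above turns the four continuous CartPole observation values into one integer: each component is digitized against its bin edges, and the resulting bin indices are concatenated as decimal digits. Below is a minimal standalone sketch of that encoding, using made-up bin edges and sample indices rather than the ones in the script, followed by the effective exploration rate after the 0.9999-per-episode epsilon decay.

# Illustrative only: hypothetical bin edges and indices, not taken from the script above.
import numpy as np

edges = np.linspace(-2.4, 2.4, 5)            # 5 edges -> np.digitize returns an index in 0..5
print(np.digitize(x=[0.3], bins=edges)[0])   # -> 3, since 0.3 lies between edges 0.0 and 1.2

# Concatenating per-feature indices as decimal digits yields one integer state id.
# Multi-digit indices can collide (e.g. [1, 42, 7] and [14, 2, 7] both give 1427),
# which is a quirk of this simple encoding.
print(int("".join(str(int(i)) for i in [1, 4, 27, 9])))   # -> 14279

# With epsilon starting at 0.25 and decaying by 0.9999 over 50000 episodes,
# exploration is almost switched off by the end of training:
print(0.25 * 0.9999 ** 50000)                # ~0.0017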
# qlearningAgents.py
# ------------------
# based on http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html
import random
from collections import defaultdict


class QLearningAgent():
    """
    Q-Learning Agent

    Instance variables you have access to
      - self.epsilon  (exploration probability)
      - self.alpha    (learning rate)
      - self.discount (discount rate, aka gamma)

    Functions you should use
      - self.getLegalActions(state)
        which returns the legal actions for a state
      - self.getQValue(state, action)
        which returns Q(state, action)
      - self.setQValue(state, action, value)
        which sets Q(state, action) := value

    !!!Important!!!
    NOTE: please avoid using self._qValues directly to keep the code clean
    """

    def __init__(self, alpha, epsilon, discount, getLegalActions):
        "We initialize the agent and its Q-values here."
        self.getLegalActions = getLegalActions
        self._qValues = defaultdict(lambda: defaultdict(lambda: 0))
        self.alpha = alpha
        self.epsilon = epsilon
        self.discount = discount

    def getQValue(self, state, action):
        """
        Returns Q(state, action)
        """
        return self._qValues[state][action]

    def setQValue(self, state, action, value):
        """
        Sets the Q-value for (state, action) to the given value
        """
        self._qValues[state][action] = value

    # ---------------------# start of your code #---------------------#

    def getValue(self, state):
        """
        Returns max_action Q(state, action),
        where the max is over legal actions.
        """
        possibleActions = self.getLegalActions(state)

        # If there are no legal actions, return 0.0
        if len(possibleActions) == 0:
            return 0.0

        best_q = None
        for a in possibleActions:
            v = self.getQValue(state, a)
            if best_q is None or v > best_q:
                best_q = v
        return best_q

    def getPolicy(self, state):
        """
        Compute the best action to take in a state.
        """
        possibleActions = self.getLegalActions(state)

        # If there are no legal actions, return None
        if len(possibleActions) == 0:
            return None

        best_q = None
        best_action = None
        for a in possibleActions:
            v = self.getQValue(state, a)
            if best_q is None or v > best_q:
                best_q = v
                best_action = a
        return best_action

    def getAction(self, state):
        """
        Compute the action to take in the current state, including exploration.
        With probability self.epsilon we take a random action;
        otherwise we take the best policy action (self.getPolicy).
        """
        possibleActions = self.getLegalActions(state)
        action = None

        # If there are no legal actions, return None
        if len(possibleActions) == 0:
            return None

        # epsilon-greedy exploration
        if random.random() < self.epsilon:
            action = random.choice(possibleActions)
        else:
            action = self.getPolicy(state)

        return action

    def update(self, state, action, nextState, reward):
        """
        Q-value update. NOTE: you should never call this function directly;
        it is called on your behalf by the training loop.
        """
        gamma = self.discount
        learning_rate = self.alpha

        # The "target" Q-value uses the reward and the value of the next state.
        reference_qvalue = reward + gamma * self.getValue(nextState)
        updated_qvalue = (1 - learning_rate) * self.getQValue(state, action) \
            + learning_rate * reference_qvalue
        self.setQValue(state, action, updated_qvalue)

    # ---------------------# end of your code #---------------------#
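The update method implements the standard tabular Q-learning rule Q(s, a) := (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a')). Below is a small, hypothetical sanity check of that arithmetic, assuming the QLearningAgent class above is already in scope; the toy states "s0"/"s1" and the numbers are made up for illustration.

# Hypothetical toy check; states "s0"/"s1" and the values are illustrative only.
toy = QLearningAgent(alpha=0.5, epsilon=0.0, discount=0.9,
                     getLegalActions=lambda s: [0, 1])

toy.setQValue("s1", 0, 2.0)            # make action 0 the best action in the next state
toy.update("s0", 0, "s1", reward=1.0)  # one Q-learning update for (s0, action 0)

# Expected value: (1 - 0.5) * 0 + 0.5 * (1.0 + 0.9 * 2.0) = 1.4
print(toy.getQValue("s0", 0))          # -> 1.4
print(toy.getPolicy("s0"))             # -> 0, since Q("s0", 0) > Q("s0", 1) == 0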