CartPole-v0 Q-Learning https://gym.openai.com/evaluations/eval_XduLYAihRHyfU3R2t5dR2Q
import gym
import numpy as np
from qlearning_answer import QLearningAgent  # agent class defined in qlearningAgents.py below

env = gym.make("CartPole-v0")
n_actions = env.action_space.n


def build_state(features):
    """Concatenate the discretized features into a single integer state id."""
    return int("".join(map(lambda feature: str(int(feature)), features)))


def to_bin(value, bins):
    """Return the index of the bin that `value` falls into."""
    return np.digitize(x=[value], bins=bins)[0]


# Bin edges used to discretize each of the four observation components.
cart_position_bins = np.linspace(-2.4, 2.4, 2)
cart_velocity_bins = np.linspace(-2, 2, 10)
pole_angle_bins = np.linspace(-0.4, 0.4, 50)
pole_velocity_bins = np.linspace(-3.5, 3.5, 20)


def transform(observation):
    """Map a continuous CartPole observation to a single integer state."""
    cart_pos, cart_vel, pole_angle, pole_vel = observation
    return build_state([
        to_bin(cart_pos, cart_position_bins),
        to_bin(cart_vel, cart_velocity_bins),
        to_bin(pole_angle, pole_angle_bins),
        to_bin(pole_vel, pole_velocity_bins)
    ])


def play_and_train(env, agent, t_max=10 ** 4):
    """Run one episode, updating the agent after every step; return the total reward."""
    total_reward = 0.0
    s = env.reset()
    s = transform(s)
    for t in range(t_max):
        a = agent.getAction(s)
        next_s, r, done, _ = env.step(a)
        next_s = transform(next_s)
        agent.update(s, a, next_s, r)
        s = next_s
        total_reward += r
        if done:
            break
    return total_reward


agent = QLearningAgent(alpha=0.1, epsilon=0.25, discount=0.99,
                       getLegalActions=lambda s: range(n_actions))

rewards = []
for i in range(50000):
    rewards.append(play_and_train(env, agent))
    agent.epsilon *= 0.9999  # slowly anneal exploration
env.close()
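The transform helper above turns the four continuous CartPole observation values into one integer: each component is digitized against its bin edges, and the resulting bin indices are concatenated as decimal digits. Below is a minimal standalone sketch of that encoding, using made-up bin edges and sample indices rather than the ones in the script, followed by the effective exploration rate after the 0.9999-per-episode epsilon decay.

# Illustrative only: hypothetical bin edges and indices, not taken from the script above.
import numpy as np

edges = np.linspace(-2.4, 2.4, 5)            # 5 edges -> np.digitize returns an index in 0..5
print(np.digitize(x=[0.3], bins=edges)[0])   # -> 3, since 0.3 lies between edges 0.0 and 1.2

# Concatenating per-feature indices as decimal digits yields one integer state id.
# Multi-digit indices can collide (e.g. [1, 42, 7] and [14, 2, 7] both give 1427),
# which is a quirk of this simple encoding.
print(int("".join(str(int(i)) for i in [1, 4, 27, 9])))   # -> 14279

# With epsilon starting at 0.25 and decaying by 0.9999 over 50000 episodes,
# exploration is almost switched off by the end of training:
print(0.25 * 0.9999 ** 50000)                # ~0.0017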
# qlearningAgents.py
# ------------------
# based on http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html
import random
from collections import defaultdict


class QLearningAgent():
    """
    Q-Learning Agent

    Instance variables you have access to
      - self.epsilon  (exploration probability)
      - self.alpha    (learning rate)
      - self.discount (discount rate, aka gamma)

    Functions you should use
      - self.getLegalActions(state)
        which returns the legal actions for a state
      - self.getQValue(state, action)
        which returns Q(state, action)
      - self.setQValue(state, action, value)
        which sets Q(state, action) := value

    !!!Important!!!
    NOTE: please avoid using self._qValues directly to keep the code clean
    """

    def __init__(self, alpha, epsilon, discount, getLegalActions):
        "We initialize the agent and its Q-values here."
        self.getLegalActions = getLegalActions
        self._qValues = defaultdict(lambda: defaultdict(lambda: 0))
        self.alpha = alpha
        self.epsilon = epsilon
        self.discount = discount

    def getQValue(self, state, action):
        """
        Returns Q(state, action)
        """
        return self._qValues[state][action]

    def setQValue(self, state, action, value):
        """
        Sets the Q-value for (state, action) to the given value
        """
        self._qValues[state][action] = value

    # ---------------------# start of your code #---------------------#

    def getValue(self, state):
        """
        Returns max_action Q(state, action),
        where the max is over legal actions.
        """
        possibleActions = self.getLegalActions(state)

        # If there are no legal actions, return 0.0
        if len(possibleActions) == 0:
            return 0.0

        best_q = None
        for a in possibleActions:
            v = self.getQValue(state, a)
            if best_q is None or v > best_q:
                best_q = v
        return best_q

    def getPolicy(self, state):
        """
        Compute the best action to take in a state.
        """
        possibleActions = self.getLegalActions(state)

        # If there are no legal actions, return None
        if len(possibleActions) == 0:
            return None

        best_q = None
        best_action = None
        for a in possibleActions:
            v = self.getQValue(state, a)
            if best_q is None or v > best_q:
                best_q = v
                best_action = a
        return best_action

    def getAction(self, state):
        """
        Compute the action to take in the current state, including exploration.
        With probability self.epsilon we take a random action;
        otherwise we take the best policy action (self.getPolicy).
        """
        possibleActions = self.getLegalActions(state)
        action = None

        # If there are no legal actions, return None
        if len(possibleActions) == 0:
            return None

        # epsilon-greedy exploration
        if random.random() < self.epsilon:
            action = random.choice(possibleActions)
        else:
            action = self.getPolicy(state)

        return action

    def update(self, state, action, nextState, reward):
        """
        Q-value update. NOTE: you should never call this function directly;
        it is called on your behalf by the training loop.
        """
        gamma = self.discount
        learning_rate = self.alpha

        # The "target" Q-value uses the reward and the value of the next state.
        reference_qvalue = reward + gamma * self.getValue(nextState)
        updated_qvalue = (1 - learning_rate) * self.getQValue(state, action) \
            + learning_rate * reference_qvalue
        self.setQValue(state, action, updated_qvalue)

    # ---------------------# end of your code #---------------------#
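The update method implements the standard tabular Q-learning rule Q(s, a) := (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a')). Below is a small, hypothetical sanity check of that arithmetic, assuming the QLearningAgent class above is already in scope; the toy states "s0"/"s1" and the numbers are made up for illustration.

# Hypothetical toy check; states "s0"/"s1" and the values are illustrative only.
toy = QLearningAgent(alpha=0.5, epsilon=0.0, discount=0.9,
                     getLegalActions=lambda s: [0, 1])

toy.setQValue("s1", 0, 2.0)            # make action 0 the best action in the next state
toy.update("s0", 0, "s1", reward=1.0)  # one Q-learning update for (s0, action 0)

# Expected value: (1 - 0.5) * 0 + 0.5 * (1.0 + 0.9 * 2.0) = 1.4
print(toy.getQValue("s0", 0))          # -> 1.4
print(toy.getPolicy("s0"))             # -> 0, since Q("s0", 0) > Q("s0", 1) == 0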