Q Learner with Dyna
| """ | |
| Template for implementing QLearner (c) 2015 Tucker Balch | |
| Copyright 2018, Georgia Institute of Technology (Georgia Tech) | |
| Atlanta, Georgia 30332 | |
| All Rights Reserved | |
| Template code for CS 4646/7646 | |
| Georgia Tech asserts copyright ownership of this template and all derivative | |
| works, including solutions to the projects assigned in this course. Students | |
| and other users of this template code are advised not to share it with others | |
| or to make it available on publicly viewable websites including repositories | |
| such as github and gitlab. This copyright statement should not be removed | |
| or edited. | |
| We do grant permission to share solutions privately with non-students such | |
| as potential employers. However, sharing with other current or future | |
| students of CS 7646 is prohibited and subject to being investigated as a | |
| GT honor code violation. | |
| -----do not edit anything above this line--- | |
| Student Name: Alastair Paragas | |
| GT User ID: aparagas3 | |
| GT ID: 903475508 | |
| """ | |
import numpy as np
import random as rand


class QLearner(object):

    def __init__(
        self, num_states=100, num_actions=4, alpha=0.2,
        gamma=0.9, rar=0.5, radr=0.99, dyna=0, verbose=False
    ):
        self.verbose = verbose
        self.num_states = num_states
        self.num_actions = num_actions
        self.alpha = alpha
        self.gamma = gamma
        self.radr = radr
        self.dyna = dyna
        self.s = 0
        self.a = 0
        self.rar = rar
        # [state, action] -> Q value estimate
        self.q_table = np.zeros(shape=(num_states, num_actions))
        # [state, action, state_prime] -> encounter count; only the argmax
        # over state_prime is used, so an all-zero initialization is safe
        self.t_count_table = np.zeros(
            shape=(num_states, num_actions, num_states)
        )
        # [state, action] -> expected immediate reward
        self.r_table = np.zeros(shape=(num_states, num_actions))
    def author(self):
        return 'aparagas3'

    def get_next_action(self, s_prime):
        # Epsilon-greedy selection: explore with probability rar,
        # otherwise exploit the current Q-table estimate
        if rand.uniform(0, 1) <= self.rar:
            next_action = rand.randint(0, self.num_actions-1)
        else:
            next_action = np.argmax(self.q_table[s_prime])
        return next_action
    def hallucinate_experience(self):
        if self.dyna == 0:
            return
        # Sample random (state, action) pairs for the Dyna planning updates
        s_a_tuples = np.column_stack(
            (
                np.random.randint(0, self.num_states, size=self.dyna),
                np.random.randint(0, self.num_actions, size=self.dyna)
            )
        )
        # Most frequently observed next state for each (state, action)
        max_sprimes = np.argmax(self.t_count_table, axis=2)
        # Generate state, action, state' and reward imaginary experiences
        # from state and action tuples using transition probability cuboid of
        # <state, action, state'>
        experience_tuples = np.fromiter(
            (
                (s, a, max_sprimes[s, a], self.r_table[s, a])
                for s, a in s_a_tuples
            ),
            dtype='u8,u8,u8,f8'
        )
        for s, a, s_prime, r in experience_tuples:
            q_s_val = self.q_table[s, a]
            q_sprime_maxval = np.amax(self.q_table[s_prime])
            self.q_table[s, a] = ((1 - self.alpha) * q_s_val) + (
                self.alpha * (r + self.gamma * q_sprime_maxval)
            )
    def querysetstate(self, s):
        """
        @summary: Update the state without updating the Q-table
        @param s: The new state
        @returns: The selected action
        """
        self.s = s
        self.a = rand.randint(0, self.num_actions-1)
        if self.verbose:
            print "s =", s, "a =", self.a
        return self.a
    def query(self, s_prime, r):
        """
        @summary: Update the Q table and return an action
        @param s_prime: The new state
        @param r: real-valued immediate reward
        @returns: The selected action
        """
        next_action = self.get_next_action(s_prime)
        if self.verbose:
            print "s =", s_prime, "a =", next_action, "r =", r
        # Update q table
        q_s_val = self.q_table[self.s, self.a]
        q_sprime_maxval = np.amax(self.q_table[s_prime])
        self.q_table[self.s, self.a] = ((1 - self.alpha) * q_s_val) + (
            self.alpha * (r + self.gamma * q_sprime_maxval)
        )
        # Update t-table transition matrix (only with real-world experience)
        self.t_count_table[self.s, self.a, s_prime] += 1
        # Update r-table reward matrix (only with real-world experience)
        self.r_table[self.s, self.a] = (
            (1 - self.alpha) * self.r_table[self.s, self.a]
        ) + (self.alpha * r)
        # Update state, action and random action rate
        self.s = s_prime
        self.a = next_action
        self.rar = self.radr * self.rar
        # Dyna planning: replay hallucinated experiences from the learned model
        self.hallucinate_experience()
        return next_action

if __name__ == "__main__":
    print "Remember Q from Star Trek? Well, this isn't him"