Q Learner with Dyna
"""
Template for implementing QLearner (c) 2015 Tucker Balch
Copyright 2018, Georgia Institute of Technology (Georgia Tech)
Atlanta, Georgia 30332
All Rights Reserved
Template code for CS 4646/7646
Georgia Tech asserts copyright ownership of this template and all derivative
works, including solutions to the projects assigned in this course. Students
and other users of this template code are advised not to share it with others
or to make it available on publicly viewable websites including repositories
such as github and gitlab. This copyright statement should not be removed
or edited.
We do grant permission to share solutions privately with non-students such
as potential employers. However, sharing with other current or future
students of CS 7646 is prohibited and subject to being investigated as a
GT honor code violation.
-----do not edit anything above this line---
Student Name: Alastair Paragas
GT User ID: aparagas3
GT ID: 903475508
"""
import numpy as np
import random as rand
class QLearner(object):

    def __init__(
        self, num_states=100, num_actions=4, alpha=0.2,
        gamma=0.9, rar=0.5, radr=0.99, dyna=0, verbose=False
    ):
        self.verbose = verbose
        self.num_states = num_states
        self.num_actions = num_actions
        self.alpha = alpha
        self.gamma = gamma
        self.radr = radr
        self.dyna = dyna
        self.s = 0
        self.a = 0
        self.rar = rar
        # [state, action] -> learned Q value
        self.q_table = np.zeros(shape=(num_states, num_actions))
        # [state, action, state_prime] -> encounter count; counts are only
        # read back via argmax, so an all-zero initialization is safe here
        self.t_count_table = np.zeros(
            shape=(num_states, num_actions, num_states)
        )
        # [state, action] -> expected immediate reward
        self.r_table = np.zeros(shape=(num_states, num_actions))
    def author(self):
        return 'aparagas3'
    def get_next_action(self, s_prime):
        # Epsilon-greedy: explore with probability rar, otherwise exploit
        if rand.uniform(0, 1) <= self.rar:
            next_action = rand.randint(0, self.num_actions - 1)
        else:
            next_action = np.argmax(self.q_table[s_prime])
        return next_action
    def hallucinate_experience(self):
        if self.dyna == 0:
            return
        # Sample random (state, action) pairs for the Dyna planning updates
        s_a_tuples = np.column_stack(
            (
                np.random.randint(0, self.num_states, size=self.dyna),
                np.random.randint(0, self.num_actions, size=self.dyna)
            )
        )
        # Most frequently observed next state for each (state, action),
        # taken from the transition-count cuboid of <state, action, state'>
        max_sprimes = np.argmax(self.t_count_table, axis=2)
        # Replay imaginary <s, a, s', r> experiences drawn from the learned
        # transition counts and expected-reward model
        for s, a in s_a_tuples:
            s_prime = max_sprimes[s, a]
            r = self.r_table[s, a]
            q_s_val = self.q_table[s, a]
            q_sprime_maxval = np.amax(self.q_table[s_prime])
            self.q_table[s, a] = ((1 - self.alpha) * q_s_val) + (
                self.alpha * (r + self.gamma * q_sprime_maxval)
            )
    def querysetstate(self, s):
        """
        @summary: Update the state without updating the Q-table
        @param s: The new state
        @returns: The selected action
        """
        self.s = s
        self.a = rand.randint(0, self.num_actions - 1)
        if self.verbose:
            print("s = {}, a = {}".format(s, self.a))
        return self.a
    def query(self, s_prime, r):
        """
        @summary: Update the Q table and return an action
        @param s_prime: The new state
        @param r: real-valued immediate reward
        @returns: The selected action
        """
        next_action = self.get_next_action(s_prime)
        if self.verbose:
            print("s = {}, a = {}, r = {}".format(s_prime, next_action, r))
        # Update Q-table:
        # Q[s, a] <- (1 - alpha) * Q[s, a] + alpha * (r + gamma * max_a' Q[s', a'])
        q_s_val = self.q_table[self.s, self.a]
        q_sprime_maxval = np.amax(self.q_table[s_prime])
        self.q_table[self.s, self.a] = ((1 - self.alpha) * q_s_val) + (
            self.alpha * (r + self.gamma * q_sprime_maxval)
        )
        # Update t-table transition counts (only with real-world experience)
        self.t_count_table[self.s, self.a, s_prime] += 1
        # Update r-table reward estimates (only with real-world experience)
        self.r_table[self.s, self.a] = (
            (1 - self.alpha) * self.r_table[self.s, self.a]
        ) + (self.alpha * r)
        # Advance state/action and decay the random action rate
        self.s = s_prime
        self.a = next_action
        self.rar = self.radr * self.rar
        self.hallucinate_experience()
        return next_action
if __name__ == "__main__":
    print("Remember Q from Star Trek? Well, this isn't him")