Q Learner with Dyna
| """ | |
| Template for implementing QLearner (c) 2015 Tucker Balch | |
| Copyright 2018, Georgia Institute of Technology (Georgia Tech) | |
| Atlanta, Georgia 30332 | |
| All Rights Reserved | |
| Template code for CS 4646/7646 | |
| Georgia Tech asserts copyright ownership of this template and all derivative | |
| works, including solutions to the projects assigned in this course. Students | |
| and other users of this template code are advised not to share it with others | |
| or to make it available on publicly viewable websites including repositories | |
| such as github and gitlab. This copyright statement should not be removed | |
| or edited. | |
| We do grant permission to share solutions privately with non-students such | |
| as potential employers. However, sharing with other current or future | |
| students of CS 7646 is prohibited and subject to being investigated as a | |
| GT honor code violation. | |
| -----do not edit anything above this line--- | |
| Student Name: Alastair Paragas | |
| GT User ID: aparagas3 | |
| GT ID: 903475508 | |
| """ | |
import numpy as np
import random as rand


class QLearner(object):

    def __init__(
        self, num_states=100, num_actions=4, alpha=0.2,
        gamma=0.9, rar=0.5, radr=0.99, dyna=0, verbose=False
    ):
        self.verbose = verbose
        self.num_states = num_states
        self.num_actions = num_actions
        self.alpha = alpha
        self.gamma = gamma
        self.radr = radr
        self.dyna = dyna
        self.s = 0
        self.a = 0
        self.rar = rar
        # [state, action] -> Q value estimate
        self.q_table = np.zeros(shape=(num_states, num_actions))
        # [state, action, state_prime] -> encounter count; only the argmax
        # over state_prime is used, so an all-zero initialization is safe
        self.t_count_table = np.zeros(
            shape=(num_states, num_actions, num_states)
        )
        # [state, action] -> expected immediate reward
        self.r_table = np.zeros(shape=(num_states, num_actions))
    def author(self):
        return 'aparagas3'

    def get_next_action(self, s_prime):
        # Epsilon-greedy selection: explore with probability rar,
        # otherwise exploit the current Q-table estimate
        if rand.uniform(0, 1) <= self.rar:
            next_action = rand.randint(0, self.num_actions-1)
        else:
            next_action = np.argmax(self.q_table[s_prime])
        return next_action
    def hallucinate_experience(self):
        if self.dyna == 0:
            return
        # Sample random (state, action) pairs for the Dyna planning updates
        s_a_tuples = np.column_stack(
            (
                np.random.randint(0, self.num_states, size=self.dyna),
                np.random.randint(0, self.num_actions, size=self.dyna)
            )
        )
        # Most frequently observed next state for each (state, action)
        max_sprimes = np.argmax(self.t_count_table, axis=2)
        # Generate state, action, state' and reward imaginary experiences
        # from state and action tuples using transition probability cuboid of
        # <state, action, state'>
        experience_tuples = np.fromiter(
            (
                (s, a, max_sprimes[s, a], self.r_table[s, a])
                for s, a in s_a_tuples
            ),
            dtype='u8,u8,u8,f8'
        )
        for s, a, s_prime, r in experience_tuples:
            q_s_val = self.q_table[s, a]
            q_sprime_maxval = np.amax(self.q_table[s_prime])
            self.q_table[s, a] = ((1 - self.alpha) * q_s_val) + (
                self.alpha * (r + self.gamma * q_sprime_maxval)
            )
    def querysetstate(self, s):
        """
        @summary: Update the state without updating the Q-table
        @param s: The new state
        @returns: The selected action
        """
        self.s = s
        self.a = rand.randint(0, self.num_actions-1)
        if self.verbose:
            print "s =", s, "a =", self.a
        return self.a
    def query(self, s_prime, r):
        """
        @summary: Update the Q table and return an action
        @param s_prime: The new state
        @param r: real-valued immediate reward
        @returns: The selected action
        """
        next_action = self.get_next_action(s_prime)
        if self.verbose:
            print "s =", s_prime, "a =", next_action, "r =", r
        # Update q table
        q_s_val = self.q_table[self.s, self.a]
        q_sprime_maxval = np.amax(self.q_table[s_prime])
        self.q_table[self.s, self.a] = ((1 - self.alpha) * q_s_val) + (
            self.alpha * (r + self.gamma * q_sprime_maxval)
        )
        # Update t-table transition matrix (only with real-world experience)
        self.t_count_table[self.s, self.a, s_prime] += 1
        # Update r-table reward matrix (only with real-world experience)
        self.r_table[self.s, self.a] = (
            (1 - self.alpha) * self.r_table[self.s, self.a]
        ) + (self.alpha * r)
        # Update state, action and random action rate
        self.s = s_prime
        self.a = next_action
        self.rar = self.radr * self.rar
        # Dyna planning: replay hallucinated experiences from the learned model
        self.hallucinate_experience()
        return next_action

if __name__ == "__main__":
    print "Remember Q from Star Trek? Well, this isn't him"