"""
Implementation of the methods described by Richard Sutton in Chapter 2 of his book on reinforcement learning.
http://webdocs.cs.ualberta.ca/~sutton/book/ebook/the-book.html

Includes both greedy and softmax action selection as well as reinforcement comparison.
Requires Python 3, with numpy for the calculations and matplotlib for plotting.
"""
__author__ = 'Timothy Heys, [email protected]'

import itertools
from bisect import bisect

import numpy as np
import matplotlib.pyplot as plt


def experiment(select_function, n=10, trials=300, steps=500):
    """
    Run an experiment with the n-armed-bandit game.

    :param select_function: The select function, greedy or softmax
    :param n: The number of arms (actions) in each game
    :param trials: The number of trials to run
    :param steps: The number of steps in each trial
    """
    av_results = []
    for i in range(trials):
        print('Running trial #%d' % (1 + i))
        av_results.append(trial(Game(n=n, select_function=select_function), steps=steps))
    plt.plot(np.average(av_results, axis=0))
    plt.xlabel('Steps')
    plt.ylabel('Reward as a fraction of the best available reward')
    plt.show()
    print('done')


def trial(game, steps=500):
    """
    Play a single game for the given number of steps and return, for each step,
    the reward obtained as a fraction of the best reward available on that step.
    """
    for i in range(steps):
        game.play()
    return [reward / max_reward for reward, max_reward in zip(game.rewards, game.max_rewards)]


class Game(object):
    """
    The n-arm-bandit game. http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node15.html
    """

    def __init__(self, select_function, n=500, learn_rate=0.1, greedy=0.1, comparison_rate=None, bias=0.0):
        self.n = n
        # The true values of the n bandit arms, drawn from a standard normal distribution
        self.true_values = np.random.randn(n)
        # Initial action-value estimates
        self.weights = np.random.rand(n) * bias
        self.comparison = np.zeros(n)
        # Initial stats
        self.rewards = []
        self.max_rewards = []
        self.plays = 0
        # Step-size (learning-rate) parameter for the value updates
        self.learn_rate = learn_rate
        # Reference-reward step size; when set, reinforcement comparison is used
        self.comparison_rate = comparison_rate
        # Exploration parameter passed to the select function (epsilon or tau);
        # lower values choose actions more greedily
        self.greedy = greedy
        self.select_function = select_function
    def play(self):
        # Reward vector for this turn
        rewards = np.random.normal(0.5, 0.25, self.n) * self.true_values
        # Select an action
        action_idx = self.select_function(self.weights, self.greedy)
        # Update estimates
        if not self.comparison_rate:
            # Action-value update: the estimate moves toward the observed reward, Q <- Q + alpha * (r - Q)
            # http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node18.html
            self.weights[action_idx] += self.learn_rate * (rewards[action_idx] - self.weights[action_idx])
        else:
            # Reinforcement comparison: the preference is updated against a reference reward,
            # which itself tracks the observed rewards.
            # http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node22.html
            self.weights[action_idx] += self.learn_rate * (rewards[action_idx] - self.comparison[action_idx])
            self.comparison[action_idx] += self.comparison_rate * (rewards[action_idx] - self.comparison[action_idx])
        # Track the obtained reward and the best available reward, shifted so both are non-negative
        np_min = np.min(rewards)
        self.rewards.append(rewards[action_idx] - np_min)
        self.max_rewards.append(np.max(rewards) - np_min)
        self.plays += 1


def greedy_select(q, epsilon=0.1):
    """
    Epsilon-greedy select. http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node16.html

    :param q: Action values to select from
    :param epsilon: The probability of exploring, i.e. of selecting an action other than the greedy one
    :return: The index of q that was selected
    """
    greedy = epsilon < np.random.rand()
    argmax = np.argmax(q)
    if greedy:
        return argmax
    # Explore: choose uniformly among the non-greedy actions
    action_values = np.ones(q.shape)
    action_values[argmax] = 0.
    return select_from_dist(action_values / (q.size - 1))
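

# A minimal sketch, not part of the original gist: with epsilon-greedy selection the greedy
# action should be chosen on roughly a (1 - epsilon) fraction of plays. The helper name
# `_greedy_select_demo` and the toy values below are illustrative only.
def _greedy_select_demo(epsilon=0.1, plays=10000):
    q_demo = np.array([0.1, 0.5, 0.9])
    picks = np.array([greedy_select(q_demo, epsilon) for _ in range(plays)])
    # Empirical frequency of the greedy action (index 2); expected to be close to 1 - epsilon
    return np.mean(picks == np.argmax(q_demo))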


def softmax_select(q, tau=0.1):
    """
    Softmax select. http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node17.html

    :param q: Action values to select from
    :param tau: The temperature; lower values increase the probability of selecting the greedy option
    :return: The index of q that was selected
    """
    action_values = np.exp(q / tau) / np.sum(np.exp(q / tau))
    return select_from_dist(action_values)
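

# A minimal sketch, not part of the original gist: the softmax (Gibbs/Boltzmann) probabilities
# concentrate on the greedy action as tau shrinks and approach a uniform distribution as tau
# grows. The helper name `_softmax_demo` and the toy values are illustrative only.
def _softmax_demo(tau=0.1):
    q_demo = np.array([1.0, 2.0, 3.0])
    # Same computation as in softmax_select, but returning the probabilities themselves
    return np.exp(q_demo / tau) / np.sum(np.exp(q_demo / tau))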


def select_from_dist(action_values):
    """
    Chooses an index from a probability distribution vector.

    :param action_values: A vector of probabilities, which must be normalized so that sum(action_values) == 1
    :return: The index of action_values that was selected
    """
    cdf = list(itertools.accumulate(action_values))
    return bisect(cdf, np.random.rand())
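

# A minimal sketch, not part of the original gist, showing how a single Game can be driven
# directly with reinforcement comparison enabled (comparison_rate set). The helper name
# `_reinforcement_comparison_demo` and the parameter values are illustrative only.
def _reinforcement_comparison_demo(steps=100):
    game = Game(select_function=softmax_select, n=10, learn_rate=0.1, comparison_rate=0.1)
    for _ in range(steps):
        game.play()
    # Reward obtained on each step as a fraction of the best reward available on that step
    return [reward / max_reward for reward, max_reward in zip(game.rewards, game.max_rewards)]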


if __name__ == "__main__":
    # experiment(greedy_select)
    experiment(softmax_select, n=100)