Implementation of the methods described by Richard Sutton in Chapter 2 of his book on reinforcement learning. Includes both epsilon-greedy and softmax action selection as well as reinforcement comparison. Requires Python 3, with NumPy for the calculations and matplotlib for plotting.
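As a quick illustration, here is a minimal usage sketch (not part of the original gist) showing how the `experiment` function defined in the file below can be run with either selection strategy; the `n`, `trials` and `steps` values simply echo the function's defaults:

# Minimal usage sketch; experiment, greedy_select and softmax_select come from the file below.
experiment(greedy_select, n=10, trials=300, steps=500)   # epsilon-greedy action selection
experiment(softmax_select, n=10, trials=300, steps=500)  # softmax (Boltzmann) action selection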
""" | |
Implementation of the methods described by Richard Sutton in Chapter 2 of his book on reinforcement learning. | |
http://webdocs.cs.ualberta.ca/~sutton/book/ebook/the-book.html | |
""" | |
__author__ = 'Timothy Heys, [email protected]' | |
import itertools | |
from bisect import bisect | |
import numpy as np | |
import matplotlib.pyplot as plt | |
def experiment(select_function, n=10, trials=300, steps=500):
    """
    Run an experiment with the n-armed-bandit game.

    :param select_function: The action-selection function, greedy or softmax
    :param n: The number of bandit arms
    :param trials: The number of trials to run
    :param steps: The number of steps (plays) in each trial
    """
    av_results = []
    for i in range(trials):
        print('Running trial #%d' % (1 + i))
        av_results.append(trial(Game(n=n, select_function=select_function), steps=steps))
    # Average the per-step results across all trials and plot the learning curve
    plt.plot(np.average(av_results, axis=0))
    plt.xlabel('Steps')
    plt.ylabel('% of Maximum Reward')
    plt.show()
    print('done')
def trial(game, steps=500):
    """
    Play one trial of `steps` plays and return, for each step, the fraction of the
    maximum available reward that was actually obtained.
    """
    for i in range(steps):
        game.play()
    return list(map(
        lambda tup: tup[0] / tup[1],
        zip(game.rewards, game.max_rewards)
    ))
class Game(object):
    """
    The n-armed-bandit game. http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node15.html
    """

    def __init__(self, select_function, n=500, learn_rate=0.1, greedy=0.1, comparison_rate=None, bias=0.0):
        self.n = n
        # The n-armed bandit's true action values (one per arm)
        self.true_values = np.random.randn(n)
        # Initial action-value estimates, optionally biased away from zero
        self.weights = np.random.rand(n) * bias
        # Per-arm reference rewards used by reinforcement comparison
        self.comparison = np.zeros(n)
        # Initial stats
        self.rewards = []
        self.max_rewards = []
        self.plays = 0
        # Step size for the action-value updates
        self.learn_rate = learn_rate
        # Step size for the reference-reward updates; None disables reinforcement comparison
        self.comparison_rate = comparison_rate
        # Exploration parameter passed to the selection function (epsilon or tau);
        # lower values will choose actions more greedily
        self.greedy = greedy
        self.select_function = select_function
    def play(self):
        """Play one round: draw rewards, select an arm, and update the estimates."""
        # Reward vector for this turn
        rewards = np.random.normal(0.5, 0.25, self.n) * self.true_values
        # Select action
        action_idx = self.select_function(self.weights, self.greedy)
        # Update estimates
        if self.comparison_rate is None:
            # Incremental action-value update with a constant step size
            # http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node18.html
            self.weights[action_idx] += self.learn_rate * (rewards[action_idx] - self.weights[action_idx])
        else:
            # Reinforcement comparison: update the preference against a reference reward,
            # then move the reference reward toward the observed reward
            # http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node22.html
            self.weights[action_idx] += self.learn_rate * (rewards[action_idx] - self.comparison[action_idx])
            self.comparison[action_idx] += self.comparison_rate * (rewards[action_idx] - self.comparison[action_idx])
        # Track the obtained and best available rewards, shifted so both are non-negative
        np_min = np.min(rewards)
        self.rewards.append(rewards[action_idx] - np_min)
        self.max_rewards.append(np.max(rewards) - np_min)
        self.plays += 1
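    # Illustrative note (not part of the original gist): the default branch above is the
    # constant-step-size incremental rule Q_new = Q_old + learn_rate * (r - Q_old); e.g. with
    # Q_old = 0.5, r = 1.0 and learn_rate = 0.1 the estimate moves to 0.55. With a
    # comparison_rate set, the preference is instead nudged by (r - reference reward) and the
    # per-arm reference reward itself tracks the observed rewards.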
def greedy_select(q, epsilon=0.1):
    """
    Epsilon-greedy select. http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node16.html

    :param q: Action values to select from
    :param epsilon: The probability of exploring an option other than the greedy option
    :return: The idx of q that was selected
    """
    greedy = epsilon < np.random.rand()
    argmax = np.argmax(q)
    if greedy:
        return argmax
    # Explore: choose uniformly at random among the non-greedy actions
    action_values = np.ones(q.shape)
    action_values[argmax] = 0.
    return select_from_dist(action_values / (q.size - 1))
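# Illustrative note (not part of the original gist): with epsilon=0.1 the greedy arm is
# picked with probability 0.9, and the remaining 0.1 is spread uniformly over the other
# q.size - 1 arms, e.g. 0.1 / 9 ≈ 0.011 each when q has 10 entries.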
def softmax_select(q, tau=0.1):
    """
    Softmax (Gibbs/Boltzmann) select. http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node17.html

    :param q: Action values to select from
    :param tau: The temperature parameter; lower values increase the probability of selecting the greedy option
    :return: The idx of q that was selected
    """
    # Convert the action values into a Boltzmann distribution and sample from it
    action_values = np.exp(q / tau) / np.sum(np.exp(q / tau))
    return select_from_dist(action_values)
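# Illustrative note (not part of the original gist): for q = [0.1, 0.2, 0.3] and tau = 0.1,
# exp(q / tau) = [e**1, e**2, e**3] ≈ [2.72, 7.39, 20.09], so the selection probabilities
# are roughly [0.09, 0.24, 0.67] -- the highest-valued arm dominates but is not certain.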
def select_from_dist(action_values):
    """
    Chooses an index at random according to a probability distribution vector.

    :param action_values: A vector of probabilities, which must be normalized so that sum(action_values) == 1
    :return: The idx of action_values that was selected
    """
    # Build the cumulative distribution and locate a uniform random draw within it
    cdf = list(itertools.accumulate(action_values))
    return bisect(cdf, np.random.rand())
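# Illustrative note (not part of the original gist): for action_values = [0.2, 0.3, 0.5]
# the cdf is [0.2, 0.5, 1.0]; a random draw of 0.6 falls between 0.5 and 1.0, so
# bisect returns index 2 -- each index is chosen with its stated probability.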
if __name__ == "__main__":
    # experiment(greedy_select)
    experiment(softmax_select, n=100)