"""
Implementation of the methods described by Richard Sutton in Chapter 2 of his book on reinforcement learning.
http://webdocs.cs.ualberta.ca/~sutton/book/ebook/the-book.html

Includes both greedy and softmax action selection as well as reinforcement comparison.
Requires Python 3, with numpy for the calculations and matplotlib for plotting.
"""
__author__ = 'Timothy Heys, [email protected]'

import itertools
from bisect import bisect

import numpy as np
import matplotlib.pyplot as plt


def experiment(select_function, n=10, trials=300, steps=500):
    """
    Run an experiment with the n-armed-bandit game.

    :param select_function: The select function, greedy or softmax
    :param n: The number of arms (actions) in each game
    :param trials: The number of trials to run
    :param steps: The number of steps in each trial
    """
    av_results = []
    for i in range(trials):
        print('Running trial #%d' % (1 + i))
        av_results.append(trial(Game(n=n, select_function=select_function), steps=steps))
    plt.plot(np.average(av_results, axis=0))
    plt.xlabel('Steps')
    plt.ylabel('Reward as a fraction of the best available reward')
    plt.show()
    print('done')


def trial(game, steps=500):
    """
    Play a single game for the given number of steps and return, for each step,
    the reward obtained as a fraction of the best reward available on that step.
    """
    for i in range(steps):
        game.play()
    return [reward / max_reward for reward, max_reward in zip(game.rewards, game.max_rewards)]


class Game(object):
    """
    The n-arm-bandit game. http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node15.html
    """

    def __init__(self, select_function, n=500, learn_rate=0.1, greedy=0.1, comparison_rate=None, bias=0.0):
        self.n = n
        # The true values of the n bandit arms, drawn from a standard normal distribution
        self.true_values = np.random.randn(n)
        # Initial action-value estimates
        self.weights = np.random.rand(n) * bias
        self.comparison = np.zeros(n)
        # Initial stats
        self.rewards = []
        self.max_rewards = []
        self.plays = 0
        # Step-size (learning-rate) parameter for the value updates
        self.learn_rate = learn_rate
        # Reference-reward step size; when set, reinforcement comparison is used
        self.comparison_rate = comparison_rate
        # Exploration parameter passed to the select function (epsilon or tau);
        # lower values choose actions more greedily
        self.greedy = greedy
        self.select_function = select_function
    def play(self):
        # Reward vector for this turn
        rewards = np.random.normal(0.5, 0.25, self.n) * self.true_values
        # Select an action
        action_idx = self.select_function(self.weights, self.greedy)
        # Update estimates
        if not self.comparison_rate:
            # Action-value update: the estimate moves toward the observed reward, Q <- Q + alpha * (r - Q)
            # http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node18.html
            self.weights[action_idx] += self.learn_rate * (rewards[action_idx] - self.weights[action_idx])
        else:
            # Reinforcement comparison: the preference is updated against a reference reward,
            # which itself tracks the observed rewards.
            # http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node22.html
            self.weights[action_idx] += self.learn_rate * (rewards[action_idx] - self.comparison[action_idx])
            self.comparison[action_idx] += self.comparison_rate * (rewards[action_idx] - self.comparison[action_idx])
        # Track the obtained reward and the best available reward, shifted so both are non-negative
        np_min = np.min(rewards)
        self.rewards.append(rewards[action_idx] - np_min)
        self.max_rewards.append(np.max(rewards) - np_min)
        self.plays += 1


def greedy_select(q, epsilon=0.1):
    """
    Epsilon-greedy select. http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node16.html

    :param q: Action values to select from
    :param epsilon: The probability of exploring, i.e. of selecting an action other than the greedy one
    :return: The index of q that was selected
    """
    greedy = epsilon < np.random.rand()
    argmax = np.argmax(q)
    if greedy:
        return argmax
    # Explore: choose uniformly among the non-greedy actions
    action_values = np.ones(q.shape)
    action_values[argmax] = 0.
    return select_from_dist(action_values / (q.size - 1))
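

# A minimal sketch, not part of the original gist: with epsilon-greedy selection the greedy
# action should be chosen on roughly a (1 - epsilon) fraction of plays. The helper name
# `_greedy_select_demo` and the toy values below are illustrative only.
def _greedy_select_demo(epsilon=0.1, plays=10000):
    q_demo = np.array([0.1, 0.5, 0.9])
    picks = np.array([greedy_select(q_demo, epsilon) for _ in range(plays)])
    # Empirical frequency of the greedy action (index 2); expected to be close to 1 - epsilon
    return np.mean(picks == np.argmax(q_demo))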


def softmax_select(q, tau=0.1):
    """
    Softmax select. http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node17.html

    :param q: Action values to select from
    :param tau: The temperature; lower values increase the probability of selecting the greedy option
    :return: The index of q that was selected
    """
    action_values = np.exp(q / tau) / np.sum(np.exp(q / tau))
    return select_from_dist(action_values)
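

# A minimal sketch, not part of the original gist: the softmax (Gibbs/Boltzmann) probabilities
# concentrate on the greedy action as tau shrinks and approach a uniform distribution as tau
# grows. The helper name `_softmax_demo` and the toy values are illustrative only.
def _softmax_demo(tau=0.1):
    q_demo = np.array([1.0, 2.0, 3.0])
    # Same computation as in softmax_select, but returning the probabilities themselves
    return np.exp(q_demo / tau) / np.sum(np.exp(q_demo / tau))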


def select_from_dist(action_values):
    """
    Chooses an index from a probability distribution vector.

    :param action_values: A vector of probabilities, which must be normalized so that sum(action_values) == 1
    :return: The index of action_values that was selected
    """
    cdf = list(itertools.accumulate(action_values))
    return bisect(cdf, np.random.rand())
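

# A minimal sketch, not part of the original gist, showing how a single Game can be driven
# directly with reinforcement comparison enabled (comparison_rate set). The helper name
# `_reinforcement_comparison_demo` and the parameter values are illustrative only.
def _reinforcement_comparison_demo(steps=100):
    game = Game(select_function=softmax_select, n=10, learn_rate=0.1, comparison_rate=0.1)
    for _ in range(steps):
        game.play()
    # Reward obtained on each step as a fraction of the best reward available on that step
    return [reward / max_reward for reward, max_reward in zip(game.rewards, game.max_rewards)]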


if __name__ == "__main__":
    # experiment(greedy_select)
    experiment(softmax_select, n=100)