Reinforcement Learning: N-Armed Bandit
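An epsilon-greedy simulation of the classic n-armed bandit testbed: for each epsilon in {0, 0.01, 0.1} the script runs 2000 independent 10-armed bandit problems of 1000 plays each, tracks how often the optimal arm is chosen and the average reward per play, and plots both curves for comparison.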
from __future__ import division
import random
import numpy as np
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt

# epsilon values
EPSILON_VALUES = [0.0, 0.01, 0.1]
# number of plays per sample
N_PLAYS = 1000
# number of samples per epsilon
N_SAMPLES = 2000
# number of options (arms)
N_ARMS = 10
# the mean of the distribution from which the biases are drawn
BIAS_MEAN = 0.0
# the standard deviation of the distribution from which the biases are drawn
BIAS_SIGMA = 1.0
# the standard deviation of the random reward from the arms
ARM_SIGMA = 1.0
# initial reward estimate (a high value here would encourage optimistic exploration)
INIT_REWARD = 0.0

total_rewards = np.zeros((N_PLAYS, len(EPSILON_VALUES)))
optimal_action = np.zeros((N_PLAYS, len(EPSILON_VALUES)))

# each model is a different epsilon value
for model in range(len(EPSILON_VALUES)):
    epsilon = EPSILON_VALUES[model]
    for sample in range(N_SAMPLES):
        # random bias per arm
        bias = np.random.normal(BIAS_MEAN, BIAS_SIGMA, N_ARMS)
        # find optimal index
        best_index = np.argmax(bias)
        # estimated rewards for each arm
        rewards = np.zeros(N_ARMS) + INIT_REWARD
        # number of times each arm was played
        played = np.zeros(N_ARMS)
        for play in range(N_PLAYS):
            # determine if choice is stochastic
            if random.uniform(0, 1) > epsilon:
                # deterministic case:
                # select action with highest estimated reward
                choice = np.argmax(rewards)
            else:
                # stochastic case:
                # perform exploratory action
                choice = random.randrange(N_ARMS)
            played[choice] += 1
            # reward is the arm's bias plus Gaussian noise with standard deviation ARM_SIGMA
            current_reward = bias[choice] + random.gauss(0.0, ARM_SIGMA)
            total_rewards[play, model] += current_reward
            # is this the optimal choice?
            if choice == best_index:
                optimal_action[play, model] += 1
            # update estimated reward as a running sample average
            if played[choice] >= 2:
                rewards[choice] = (rewards[choice] * (played[choice] - 1) + current_reward) / played[choice]
            else:
                rewards[choice] = current_reward

# percentage optimal action was chosen per play
percent_optimal_action = optimal_action / N_SAMPLES
# average reward per play
average_reward = total_rewards / N_SAMPLES

# plot results
x = np.arange(0, N_PLAYS)
fig = plt.figure(figsize=(15.0, 15.0))
fig.suptitle("%d-Armed Bandit" % N_ARMS)
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212)
for model in range(len(EPSILON_VALUES)):
    epsilon = EPSILON_VALUES[model]
    y1 = percent_optimal_action[:, model]
    ax1.plot(x, y1, label='epsilon=%.3f' % epsilon)
    y2 = average_reward[:, model]
    ax2.plot(x, y2, label='epsilon=%.3f' % epsilon)
ax1.set_xlabel('Plays')
ax1.set_ylabel('% Optimal Action')
ax1.legend()
ax2.set_xlabel('Plays')
ax2.set_ylabel('Average Reward')
ax2.legend()
plt.show()
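Running the script requires NumPy and Matplotlib with a Tk backend available (or swap 'TkAgg' for another backend). With these settings one would typically expect the exploring agents (epsilon=0.01 and epsilon=0.1) to overtake the purely greedy agent (epsilon=0) in both % optimal action and average reward as plays accumulate, since the greedy agent can lock onto a suboptimal arm early.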