@fabrizioc1
Created June 3, 2019 06:34
Reinforcement Learning: N-Armed Bandit
from __future__ import division
import random
import numpy as np
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
# epsilon values
EPSILON_VALUES = [0.0, 0.01, 0.1]
# number of plays per sample
N_PLAYS = 1000
# number of samples per epsilon
N_SAMPLES = 2000
# number of options (arms)
N_ARMS = 10
# the mean of the distribution from which the biases are drawn
BIAS_MEAN = 0.0
# the standard deviation of the distribution from which the biases are drawn
BIAS_SIGMA = 1.0
# the standard deviation of the random reward from the arms
ARM_SIGMA = 1.0
# initial reward estimate for each arm (a high, optimistic value would encourage extra exploration)
INIT_REWARD = 0.0
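# accumulators: per-play sums of reward and optimal-action counts, one column per epsilon value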
total_rewards = np.zeros((N_PLAYS, len(EPSILON_VALUES)))
optimal_action = np.zeros((N_PLAYS, len(EPSILON_VALUES)))
# each model is a different epsilon value
for model in range(len(EPSILON_VALUES)):
    epsilon = EPSILON_VALUES[model]
    for sample in range(N_SAMPLES):
        # random bias per arm
        bias = np.random.normal(BIAS_MEAN, BIAS_SIGMA, N_ARMS)
        # find the optimal arm
        best_index = np.argmax(bias)
        # estimated reward for each arm
        rewards = np.zeros(N_ARMS) + INIT_REWARD
        # number of times each arm was played
        played = np.zeros(N_ARMS)
        for play in range(N_PLAYS):
            # determine if the choice is exploratory
            if random.uniform(0, 1) > epsilon:
                # greedy case:
                # select the action with the highest estimated reward
                choice = np.argmax(rewards)
            else:
                # exploratory case:
                # select a random action
                choice = random.randrange(N_ARMS)
            played[choice] += 1
            # reward is the arm's bias plus Gaussian noise with standard deviation ARM_SIGMA
            current_reward = bias[choice] + random.gauss(0.0, ARM_SIGMA)
            total_rewards[play, model] += current_reward
            # is this the optimal choice?
            if choice == best_index:
                optimal_action[play, model] += 1
            # update the estimated reward (sample average)
            if played[choice] >= 2:
                rewards[choice] = (rewards[choice] * (played[choice] - 1) + current_reward) / played[choice]
            else:
                rewards[choice] = current_reward
# percentage of samples in which the optimal action was chosen, per play
percent_optimal_action = 100.0 * optimal_action / N_SAMPLES
# average reward per play
average_reward = total_rewards / N_SAMPLES
# plot results
x = np.arange(0, N_PLAYS)
fig = plt.figure(figsize=(15.0, 15.0))
fig.suptitle("%d-Armed Bandit" % N_ARMS)
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212)
for model in range(len(EPSILON_VALUES)):
    epsilon = EPSILON_VALUES[model]
    y1 = percent_optimal_action[:, model]
    ax1.plot(x, y1, label='epsilon=%.3f' % epsilon)
    y2 = average_reward[:, model]
    ax2.plot(x, y2, label='epsilon=%.3f' % epsilon)
ax1.set_xlabel('Plays')
ax1.set_ylabel('% Optimal Action')
ax1.legend()
ax2.set_xlabel('Plays')
ax2.set_ylabel('Average Reward')
ax2.legend()
plt.show()
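A note on the update step above: the sample-average computation in the inner loop is equivalent to the standard incremental form Q_n = Q_{n-1} + (R_n - Q_{n-1}) / n, which avoids recomputing the weighted sum each play. A minimal sketch of that equivalent update (the helper name update_estimate is illustrative, not part of the gist):

def update_estimate(q_old, reward, n):
    # incremental sample-average update: the estimate moves toward the
    # observed reward by a step of 1/n, where n counts plays of this arm
    # (including the current one)
    return q_old + (reward - q_old) / n

# e.g. the loop body above could use:
#   rewards[choice] = update_estimate(rewards[choice], current_reward, played[choice])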
@fabrizioc1 (Author): [output plot: N_Armed_Bandit]