Skip to content

Instantly share code, notes, and snippets.

@sergeyf
Forked from anonymous/bandit_simulations.py
Last active March 15, 2017 06:11
Show Gist options
  • Save sergeyf/63a37a5db18f8cebf962 to your computer and use it in GitHub Desktop.
Save sergeyf/63a37a5db18f8cebf962 to your computer and use it in GitHub Desktop.
import numpy as np
from matplotlib import pylab as plt
#from mpltools import style # uncomment for prettier plots
#style.use(['ggplot'])
# generate all bernoulli rewards ahead of time
def generate_bernoulli_bandit_data(num_samples,K):
CTRs_that_generated_data = np.tile(np.random.rand(K),(num_samples,1))
true_rewards = np.random.rand(num_samples,K) < CTRs_that_generated_data
return true_rewards,CTRs_that_generated_data
# totally random
def random(observed_data):
return np.random.randint(0,len(observed_data))
# the naive algorithm
def naive(observed_data,number_to_explore=100):
totals = observed_data.sum(1) # totals
if np.any(totals < number_to_explore): # if have been explored less than specified
least_explored = np.argmin(totals) # return the one least explored
return least_explored
else: # return the best mean forever
successes = observed_data[:,0] # successes
estimated_means = successes/totals # the current means
best_mean = np.argmax(estimated_means) # the best mean
return best_mean
# the epsilon greedy algorithm
def epsilon_greedy(observed_data,epsilon=0.01):
totals = observed_data.sum(1) # totals
successes = observed_data[:,0] # successes
estimated_means = successes/totals # the current means
best_mean = np.argmax(estimated_means) # the best mean
be_exporatory = np.random.rand() < epsilon # should we explore?
if be_exporatory: # totally random, excluding the best_mean
other_choice = np.random.randint(0,len(observed_data))
while other_choice == best_mean:
other_choice = np.random.randint(0,len(observed_data))
return other_choice
else: # take the best mean
return best_mean
# the UCB algorithm using
# (1 - 1/t) confidence interval using Chernoff-Hoeffding bound)
# for details of this particular confidence bound, see the UCB1-TUNED section, slide 18, of:
# http://lane.compbio.cmu.edu/courses/slides_ucb.pdf
def UCB(observed_data):
t = float(observed_data.sum()) # total number of rounds so far
totals = observed_data.sum(1)
successes = observed_data[:,0]
estimated_means = successes/totals # sample mean
estimated_variances = estimated_means - estimated_means**2
UCB = estimated_means + np.sqrt( np.minimum( estimated_variances + np.sqrt(2*np.log(t)/totals), 0.25 ) * np.log(t)/totals )
return np.argmax(UCB)
# the UCB algorithm - using fixed 95% confidence intervals
# see slide 8 for details:
# http://dept.stat.lsa.umich.edu/~kshedden/Courses/Stat485/Notes/binomial_confidence_intervals.pdf
def UCB_normal(observed_data):
totals = observed_data.sum(1) # totals
successes = observed_data[:,0] # successes
estimated_means = successes/totals # sample mean
estimated_variances = estimated_means - estimated_means**2
UCB = estimated_means + 1.96*np.sqrt(estimated_variances/totals)
return np.argmax(UCB)
# Thompson Sampling
# http://www.economics.uci.edu/~ivan/asmb.874.pdf
# http://camdp.com/blogs/multi-armed-bandits
def thompson_sampling(observed_data):
return np.argmax( np.random.beta(observed_data[:,0], observed_data[:,1]) )
# the bandit algorithm
def run_bandit_alg(true_rewards,CTRs_that_generated_data,choice_func):
num_samples,K = true_rewards.shape
# seed the estimated params
prior_a = 1. # aka successes
prior_b = 1. # aka failures
observed_data = np.zeros((K,2))
observed_data[:,0] += prior_a # allocating the initial conditions
observed_data[:,1] += prior_b
regret = np.zeros(num_samples)
for i in range(0,num_samples):
# pulling a lever & updating observed_data
this_choice = choice_func(observed_data)
# update parameters
if true_rewards[i,this_choice] == 1:
update_ind = 0
else:
update_ind = 1
observed_data[this_choice,update_ind] += 1
# updated expected regret
regret[i] = np.max(CTRs_that_generated_data[i,:]) - CTRs_that_generated_data[i,this_choice]
cum_regret = np.cumsum(regret)
return cum_regret
# define number of samples and number of choices
num_samples = 10000
K = 5
number_experiments = 100
regret_accumulator = np.zeros((num_samples,5))
for i in range(number_experiments):
print "Running experiment:", i+1
true_rewards,CTRs_that_generated_data = generate_bernoulli_bandit_data(num_samples,K)
regret_accumulator[:,0] += run_bandit_alg(true_rewards,CTRs_that_generated_data,random)
regret_accumulator[:,1] += run_bandit_alg(true_rewards,CTRs_that_generated_data,naive)
regret_accumulator[:,2] += run_bandit_alg(true_rewards,CTRs_that_generated_data,epsilon_greedy)
regret_accumulator[:,3] += run_bandit_alg(true_rewards,CTRs_that_generated_data,UCB)
regret_accumulator[:,4] += run_bandit_alg(true_rewards,CTRs_that_generated_data,UCB_normal)
plt.semilogy(regret_accumulator/number_experiments)
plt.title('Simulated Bandit Performance for K = 5')
plt.ylabel('Cumulative Expected Regret')
plt.xlabel('Round Index')
plt.legend(('Random','Naive','Epsilon-Greedy','(1 - 1/t) UCB','95% UCB'),loc='lower right')
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment