conormm · September 21, 2018 16:38
diff --git a/bandits.py b/bandits.py

 # coding: utf-8

 # In[150]:


 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns

 # Use seaborn's white grid theme for every figure in this script.
 sns.set_style("whitegrid")
 # get_ipython() is injected by IPython/Jupyter only. When this exported
 # notebook is run with plain `python` the name does not exist, so skip the
 # inline-plot magic instead of crashing with NameError.
 try:
     get_ipython().run_line_magic('matplotlib', 'inline')
 except NameError:
     pass


 # In[25]:


 # Four slot machines ("arms"). A pull on machine k pays out 1 with
 # probability payoffs[k] and 0 otherwise.
 payoffs = [0.07, 0.11, 0.22, 0.24]
 machines = list(range(4))


 # In[26]:


 # Baseline strategy: on every pull choose one of the four machines uniformly
 # at random and add its 0/1 payout to the running total.
 payouts = []
 n_payouts_random = 0
 _arms = [0, 1, 2, 3]
 for i in range(10_000):
     machine = np.random.choice(_arms)
     # Bernoulli draw with the chosen machine's payout probability.
     m = np.random.binomial(1, p=payoffs[machine])
     n_payouts_random = n_payouts_random + m

 print(f"Sum of the reward is: {n_payouts_random}")


 # In[275]:


 # Epsilon-greedy bandit.
 #
 # Phase 1 (first n_learning pulls): choose a machine uniformly at random to
 # gather an initial payout estimate for every machine.
 # Phase 2: play the machine with the highest estimated payout rate, but with
 # probability e (2%) pull a random machine instead so estimates keep improving.
 #
 # Results left in the namespace:
 #   successes        - payouts observed per machine
 #   m_chosen         - number of pulls per machine
 #   m_prob           - estimated payout rate per machine (successes / m_chosen)
 #   n_payouts_greedy - total reward collected
 #   regret           - per-trial expected regret against the true payout rates
 n_learning = 2000
 payoffs = [0.07, 0.11, 0.10, 0.08]
 e = 0.02
 n_trials = 10000
 n_machines = len(payoffs)

 successes = np.zeros(n_machines)
 m_chosen = np.zeros(n_machines)
 m_prob = np.zeros(n_machines)
 n_payouts_greedy = 0
 t = 0  # number of trials played so far
 regret = np.zeros(n_trials)

 for i in range(n_trials):
     t += 1
     payoffs[3] = .15 if i % 3 == 0 else .007  # add variability to payouts
     # Explore during the learning phase and, afterwards, with probability e;
     # otherwise exploit the current best estimate.
     if i < n_learning or np.random.uniform(0, 1) < e:
         machine = np.random.choice(n_machines)
     else:
         machine = np.argmax(m_prob)
     # Play the machine: payout is a single Bernoulli draw.
     m = np.random.binomial(1, p=payoffs[machine])
     # Count the pull before dividing so the estimate never does 0/0, and
     # record every payout so successes / m_chosen stays unbiased.
     m_chosen[machine] += 1
     successes[machine] += m
     m_prob[machine] = successes[machine] / m_chosen[machine]
     # count total reward (sum of payouts)
     n_payouts_greedy += m
     # Expected regret of this pull: best true payout probability available
     # on this trial minus that of the machine actually played.
     regret[i] = max(payoffs) - payoffs[machine]

 print(f"Sum of the reward is: {n_payouts_greedy}")


 # In[248]:


 # Scratch cell: print the arrays `a` and `b`.
 # NOTE(review): in file order `a` and `b` are first assigned in the Thompson
 # sampling cell further down, so running this script top to bottom raises
 # NameError here; the cell only works with notebook out-of-order execution.
 print(a)
 print(b)


 # In[277]:


 # Thompson sampling.
 #
 # Each machine's payout rate gets a Beta(a, b) posterior, starting from the
 # uniform prior Beta(1, 1). On every trial one value is drawn from each
 # machine's posterior and the machine with the largest draw is played; its
 # posterior is then updated with the observed 0/1 reward. Machines with wide
 # (uncertain) posteriors can still win the draw, which provides exploration.
 #
 # Results left in the namespace:
 #   a, b         - Beta posterior parameters per machine
 #   total_reward - total reward collected
 #   regret       - per-trial expected regret against the true payout rates
 #   beta_post    - shape-sized array of samples from the final posteriors
 payoffs = [0.07, 0.11, 0.10, 0.08]
 n_trials = 10000
 n_machines = len(payoffs)
 shape = (n_machines, 200)

 m = 0
 k = 0
 a = np.ones(n_machines)
 b = np.ones(n_machines)
 m_prob = np.zeros(n_machines)
 thetam = np.zeros(n_machines)
 regret = np.zeros(n_trials)
 total_reward = 0

 for i in range(n_trials):
     payoffs[3] = .15 if i % 3 == 0 else .007  # add variability to payouts
     for k in range(n_machines):
         # Sample from the posterior (using the posterior mean
         # a[k] / (a[k] + b[k]) here instead would make the policy greedy).
         thetam[k] = np.random.beta(a[k], b[k])

     # select machine with highest sampled probability of payout
     m = int(np.argmax(thetam))
     # play machine - payout is a single Bernoulli draw with p = payoffs[m]
     reward = np.random.binomial(1, p=payoffs[m])
     # Expected regret of this pull, measured against the true payout rates
     # and indexed by the machine actually played.
     regret[i] = max(payoffs) - payoffs[m]
     # Conjugate update: (a, b) = (a, b) + (r, 1 - r)
     a[m] += reward
     b[m] += 1 - reward
     total_reward += reward

 # Draw the posterior samples once, from the final parameters, for plotting.
 beta_post = np.random.beta(a[:, None], b[:, None], size=shape)

 print(total_reward)
 print(a)
 print(b)


 # In[278]:


 # Visualise the posterior samples stored in beta_post (one row per machine).
 plt.figure(figsize=(14, 4))

 # Left panel: the raw samples of each machine, plotted in stored order.
 plt.subplot(121)
 for i, samples in enumerate(beta_post):
     plt.plot(samples, alpha=.4, label=i)
 plt.legend()  # the labels above are only shown once a legend is drawn

 # Right panel: kernel density estimate of each machine's samples.
 # sns.kdeplot replaces the deprecated sns.distplot(..., hist=False).
 plt.subplot(122)
 for i, samples in enumerate(beta_post):
     sns.kdeplot(samples, label=i)
 plt.legend()


 # In[210]:


 # Scratch cell: pick one stored sample at random from row k of beta_post.
 # k holds whatever value the most recently run cell left in it.
 np.random.choice(beta_post[k, :])


 # In[241]:


 # Scratch cell: display b (a bare expression only shows output in a notebook).
 b


 # In[242]:


 # Scratch cell: display a.
 a


 # In[211]:


 # Scratch cell: display all stored samples in row k of beta_post.
 beta_post[k, :]

	# coding: utf-8

	# In[150]:


	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns

	# Use seaborn's white grid theme for every figure in this script.
	sns.set_style("whitegrid")
	# get_ipython() is injected by IPython/Jupyter only. When this exported
	# notebook is run with plain `python` the name does not exist, so skip the
	# inline-plot magic instead of crashing with NameError.
	try:
	    get_ipython().run_line_magic('matplotlib', 'inline')
	except NameError:
	    pass


	# In[25]:


	# Four slot machines ("arms"). A pull on machine k pays out 1 with
	# probability payoffs[k] and 0 otherwise.
	payoffs = [0.07, 0.11, 0.22, 0.24]
	machines = list(range(4))


	# In[26]:


	# Baseline strategy: on every pull choose one of the four machines uniformly
	# at random and add its 0/1 payout to the running total.
	# (The loop body's indentation was lost in this copy; it is restored here.)
	payouts = []
	n_payouts_random = 0
	for i in range(10000):
	    machine = np.random.choice([0, 1, 2, 3])
	    # Bernoulli draw with the chosen machine's payout probability.
	    m = np.random.binomial(1, p=payoffs[machine])
	    n_payouts_random += m

	print(f"Sum of the reward is: {n_payouts_random}")


	# In[275]:


	# Epsilon-greedy bandit.
	#
	# Phase 1 (first n_learning pulls): choose a machine uniformly at random to
	# gather an initial payout estimate for every machine.
	# Phase 2: play the machine with the highest estimated payout rate, but with
	# probability e (2%) pull a random machine instead so estimates keep improving.
	#
	# Results left in the namespace:
	#   successes        - payouts observed per machine
	#   m_chosen         - number of pulls per machine
	#   m_prob           - estimated payout rate per machine (successes / m_chosen)
	#   n_payouts_greedy - total reward collected
	#   regret           - per-trial expected regret against the true payout rates
	n_learning = 2000
	payoffs = [0.07, 0.11, 0.10, 0.08]
	e = 0.02
	n_trials = 10000
	n_machines = len(payoffs)

	successes = np.zeros(n_machines)
	m_chosen = np.zeros(n_machines)
	m_prob = np.zeros(n_machines)
	n_payouts_greedy = 0
	t = 0  # number of trials played so far
	regret = np.zeros(n_trials)

	for i in range(n_trials):
	    t += 1
	    payoffs[3] = .15 if i % 3 == 0 else .007  # add variability to payouts
	    # Explore during the learning phase and, afterwards, with probability e;
	    # otherwise exploit the current best estimate.
	    if i < n_learning or np.random.uniform(0, 1) < e:
	        machine = np.random.choice(n_machines)
	    else:
	        machine = np.argmax(m_prob)
	    # Play the machine: payout is a single Bernoulli draw.
	    m = np.random.binomial(1, p=payoffs[machine])
	    # Count the pull before dividing so the estimate never does 0/0, and
	    # record every payout so successes / m_chosen stays unbiased.
	    m_chosen[machine] += 1
	    successes[machine] += m
	    m_prob[machine] = successes[machine] / m_chosen[machine]
	    # count total reward (sum of payouts)
	    n_payouts_greedy += m
	    # Expected regret of this pull: best true payout probability available
	    # on this trial minus that of the machine actually played.
	    regret[i] = max(payoffs) - payoffs[machine]

	print(f"Sum of the reward is: {n_payouts_greedy}")


	# In[248]:


	# Scratch cell: print the arrays `a` and `b`.
	# NOTE(review): in file order `a` and `b` are first assigned in the Thompson
	# sampling cell further down, so running this script top to bottom raises
	# NameError here; the cell only works with notebook out-of-order execution.
	print(a)
	print(b)


	# In[277]:


	# Thompson sampling.
	#
	# Each machine's payout rate gets a Beta(a, b) posterior, starting from the
	# uniform prior Beta(1, 1). On every trial one value is drawn from each
	# machine's posterior and the machine with the largest draw is played; its
	# posterior is then updated with the observed 0/1 reward. Machines with wide
	# (uncertain) posteriors can still win the draw, which provides exploration.
	#
	# Results left in the namespace:
	#   a, b         - Beta posterior parameters per machine
	#   total_reward - total reward collected
	#   regret       - per-trial expected regret against the true payout rates
	#   beta_post    - shape-sized array of samples from the final posteriors
	payoffs = [0.07, 0.11, 0.10, 0.08]
	n_trials = 10000
	n_machines = len(payoffs)
	shape = (n_machines, 200)

	m = 0
	k = 0
	a = np.ones(n_machines)
	b = np.ones(n_machines)
	m_prob = np.zeros(n_machines)
	thetam = np.zeros(n_machines)
	regret = np.zeros(n_trials)
	total_reward = 0

	for i in range(n_trials):
	    payoffs[3] = .15 if i % 3 == 0 else .007  # add variability to payouts
	    for k in range(n_machines):
	        # Sample from the posterior (using the posterior mean
	        # a[k] / (a[k] + b[k]) here instead would make the policy greedy).
	        thetam[k] = np.random.beta(a[k], b[k])

	    # select machine with highest sampled probability of payout
	    m = int(np.argmax(thetam))
	    # play machine - payout is a single Bernoulli draw with p = payoffs[m]
	    reward = np.random.binomial(1, p=payoffs[m])
	    # Expected regret of this pull, measured against the true payout rates
	    # and indexed by the machine actually played.
	    regret[i] = max(payoffs) - payoffs[m]
	    # Conjugate update: (a, b) = (a, b) + (r, 1 - r)
	    a[m] += reward
	    b[m] += 1 - reward
	    total_reward += reward

	# Draw the posterior samples once, from the final parameters, for plotting.
	beta_post = np.random.beta(a[:, None], b[:, None], size=shape)

	print(total_reward)
	print(a)
	print(b)


	# In[278]:


	# Visualise the posterior samples stored in beta_post (one row per machine).
	plt.figure(figsize=(14, 4))

	# Left panel: the raw samples of each machine, plotted in stored order.
	plt.subplot(121)
	for i, samples in enumerate(beta_post):
	    plt.plot(samples, alpha=.4, label=i)
	plt.legend()  # the labels above are only shown once a legend is drawn

	# Right panel: kernel density estimate of each machine's samples.
	# sns.kdeplot replaces the deprecated sns.distplot(..., hist=False).
	plt.subplot(122)
	for i, samples in enumerate(beta_post):
	    sns.kdeplot(samples, label=i)
	plt.legend()


	# In[210]:


	# Scratch cell: pick one stored sample at random from row k of beta_post.
	# k holds whatever value the most recently run cell left in it.
	np.random.choice(beta_post[k, :])


	# In[241]:


	# Scratch cell: display b (a bare expression only shows output in a notebook).
	b


	# In[242]:


	# Scratch cell: display a.
	a


	# In[211]:


	# Scratch cell: display all stored samples in row k of beta_post.
	beta_post[k, :]
No results found