Created
April 7, 2013 07:05
-
-
Save doobwa/5329387 to your computer and use it in GitHub Desktop.
Quick example of multi-armed bandits.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Multi-armed bandit example
# Each arm has some (unknown) probability of reward = 1.
# Using Beta priors we can update the posterior after each pull.
# The following uses Thompson sampling: sample a success probability
# for each arm from its posterior and pull the arm with the largest
# sampled value.
set.seed(2)

n_arms <- 3
probs <- rbeta(n_arms, 1, 1)  # true (hidden) reward probability per arm

# Beta(alpha, beta) posterior parameters, initialized to the uniform
# Beta(1, 1) prior for every arm.
alphas <- betas <- rep(1, n_arms)

n_pulls <- 1000  # was `T`, which shadows the TRUE shorthand
action <- rep(0L, n_pulls)
reward <- rep(0L, n_pulls)

for (t in seq_len(n_pulls)) {
  # Sample one success probability per arm from its current posterior
  p <- vapply(
    seq_len(n_arms),
    function(j) rbeta(1, alphas[j], betas[j]),
    numeric(1)
  )
  # Pull the arm with the largest sampled probability
  action[t] <- which.max(p)
  reward[t] <- as.integer(runif(1) < probs[action[t]])
  # Conjugate Bernoulli-Beta update: alpha counts successes,
  # beta counts failures. (The original indexed an undefined vector
  # `a` and always incremented beta by 1, so every successful pull
  # was also counted as a failure.)
  alphas[action[t]] <- alphas[action[t]] + reward[t]
  betas[action[t]] <- betas[action[t]] + (1 - reward[t])
}

# Plot cumulative regret: the gap between the best arm's true
# probability and the probability of the arm actually chosen.
best <- max(probs)
chosen <- probs[action]
plot(cumsum(best - chosen), type = "l",
     xlab = "Pull", ylab = "Cumulative regret")
table(action)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment