# A policy-gradient (REINFORCE) algorithm for Acrobot-v1
import numpy as np, os
os.environ["THEANO_FLAGS"]="device=cpu,floatX=float64"
import theano, theano.tensor as T
import gym
from gym import wrappers
def discount(x, gamma):
    """
    Given vector x, compute a vector y such that
    y[i] = x[i] + gamma * x[i+1] + gamma^2 * x[i+2] + ...
    """
    assert x.ndim >= 1
    out = np.zeros(len(x), 'float64')
    out[-1] = x[-1]
    for i in reversed(xrange(len(x)-1)):
        out[i] = x[i] + gamma*out[i+1]
    # More efficient version:
    # scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
    return out
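# Quick sanity check (illustrative, not part of the original gist):
#     discount(np.array([1., 1., 1.]), 0.5)
#     -> [1 + 0.5*1 + 0.25*1, 1 + 0.5*1, 1] = [1.75, 1.5, 1.0]
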
# Sample a random action
def categorical_sample(prob_n):
    """
    Sample from a categorical distribution
    specified by a vector of class probabilities.
    Boltzmann (softmax) sampling -- explores better than epsilon-greedy here.
    """
    prob_n = np.asarray(prob_n)
    csprob_n = np.cumsum(prob_n)
    return (csprob_n > np.random.rand()).argmax()
"""
def categorical_sample(prob_n):
"" "
Sample from categorical distribution,
specified by a vector of class probabilities
e-greedy
"" "
prob_n = np.asarray(prob_n)
#csprob_n = np.cumsum(prob_n)
#return (csprob_n > np.random.rand()).argmax()
if np.random.random_sample() > 0.1:
return np.argmax(prob_n)
else:
return np.random.randint(0,len(prob_n))
"""
def get_traj(agent, env, episode_max_length, render=False):
    """
    Run the agent-environment loop for one whole episode (trajectory).
    Return a dictionary of results.
    """
    ob = env.reset()
    #pre_ob = ob
    obs = []
    acts = []
    #max_prob_acts = []
    rews = []  # rewards
    for _ in xrange(episode_max_length):
        #a, max_a = agent.act(ob)
        #subob = ob - pre_ob
        #a = agent.act(subob)
        #pre_ob = ob
        a = agent.act(ob)
        obs.append(ob)  # record the observation the action was chosen from
        (ob, rew, done, _) = env.step(a)
        rews.append(rew)
        acts.append(a)
        #max_prob_acts.append(max_a)
        if done: break
        if render: env.render()
    # reward, observation, action
    diction = {"reward": np.array(rews),
               "ob": np.array(obs),
               "action": np.array(acts),
               #"max_prob_action": np.array(max_prob_acts)
               }
    return diction
def sgd_updates(grads, params, stepsize):
    """
    Create a list of parameter updates for stochastic gradient ascent
    """
    updates = []
    for (param, grad) in zip(params, grads):
        updates.append((param, param + stepsize*grad))
    return updates
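# Note (added for clarity): the update adds stepsize*grad, i.e. gradient *ascent*,
# because the surrogate `loss` defined in REINFORCEAgent below is an objective to
# maximize, not a cost to minimize. rmsprop_updates below follows the same sign
# convention.
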
def rmsprop_updates(grads, params, stepsize, rho=0.9, epsilon=1e-9):
    """
    Create a list of parameter updates for RMSProp
    """
    updates = []
    for param, grad in zip(params, grads):
        accum = theano.shared(np.zeros(param.get_value(borrow=True).shape, dtype=param.dtype))
        accum_new = rho * accum + (1 - rho) * grad**2
        param_new = param + (stepsize * grad / T.sqrt(accum_new + epsilon))
        updates.append((accum, accum_new))
        updates.append((param, param_new))
    return updates
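# Restating the RMSProp rule above in equation form (added for readability):
#     accum <- rho * accum + (1 - rho) * grad^2
#     param <- param + stepsize * grad / sqrt(accum + epsilon)
# Each parameter keeps its own running average of squared gradients, so the
# effective step size adapts per coordinate.
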
class REINFORCEAgent(object):
    """
    REINFORCE with a baseline.
    Currently only works for discrete action spaces.
    """
    def __init__(self, ob_space, action_space, **usercfg):
        """
        Initialize the agent's parameters
        """
        nO = ob_space.shape[0]
        nA = action_space.n
        # Here are all the algorithm parameters.
        # You can modify them by passing in keyword args.
        self.config = dict(
            episode_max_length=500,
            timesteps_per_batch=10000,
            n_iter=6000,
            gamma=0.99,
            stepsize=3e-5,
            nhid=50  # number of hidden-layer units
        )
        self.config.update(usercfg)
        # Symbolic variables for observation, action, and advantage.
        # These variables stack the results from many timesteps -- the first dimension is the timestep.
        ob_no = T.fmatrix()   # observations
        a_n = T.ivector()     # discrete actions, int32, shape (?,)
        adv_n = T.fvector()   # advantages, float32, shape (?,)
        def shared(arr):
            return theano.shared(arr.astype('float64'))
        # Create the weights of a neural network with one hidden layer
        W0 = shared(np.random.randn(nO, self.config['nhid']) / np.sqrt(nO))
        b0 = shared(np.zeros(self.config['nhid']))
        W1 = shared(1e-4 * np.random.randn(self.config['nhid'], nA))
        b1 = shared(np.zeros(nA))
        params = [W0, b0, W1, b1]
        # Action probabilities
        prob_na = T.nnet.softmax(T.tanh(ob_no.dot(W0) + b0[None, :]).dot(W1) + b1[None, :])
        N = ob_no.shape[0]
        # Surrogate loss function that we differentiate to get the policy gradient.
        # Note that we divide by the total number of timesteps.
        loss = T.log(prob_na[T.arange(N), a_n]).dot(adv_n) / N
        stepsize = T.fscalar()
        grads = T.grad(loss, params)
        # Perform parameter updates.
        # I find that plain SGD doesn't work well here, so RMSProp is used instead.
        # updates = sgd_updates(grads, params, stepsize)
        updates = rmsprop_updates(grads, params, stepsize)
        self.pg_update = theano.function([ob_no, a_n, adv_n, stepsize], [], updates=updates, allow_input_downcast=True)
        self.compute_prob = theano.function([ob_no], prob_na, allow_input_downcast=True)
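        # For reference (added comment): the surrogate objective above is
        #     loss = (1/N) * sum_t log pi(a_t | s_t) * adv_t
        # so T.grad(loss, params) gives the REINFORCE policy-gradient estimate,
        # with the advantage standing in for the raw return.
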
    def act(self, ob):
        """
        Choose an action.
        """
        prob = self.compute_prob(ob.reshape(1, -1))
        #max_prob_a = np.argmax(prob)
        action = categorical_sample(prob)
        #return action, max_prob_a
        return action
    def learn(self, env):
        """
        Run the learning algorithm
        """
        cfg = self.config
        for iteration in xrange(cfg["n_iter"]):
            # Collect trajectories until we reach timesteps_per_batch total timesteps
            trajs = []
            timesteps_total = 0
            while timesteps_total < cfg["timesteps_per_batch"]:
                traj = get_traj(self, env, cfg["episode_max_length"])
                trajs.append(traj)
                timesteps_total += len(traj["reward"])
            all_ob = np.concatenate([traj["ob"] for traj in trajs])
            # Compute discounted sums of rewards
            rets = [discount(traj["reward"], cfg["gamma"]) for traj in trajs]
            maxlen = max(len(ret) for ret in rets)
            padded_rets = [np.concatenate([ret, np.zeros(maxlen - len(ret))]) for ret in rets]
            # Compute baseline
            baseline = np.mean(padded_rets, axis=0)
            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_adv = np.concatenate(advs)
            all_action = np.concatenate([traj["action"] for traj in trajs])
            # Do the policy gradient update step
            self.pg_update(all_ob, all_action, all_adv, cfg["stepsize"])
            # Episode total rewards
            eprews = np.array([traj["reward"].sum() for traj in trajs])
            # Episode lengths
            eplens = np.array([len(traj["reward"]) for traj in trajs])
            # Print stats
            print "---------------"
            print "Iteration: \t %i" % iteration
            print "NumTrajs: \t %i" % len(eprews)
            print "NumTimesteps: \t %i" % np.sum(eplens)
            print "MaxRew: \t %s" % eprews.max()
            print "MeanRew: \t %s +- %s" % (eprews.mean(), eprews.std() / np.sqrt(len(eprews)))
            print "MeanLen: \t %s +- %s" % (eplens.mean(), eplens.std() / np.sqrt(len(eplens)))
            print "---------------"
            #get_traj(self, env, cfg["episode_max_length"], render=True)
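    # Note on the baseline (added for clarity): padded_rets stacks the per-episode
    # discounted returns into a zero-padded (num_trajs, maxlen) array, so
    # baseline[t] is the batch average of the return from timestep t onward
    # (with zeros for episodes shorter than t). Subtracting it from each return
    # reduces the variance of the policy-gradient estimate.
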
def main():
    env = gym.make("Acrobot-v1")
    # Record results
    env = wrappers.Monitor(env, "./tmp/Acrobot-v1-experiment-2")
    obs = env.observation_space
    acts = env.action_space
    ep_maxlen = env.spec.timestep_limit
    agent = REINFORCEAgent(obs, acts, episode_max_length=ep_maxlen)
    agent.learn(env)
    # Upload results and make a gist
    env.close()
    #gym.upload('./tmp/Acrobot-v1-experiment-1', writeup='https://gist.github.com/gdb/b6365e79be6052e7531e7ba6ea8caf23', api_key='sk_kczUyjeoSrCoQNen2TuUwA')
    #gym.upload('./tmp/Acrobot-v1-experiment-1', api_key='sk_kczUyjeoSrCoQNen2TuUwA')

if __name__ == "__main__":
    main()