A REINFORCE policy-gradient (PG) agent for Acrobot-v1
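In outline, the agent below collects a batch of trajectories with the current policy, computes discounted returns and a per-timestep baseline (the return at each timestep, averaged over the batch), and then takes an RMSProp ascent step on the REINFORCE surrogate. As a sketch in standard notation (the symbols are shorthand for quantities in the code, not notation from the script itself), differentiating the `loss` expression in the code yields the usual gradient estimate

\nabla_\theta J(\theta) \approx \frac{1}{N} \sum_{t=1}^{N} \nabla_\theta \log \pi_\theta(a_t \mid s_t)\,(R_t - b_t)

where N is the total number of timesteps in the batch, R_t is the discounted return produced by `discount`, and b_t is the baseline at timestep t.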
import numpy as np, os
os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64"
import theano, theano.tensor as T
import gym
from gym import wrappers
def discount(x, gamma):
    """
    Given vector x, compute a vector y such that
    y[i] = x[i] + gamma * x[i+1] + gamma^2 * x[i+2] + ...
    """
    assert x.ndim >= 1
    out = np.zeros(len(x), 'float64')
    out[-1] = x[-1]
    for i in reversed(xrange(len(x)-1)):
        out[i] = x[i] + gamma*out[i+1]
    # More efficient version:
    # scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
    return out
# Sample a random action index from the policy's probabilities
def categorical_sample(prob_n):
    """
    Sample from a categorical distribution specified by a vector of
    class probabilities (Boltzmann exploration, which works better
    here than epsilon-greedy).
    """
    prob_n = np.asarray(prob_n)
    csprob_n = np.cumsum(prob_n)
    return (csprob_n > np.random.rand()).argmax()
""" | |
def categorical_sample(prob_n): | |
"" " | |
Sample from categorical distribution, | |
specified by a vector of class probabilities | |
e-greedy | |
"" " | |
prob_n = np.asarray(prob_n) | |
#csprob_n = np.cumsum(prob_n) | |
#return (csprob_n > np.random.rand()).argmax() | |
if np.random.random_sample() > 0.1: | |
return np.argmax(prob_n) | |
else: | |
return np.random.randint(0,len(prob_n)) | |
""" | |
def get_traj(agent, env, episode_max_length, render=False):
    """
    Run the agent-environment loop for one whole episode (trajectory).
    Return a dictionary of results.
    """
    ob = env.reset()
    obs = []
    acts = []
    rews = []  # rewards
    for _ in xrange(episode_max_length):
        a = agent.act(ob)
        # Record the observation the action was chosen from, so that
        # observations and actions stay aligned in the gradient update
        obs.append(ob)
        acts.append(a)
        (ob, rew, done, _) = env.step(a)
        rews.append(rew)
        if done: break
        if render: env.render()
    # rewards, observations, actions
    return {"reward": np.array(rews),
            "ob": np.array(obs),
            "action": np.array(acts),
            }
def sgd_updates(grads, params, stepsize):
    """
    Create a list of parameter updates for stochastic gradient ascent
    """
    updates = []
    for (param, grad) in zip(params, grads):
        updates.append((param, param + stepsize*grad))
    return updates

def rmsprop_updates(grads, params, stepsize, rho=0.9, epsilon=1e-9):
    """
    Create a list of parameter updates for RMSProp (gradient ascent)
    """
    updates = []
    for param, grad in zip(params, grads):
        accum = theano.shared(np.zeros(param.get_value(borrow=True).shape, dtype=param.dtype))
        accum_new = rho*accum + (1 - rho)*grad**2
        param_new = param + stepsize*grad / T.sqrt(accum_new + epsilon)
        updates.append((accum, accum_new))
        updates.append((param, param_new))
    return updates
class REINFORCEAgent(object):
    """
    REINFORCE with a baseline.
    Currently only works for discrete action spaces.
    """
    def __init__(self, ob_space, action_space, **usercfg):
        """
        Initialize the agent's parameters
        """
        nO = ob_space.shape[0]
        nA = action_space.n
        # All algorithm parameters; you can override them by passing in keyword args
        self.config = dict(
            episode_max_length=500,
            timesteps_per_batch=10000,
            n_iter=6000,
            gamma=0.99,
            stepsize=3e-5,
            nhid=50  # number of hidden units
        )
        self.config.update(usercfg)
        # Symbolic variables for observation, action, and advantage.
        # These variables stack the results from many timesteps -- the first dimension is the timestep.
        ob_no = T.fmatrix()  # observations
        a_n = T.ivector()    # discrete actions, int32, shape (?,)
        adv_n = T.fvector()  # advantages, float32, shape (?,)
        def shared(arr):
            return theano.shared(arr.astype('float64'))
        # Create the weights of a neural network with one hidden layer
        W0 = shared(np.random.randn(nO, self.config['nhid'])/np.sqrt(nO))
        b0 = shared(np.zeros(self.config['nhid']))
        W1 = shared(1e-4*np.random.randn(self.config['nhid'], nA))
        b1 = shared(np.zeros(nA))
        params = [W0, b0, W1, b1]
        # Action probabilities
        prob_na = T.nnet.softmax(T.tanh(ob_no.dot(W0) + b0[None, :]).dot(W1) + b1[None, :])
        N = ob_no.shape[0]
        # Surrogate loss that we differentiate to get the policy gradient.
        # Note that we divide by the total number of timesteps.
        loss = T.log(prob_na[T.arange(N), a_n]).dot(adv_n) / N
        stepsize = T.fscalar()
        grads = T.grad(loss, params)
        # Perform parameter updates (plain SGD doesn't work well here)
        #updates = sgd_updates(grads, params, stepsize)
        updates = rmsprop_updates(grads, params, stepsize)
        self.pg_update = theano.function([ob_no, a_n, adv_n, stepsize], [], updates=updates, allow_input_downcast=True)
        self.compute_prob = theano.function([ob_no], prob_na, allow_input_downcast=True)

    def act(self, ob):
        """
        Choose an action by sampling from the policy's action probabilities.
        """
        prob = self.compute_prob(ob.reshape(1, -1))
        action = categorical_sample(prob)
        return action
    def learn(self, env):
        """
        Run the learning algorithm
        """
        cfg = self.config
        for iteration in xrange(cfg["n_iter"]):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajs = []
            timesteps_total = 0
            while timesteps_total < cfg["timesteps_per_batch"]:
                traj = get_traj(self, env, cfg["episode_max_length"])
                trajs.append(traj)
                timesteps_total += len(traj["reward"])
            all_ob = np.concatenate([traj["ob"] for traj in trajs])
            # Compute discounted sums of rewards
            rets = [discount(traj["reward"], cfg["gamma"]) for traj in trajs]
            maxlen = max(len(ret) for ret in rets)
            padded_rets = [np.concatenate([ret, np.zeros(maxlen - len(ret))]) for ret in rets]
            # Compute a time-dependent baseline (mean return at each timestep across episodes)
            baseline = np.mean(padded_rets, axis=0)
            # Compute the advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_adv = np.concatenate(advs)
            all_action = np.concatenate([traj["action"] for traj in trajs])
            # Do one policy gradient update step
            self.pg_update(all_ob, all_action, all_adv, cfg["stepsize"])
            eprews = np.array([traj["reward"].sum() for traj in trajs])  # episode total rewards
            eplens = np.array([len(traj["reward"]) for traj in trajs])   # episode lengths
            # Print stats
            print "---------------"
            print "Iteration: \t %i" % iteration
            print "NumTrajs: \t %i" % len(eprews)
            print "NumTimesteps: \t %i" % np.sum(eplens)
            print "MaxRew: \t %s" % eprews.max()
            print "MeanRew: \t %s +- %s" % (eprews.mean(), eprews.std()/np.sqrt(len(eprews)))
            print "MeanLen: \t %s +- %s" % (eplens.mean(), eplens.std()/np.sqrt(len(eplens)))
            print "---------------"
            #get_traj(self, env, cfg["episode_max_length"], render=True)
def main():
    env = gym.make("Acrobot-v1")
    # Record results
    env = wrappers.Monitor(env, "./tmp/Acrobot-v1-experiment-2")
    obs = env.observation_space
    acts = env.action_space
    ep_maxlen = env.spec.timestep_limit
    agent = REINFORCEAgent(obs, acts, episode_max_length=ep_maxlen)
    agent.learn(env)
    env.close()
    # Upload results and make a gist
    #gym.upload('./tmp/Acrobot-v1-experiment-1', writeup='https://gist.github.com/gdb/b6365e79be6052e7531e7ba6ea8caf23', api_key='sk_kczUyjeoSrCoQNen2TuUwA')
    #gym.upload('./tmp/Acrobot-v1-experiment-1', api_key='sk_kczUyjeoSrCoQNen2TuUwA')

if __name__ == "__main__":
    main()
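As a usage sketch (hypothetical settings, assuming the same gym/Theano setup as the script above), a much shorter run can be launched by overriding the defaults that `REINFORCEAgent` accepts through `**usercfg`:

# Hypothetical smoke test: tiny batch and few iterations, just to check that
# trajectory collection and the policy-gradient update run end to end.
import gym

env = gym.make("Acrobot-v1")
agent = REINFORCEAgent(env.observation_space, env.action_space,
                       n_iter=5,                  # assumed small value for a quick check
                       timesteps_per_batch=2000,  # assumed small batch
                       episode_max_length=500)
agent.learn(env)
env.close()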