A REINFORCE policy-gradient (PG) agent for Acrobot-v1
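In outline, the agent below collects a batch of trajectories with the current policy, computes discounted returns and a per-timestep baseline (the return at each timestep, averaged over the batch), and then takes an RMSProp ascent step on the REINFORCE surrogate. As a sketch in standard notation (the symbols are shorthand for quantities in the code, not notation from the script itself), differentiating the `loss` expression in the code yields the usual gradient estimate

\nabla_\theta J(\theta) \approx \frac{1}{N} \sum_{t=1}^{N} \nabla_\theta \log \pi_\theta(a_t \mid s_t)\,(R_t - b_t)

where N is the total number of timesteps in the batch, R_t is the discounted return produced by `discount`, and b_t is the baseline at timestep t.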
import numpy as np, os
os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64"
import theano, theano.tensor as T
import gym
from gym import wrappers
def discount(x, gamma):
    """
    Given vector x, compute a vector y such that
    y[i] = x[i] + gamma * x[i+1] + gamma^2 * x[i+2] + ...
    """
    assert x.ndim >= 1
    out = np.zeros(len(x), 'float64')
    out[-1] = x[-1]
    for i in reversed(xrange(len(x)-1)):
        out[i] = x[i] + gamma*out[i+1]
    # More efficient version:
    # scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
    return out
# Sample a random action index from the policy's probabilities
def categorical_sample(prob_n):
    """
    Sample from a categorical distribution specified by a vector of
    class probabilities (Boltzmann exploration, which works better
    here than epsilon-greedy).
    """
    prob_n = np.asarray(prob_n)
    csprob_n = np.cumsum(prob_n)
    return (csprob_n > np.random.rand()).argmax()
""" | |
def categorical_sample(prob_n): | |
"" " | |
Sample from categorical distribution, | |
specified by a vector of class probabilities | |
e-greedy | |
"" " | |
prob_n = np.asarray(prob_n) | |
#csprob_n = np.cumsum(prob_n) | |
#return (csprob_n > np.random.rand()).argmax() | |
if np.random.random_sample() > 0.1: | |
return np.argmax(prob_n) | |
else: | |
return np.random.randint(0,len(prob_n)) | |
""" | |
def get_traj(agent, env, episode_max_length, render=False):
    """
    Run the agent-environment loop for one whole episode (trajectory).
    Return a dictionary of results.
    """
    ob = env.reset()
    obs = []
    acts = []
    rews = []  # rewards
    for _ in xrange(episode_max_length):
        a = agent.act(ob)
        # Record the observation the action was chosen from, so that
        # observations and actions stay aligned in the gradient update
        obs.append(ob)
        acts.append(a)
        (ob, rew, done, _) = env.step(a)
        rews.append(rew)
        if done: break
        if render: env.render()
    # rewards, observations, actions
    return {"reward": np.array(rews),
            "ob": np.array(obs),
            "action": np.array(acts),
            }
def sgd_updates(grads, params, stepsize):
    """
    Create a list of parameter updates for stochastic gradient ascent
    """
    updates = []
    for (param, grad) in zip(params, grads):
        updates.append((param, param + stepsize*grad))
    return updates

def rmsprop_updates(grads, params, stepsize, rho=0.9, epsilon=1e-9):
    """
    Create a list of parameter updates for RMSProp (gradient ascent)
    """
    updates = []
    for param, grad in zip(params, grads):
        accum = theano.shared(np.zeros(param.get_value(borrow=True).shape, dtype=param.dtype))
        accum_new = rho*accum + (1 - rho)*grad**2
        param_new = param + stepsize*grad / T.sqrt(accum_new + epsilon)
        updates.append((accum, accum_new))
        updates.append((param, param_new))
    return updates
class REINFORCEAgent(object):
    """
    REINFORCE with a baseline.
    Currently only works for discrete action spaces.
    """
    def __init__(self, ob_space, action_space, **usercfg):
        """
        Initialize the agent's parameters
        """
        nO = ob_space.shape[0]
        nA = action_space.n
        # All algorithm parameters; you can override them by passing in keyword args
        self.config = dict(
            episode_max_length=500,
            timesteps_per_batch=10000,
            n_iter=6000,
            gamma=0.99,
            stepsize=3e-5,
            nhid=50  # number of hidden units
        )
        self.config.update(usercfg)
        # Symbolic variables for observation, action, and advantage.
        # These variables stack the results from many timesteps -- the first dimension is the timestep.
        ob_no = T.fmatrix()  # observations
        a_n = T.ivector()    # discrete actions, int32, shape (?,)
        adv_n = T.fvector()  # advantages, float32, shape (?,)
        def shared(arr):
            return theano.shared(arr.astype('float64'))
        # Create the weights of a neural network with one hidden layer
        W0 = shared(np.random.randn(nO, self.config['nhid'])/np.sqrt(nO))
        b0 = shared(np.zeros(self.config['nhid']))
        W1 = shared(1e-4*np.random.randn(self.config['nhid'], nA))
        b1 = shared(np.zeros(nA))
        params = [W0, b0, W1, b1]
        # Action probabilities
        prob_na = T.nnet.softmax(T.tanh(ob_no.dot(W0) + b0[None, :]).dot(W1) + b1[None, :])
        N = ob_no.shape[0]
        # Surrogate loss that we differentiate to get the policy gradient.
        # Note that we divide by the total number of timesteps.
        loss = T.log(prob_na[T.arange(N), a_n]).dot(adv_n) / N
        stepsize = T.fscalar()
        grads = T.grad(loss, params)
        # Perform parameter updates (plain SGD doesn't work well here)
        #updates = sgd_updates(grads, params, stepsize)
        updates = rmsprop_updates(grads, params, stepsize)
        self.pg_update = theano.function([ob_no, a_n, adv_n, stepsize], [], updates=updates, allow_input_downcast=True)
        self.compute_prob = theano.function([ob_no], prob_na, allow_input_downcast=True)

    def act(self, ob):
        """
        Choose an action by sampling from the policy's action probabilities.
        """
        prob = self.compute_prob(ob.reshape(1, -1))
        action = categorical_sample(prob)
        return action
    def learn(self, env):
        """
        Run the learning algorithm
        """
        cfg = self.config
        for iteration in xrange(cfg["n_iter"]):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajs = []
            timesteps_total = 0
            while timesteps_total < cfg["timesteps_per_batch"]:
                traj = get_traj(self, env, cfg["episode_max_length"])
                trajs.append(traj)
                timesteps_total += len(traj["reward"])
            all_ob = np.concatenate([traj["ob"] for traj in trajs])
            # Compute discounted sums of rewards
            rets = [discount(traj["reward"], cfg["gamma"]) for traj in trajs]
            maxlen = max(len(ret) for ret in rets)
            padded_rets = [np.concatenate([ret, np.zeros(maxlen - len(ret))]) for ret in rets]
            # Compute a time-dependent baseline (mean return at each timestep across episodes)
            baseline = np.mean(padded_rets, axis=0)
            # Compute the advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_adv = np.concatenate(advs)
            all_action = np.concatenate([traj["action"] for traj in trajs])
            # Do one policy gradient update step
            self.pg_update(all_ob, all_action, all_adv, cfg["stepsize"])
            eprews = np.array([traj["reward"].sum() for traj in trajs])  # episode total rewards
            eplens = np.array([len(traj["reward"]) for traj in trajs])   # episode lengths
            # Print stats
            print "---------------"
            print "Iteration: \t %i" % iteration
            print "NumTrajs: \t %i" % len(eprews)
            print "NumTimesteps: \t %i" % np.sum(eplens)
            print "MaxRew: \t %s" % eprews.max()
            print "MeanRew: \t %s +- %s" % (eprews.mean(), eprews.std()/np.sqrt(len(eprews)))
            print "MeanLen: \t %s +- %s" % (eplens.mean(), eplens.std()/np.sqrt(len(eplens)))
            print "---------------"
            #get_traj(self, env, cfg["episode_max_length"], render=True)
def main():
    env = gym.make("Acrobot-v1")
    # Record results
    env = wrappers.Monitor(env, "./tmp/Acrobot-v1-experiment-2")
    obs = env.observation_space
    acts = env.action_space
    ep_maxlen = env.spec.timestep_limit
    agent = REINFORCEAgent(obs, acts, episode_max_length=ep_maxlen)
    agent.learn(env)
    env.close()
    # Upload results and make a gist
    #gym.upload('./tmp/Acrobot-v1-experiment-1', writeup='https://gist.github.com/gdb/b6365e79be6052e7531e7ba6ea8caf23', api_key='sk_kczUyjeoSrCoQNen2TuUwA')
    #gym.upload('./tmp/Acrobot-v1-experiment-1', api_key='sk_kczUyjeoSrCoQNen2TuUwA')

if __name__ == "__main__":
    main()
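As a usage sketch (hypothetical settings, assuming the same gym/Theano setup as the script above), a much shorter run can be launched by overriding the defaults that `REINFORCEAgent` accepts through `**usercfg`:

# Hypothetical smoke test: tiny batch and few iterations, just to check that
# trajectory collection and the policy-gradient update run end to end.
import gym

env = gym.make("Acrobot-v1")
agent = REINFORCEAgent(env.observation_space, env.action_space,
                       n_iter=5,                  # assumed small value for a quick check
                       timesteps_per_batch=2000,  # assumed small batch
                       episode_max_length=500)
agent.learn(env)
env.close()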