pranavkantgaur · December 28, 2018 17:53
diff --git a/annealed_epsilon_greedy_agent_cartpole_agent.py b/annealed_epsilon_greedy_agent_cartpole_agent.py
 '''this code implements annealed epsilon greedy ml agent for CartPole problem '''
 import numpy as np
 import gym

 class myAnnealedEpsilonGreedyCartPoleAgent(object):
     """Epsilon-greedy CartPole agent whose exploration rate decays over time.

     With probability ``epsilon`` the agent samples a random action from the
     action space (explore); otherwise it returns the fixed action ``1``
     (exploit), a placeholder until a learned policy is plugged in.
     """

     def __init__(self, action_space):
         """Store the action space after checking that it is discrete.

         Args:
             action_space: the environment's action space; must be a
                 ``gym.spaces.discrete.Discrete`` instance.

         Raises:
             AssertionError: if ``action_space`` is not a discrete space.
         """
         # ``assert (a, b), msg`` tests a non-empty tuple, which is always
         # true, so it never rejected anything; isinstance() performs the
         # type check that was intended.
         assert isinstance(action_space, gym.spaces.discrete.Discrete), \
             "unsupported action space for now."
         self.action_space = action_space

     def act(self, state, reward, time_step, done, default_epsilon=0.98):
         """Choose an action for the current step.

         Args:
             state: current observation (not used yet).
             reward: reward from the previous step (not used yet).
             time_step: step index driving the decay; 0 yields
                 ``epsilon == default_epsilon``.
             done: terminal flag from the previous step (not used yet).
             default_epsilon: exploration probability at ``time_step`` 0.

         Returns:
             A sample from the action space when exploring, else ``1``.
         """
         # Inverse-time annealing: epsilon shrinks as 1 / (time_step + 1).
         current_epsilon = default_epsilon / (time_step + 1.0)
         if np.random.random() < current_epsilon:
             return self.action_space.sample()  # explore
         # exploit -- TODO: replace with a policy-gradient based agent model
         return 1

 if __name__ == '__main__':
     # Build the environment and the agent that acts on it.
     env = gym.make('CartPole-v0')
     agent = myAnnealedEpsilonGreedyCartPoleAgent(env.action_space)

     # task-specific configurations
     max_episodes = 500
     max_steps_per_episodes = 200

     # Exponentially weighted running average of the per-episode return.
     sum_reward_running = 0.0

     try:
         # range() and the parenthesised print below behave the same on
         # Python 2 and Python 3.
         for i in range(max_episodes):
             current_observation = env.reset()
             sum_rewards = 0.0
             last_reward = 0.0
             # Reset per episode so the first act() call does not receive
             # the previous episode's terminal flag.
             done = False
             for j in range(max_steps_per_episodes):
                 # The within-episode step index drives the epsilon decay,
                 # so exploration restarts at every episode.
                 action = agent.act(current_observation, last_reward, j, done)
                 env.render()
                 # the environment is acted upon
                 next_observation, current_reward, done, _ = env.step(action)
                 sum_rewards += current_reward
                 if done:
                     break
                 current_observation = next_observation
                 last_reward = current_reward
             sum_reward_running = 0.95 * sum_reward_running + 0.05 * sum_rewards

             # NOTE: %d truncates the running average to an integer.
             print('%d running reward: %d' % (i, sum_reward_running))
     finally:
         # Release environment resources (e.g. anything opened by render())
         # even when the loop is interrupted.
         env.close()
	'''this code implements annealed epsilon greedy ml agent for CartPole problem '''
	import numpy as np
	import gym

	class myAnnealedEpsilonGreedyCartPoleAgent(object):
	    """Epsilon-greedy CartPole agent with a decaying exploration rate.

	    The agent explores by sampling the action space with probability
	    ``epsilon`` and otherwise exploits by returning the fixed action
	    ``1``, which stands in for a future learned policy.
	    """

	    def __init__(self, action_space):
	        """Keep a reference to a discrete action space.

	        Args:
	            action_space: the environment's action space; must be a
	                ``gym.spaces.discrete.Discrete`` instance.

	        Raises:
	            AssertionError: if ``action_space`` is not a discrete space.
	        """
	        # Asserting a two-element tuple is always true; use isinstance()
	        # so unsupported spaces are actually rejected.
	        assert isinstance(action_space, gym.spaces.discrete.Discrete), \
	            "unsupported action space for now."
	        self.action_space = action_space

	    def act(self, state, reward, time_step, done, default_epsilon=0.98):
	        """Pick the next action using annealed epsilon-greedy selection.

	        Args:
	            state: current observation (not used yet).
	            reward: reward from the previous step (not used yet).
	            time_step: step index driving the decay; 0 yields
	                ``epsilon == default_epsilon``.
	            done: terminal flag from the previous step (not used yet).
	            default_epsilon: exploration probability at ``time_step`` 0.

	        Returns:
	            A sample from the action space when exploring, else ``1``.
	        """
	        # Epsilon decays as default_epsilon / (time_step + 1), i.e. an
	        # inverse-time schedule rather than a linear one.
	        current_epsilon = default_epsilon / (time_step + 1.0)
	        if np.random.random() < current_epsilon:
	            return self.action_space.sample()  # explore
	        # exploit -- TODO: replace with a policy-gradient based agent model
	        return 1

	if __name__ == '__main__':
	    # Create the CartPole environment and an agent bound to its actions.
	    env = gym.make('CartPole-v0')
	    agent = myAnnealedEpsilonGreedyCartPoleAgent(env.action_space)

	    # task-specific configurations
	    max_episodes = 500
	    max_steps_per_episodes = 200

	    # Exponentially weighted running average of the per-episode return.
	    sum_reward_running = 0.0

	    try:
	        # range() and the parenthesised print below behave the same on
	        # Python 2 and Python 3.
	        for i in range(max_episodes):
	            current_observation = env.reset()
	            sum_rewards = 0.0
	            last_reward = 0.0
	            # Start every episode with a cleared terminal flag.
	            done = False
	            for j in range(max_steps_per_episodes):
	                # j is the within-episode step, so the epsilon schedule
	                # restarts at the beginning of each episode.
	                action = agent.act(current_observation, last_reward, j, done)
	                env.render()
	                # the environment is acted upon
	                next_observation, current_reward, done, _ = env.step(action)
	                sum_rewards += current_reward
	                if done:
	                    break
	                current_observation = next_observation
	                last_reward = current_reward
	            sum_reward_running = 0.95 * sum_reward_running + 0.05 * sum_rewards

	            # NOTE: %d truncates the running average to an integer.
	            print('%d running reward: %d' % (i, sum_reward_running))
	    finally:
	        # Release environment resources (e.g. anything opened by
	        # render()) even when the loop is interrupted.
	        env.close()