@elumixor
Last active May 24, 2020 23:23
TRPO update 1
import torch

def update_agent(rollouts):
    states = torch.cat([r.states for r in rollouts], dim=0)
    actions = torch.cat([r.actions for r in rollouts], dim=0).flatten()

    # Estimate advantages per rollout, then concatenate into a single batch
    advantages = [estimate_advantages(states, next_states[-1], rewards)
                  for states, _, rewards, next_states in rollouts]
    advantages = torch.cat(advantages, dim=0).flatten()

    # Normalize advantages to reduce skewness and improve convergence
    advantages = (advantages - advantages.mean()) / advantages.std()

    update_critic(advantages)

    distribution = actor(states)
    # Important! We clamp the probabilities so they do not reach zero
    # (a zero probability would give log(0) = -inf later on)
    distribution = torch.distributions.utils.clamp_probs(distribution)
    probabilities = distribution[range(distribution.shape[0]), actions]
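
The helpers estimate_advantages and update_critic, as well as the actor network, are defined elsewhere in the accompanying tutorial and are not part of this snippet. Purely for illustration, here is a minimal sketch of what the advantage estimation could look like, assuming a critic value network and a discount factor gamma (both names are assumptions, not taken from this gist): it accumulates discounted returns backwards through the rollout, bootstrapping from the critic's value of the state that follows the rollout, and subtracts the critic's value estimates as a baseline.

import torch
import torch.nn as nn

# Hypothetical critic: a small value network; the input size of 4 is an
# assumption (e.g. CartPole observations), not something this gist specifies
critic = nn.Sequential(nn.Linear(4, 32), nn.ReLU(), nn.Linear(32, 1))

def estimate_advantages(states, last_state, rewards, gamma=0.99):
    values = critic(states).flatten()
    last_value = critic(last_state.unsqueeze(0)).flatten()
    # Walk the rollout backwards, accumulating discounted returns and
    # bootstrapping from the critic's estimate of the post-rollout state
    returns = torch.zeros_like(rewards)
    for i in reversed(range(len(rewards))):
        last_value = returns[i] = rewards[i] + gamma * last_value
    # Advantage = empirical discounted return minus the critic's baseline
    return returns - values

Normalizing the concatenated advantages to zero mean and unit variance, as update_agent does above, is a common variance-reduction trick: it does not change the expected direction of the policy update, but it keeps step sizes comparable across batches.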