Skip to content

Instantly share code, notes, and snippets.

@jangirrishabh
Last active July 13, 2018 13:24
Show Gist options
  • Save jangirrishabh/3cbe9d3eed4695cd8c0e460d58b7a914 to your computer and use it in GitHub Desktop.
Save jangirrishabh/3cbe9d3eed4695cd8c0e460d58b7a914 to your computer and use it in GitHub Desktop.
Snippet showing how demonstrations are used in the ddpg.py agent; written for a blog post and not executable on its own.
# Weight applied to the critic-driven policy terms (the -Q term and the
# action-L2 penalty of pi_loss_tf) in the behavior-cloning branches.
self.lambda1 = 0.001
# Weight applied to cloning_loss_tf when it is added to pi_loss_tf.
self.lambda2 = 0.0078
def _create_network(self, reuse=False):
    """Build the critic loss, the actor loss and the behavior-cloning loss.

    Sets three attributes:

    * ``self.Q_loss_tf`` -- mean squared error between the clipped
      one-step target ``r + gamma * Q_target(pi)`` and the main critic.
    * ``self.cloning_loss_tf`` -- summed squared difference between the
      actor output and the batch actions ``batch_tf['u']``.
    * ``self.pi_loss_tf`` -- the actor loss; includes the cloning loss
      (scaled by ``self.lambda2``) only when ``self.bc_loss == 1`` and
      ``self.q_filter`` is 0 or 1.

    NOTE(review): this is an excerpt. ``batch_tf`` is not defined in the
    visible code and ``reuse`` is not used here -- presumably both are
    handled in the part of the method that is not shown; confirm against
    the full ddpg.py.
    """
    # Row selector over the batch: False for the first
    # (batch_size - demo_batch_size) rows, True for the last demo_batch_size
    # rows. Assumes the demonstration transitions occupy the tail of the
    # batch -- TODO confirm against the sampler.
    # Built as bool because tf.boolean_mask documents its mask as boolean.
    mask = np.concatenate(
        (
            np.zeros(self.batch_size - self.demo_batch_size, dtype=bool),
            np.ones(self.demo_batch_size, dtype=bool),
        ),
        axis=0,
    )

    # Critic loss: (y - Q(critic))^2 with y = r + gamma * Q(pi), clipped.
    target_Q_pi_tf = self.target.Q_pi_tf
    clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf)
    target_tf = tf.clip_by_value(
        batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range)
    # stop_gradient keeps the target fixed while the critic is optimized.
    self.Q_loss_tf = tf.reduce_mean(
        tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))

    if self.bc_loss == 1 and self.q_filter in (0, 1):
        # Restrict both the actor output and the batch actions to the
        # masked (demonstration) rows.
        demo_pi = tf.boolean_mask(self.main.pi_tf, mask)
        demo_u = tf.boolean_mask(batch_tf['u'], mask)

        if self.q_filter == 1:
            # Q-filter: where is the demonstrator action better than the
            # actor action according to the critic? Keep only those rows.
            q_better = tf.reshape(
                tf.boolean_mask(self.main.Q_tf > self.main.Q_pi_tf, mask),
                [-1])
            demo_pi = tf.boolean_mask(demo_pi, q_better, axis=0)
            demo_u = tf.boolean_mask(demo_u, q_better, axis=0)

        self.cloning_loss_tf = tf.reduce_sum(tf.square(demo_pi - demo_u))

        # Actor loss: lambda1-weighted DDPG terms plus the lambda2-weighted
        # cloning term.
        self.pi_loss_tf = -self.lambda1 * tf.reduce_mean(self.main.Q_pi_tf)
        self.pi_loss_tf += self.lambda1 * self.action_l2 * tf.reduce_mean(
            tf.square(self.main.pi_tf / self.max_u))
        self.pi_loss_tf += self.lambda2 * self.cloning_loss_tf
    else:
        # Plain DDPG actor loss, no cloning term.
        self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
        self.pi_loss_tf += self.action_l2 * tf.reduce_mean(
            tf.square(self.main.pi_tf / self.max_u))
        # Not added to pi_loss_tf in this branch; computed over the whole
        # batch and marked by the original author as "random values".
        self.cloning_loss_tf = tf.reduce_sum(
            tf.square(self.main.pi_tf - batch_tf['u']))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment