Demo code for policy gradient question
'''
Demo code for my question here:
https://www.reddit.com/r/reinforcementlearning/comments/7pxvh1/help_with_a_policy_gradient_detail/

Run with:

    python3 demo.py --verbose --episodes 100000

Notice that in the theta vector that is periodically printed out, all
elements but the last stay very close to zero.

Updates to this demo:
https://gist.github.com/redsymbol/dc6b57230c9a1d9148e6c56569751cf1
'''
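
# Background note (added; not part of the original gist): grad_ln_pi() below
# implements the usual softmax-policy identity
#
#     grad_theta ln pi(a|s) = x(s, a) - sum_b pi(b|s) * x(s, b)
#
# written out for CartPole's two actions. Because the feature vector used
# here is x(s, a) = [obs..., action], the observation components of x(s, a)
# are identical for both actions and cancel exactly in that difference,
# which is consistent with the behaviour described above (only the last
# component of theta receives a nonzero update).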
import argparse
import random
from collections import namedtuple, defaultdict
import gym
import numpy as np

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--verbose', default=False, action='store_true')
    parser.add_argument('--episodes', default=20, type=int)
    parser.add_argument('--render', default=False, action='store_true')
    return parser.parse_args()

def softmax(val, vals):
    return np.exp(val) / np.sum(np.exp(vals))
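
# Note (added; not part of the original gist): a numerically stabilised
# variant of softmax() above. It is equivalent in exact arithmetic but avoids
# overflow in np.exp() if the action preferences ever grow large; the demo
# itself does not call it.
def stable_softmax(val, vals):
    shift = np.max(vals)
    return np.exp(val - shift) / np.sum(np.exp(np.array(vals) - shift))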

class ReinforceTask:
    ALPHA = 10**-2
    GAMMA = .95

    def __init__(self, env, verbose=False):
        self.env = env
        self.verbose = verbose
        self.actions = np.array(range(self.env.action_space.n))
        self.feature_size = 5  # 4 CartPole observation values + 1 action
        self.theta = np.zeros(self.feature_size)
    def feature(self, obs, action):
        return np.array(list(obs) + [action])

    def h(self, obs, action):
        # linear action preference
        return np.dot(self.theta, self.feature(obs, action))

    def pi(self, obs, action):
        return softmax(
            self.h(obs, action),
            np.array([self.h(obs, _action) for _action in self.actions]))

    def grad_ln_pi(self, obs, action):
        # x(s, a) - sum_b pi(b|s) * x(s, b), written out for the two actions
        return self.feature(obs, action) - \
            (self.pi(obs, 1) * self.feature(obs, 1) +
             self.pi(obs, 0) * self.feature(obs, 0))

    def choose_action(self, obs):
        probs = [self.pi(obs, action) for action in self.actions]
        return np.random.choice(self.actions, p=probs)
    def run(self, episode_count, render=True):
        for episode_num in range(episode_count):
            total_reward = self.run_episode(render)
            if self.verbose:
                print(f'Episode {episode_num}: Reward={total_reward}, Theta={list(self.theta)}')
    def run_episode(self, render):
        '''
        returns total reward
        '''
        ALPHA = self.ALPHA
        GAMMA = self.GAMMA
        # generate episode
        done = False
        obs = self.env.reset()
        actions = np.array([], dtype='i4')
        rewards = np.array([0])
        states = [obs]
        while not done:
            if render:
                self.env.render()
            action = self.choose_action(obs)
            actions = np.append(actions, action)
            obs, reward, done, info = self.env.step(action)
            states.append(obs)
            rewards = np.append(rewards, reward)
        # calc gainz: discounted return G[t] for each time step
        G = np.zeros(len(rewards))
        for i in range(len(G)-1, 0, -1):
            G[i-1] = GAMMA * G[i] + rewards[i]
        # adjust theta with the REINFORCE update, scaled by the return G[t]
        # (previously the computed `gain` was never applied)
        for t in range(0, len(G) - 1):
            gain = G[t]
            action = actions[t]
            state = states[t]
            self.theta = self.theta + ALPHA * (GAMMA ** t) * gain * self.grad_ln_pi(state, action)
        return np.sum(rewards)
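
# --- Optional sanity check (added; not part of the original gist) -----------
# A minimal finite-difference check of ReinforceTask.grad_ln_pi, assuming that
# comparing against a numerical gradient of ln pi(a|s) is an acceptable way to
# validate the analytic expression. The helper name `check_grad_ln_pi` is
# made up for this sketch, e.g. check_grad_ln_pi(task, task.env.reset(), 1).
def check_grad_ln_pi(task, obs, action, eps=1e-6):
    analytic = task.grad_ln_pi(obs, action)
    numeric = np.zeros_like(task.theta)
    for i in range(len(task.theta)):
        saved = task.theta[i]
        task.theta[i] = saved + eps
        upper = np.log(task.pi(obs, action))
        task.theta[i] = saved - eps
        lower = np.log(task.pi(obs, action))
        task.theta[i] = saved
        numeric[i] = (upper - lower) / (2 * eps)
    return np.allclose(analytic, numeric, atol=1e-4)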

if __name__ == '__main__':
    args = get_args()
    env = gym.make('CartPole-v0')
    task = ReinforceTask(env, args.verbose)
    task.run(args.episodes, render=args.render)