Demo code for policy gradient question
'''
Demo code for my question here:
https://www.reddit.com/r/reinforcementlearning/comments/7pxvh1/help_with_a_policy_gradient_detail/

Run with:

    python3 demo.py --verbose --episodes 100000

Notice that in the theta vector printed out periodically, all elements
but the last stay very close to zero.

Updates to this demo:
https://gist.github.com/redsymbol/dc6b57230c9a1d9148e6c56569751cf1
'''
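
# The code below is intended as plain (no-baseline) REINFORCE with a
# linear-softmax policy over state-action features x(s, a):
#
#   h(s, a, theta)    = theta . x(s, a)
#   pi(a | s, theta)  = exp(h(s, a, theta)) / sum_b exp(h(s, b, theta))
#   theta            <- theta + ALPHA * GAMMA^t * G_t * grad ln pi(A_t | S_t, theta)
#
# following the policy-gradient chapter of Sutton & Barto. Here x(s, a) is
# the CartPole observation with the action index appended.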

import argparse

import gym
import numpy as np

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--verbose', default=False, action='store_true')
    parser.add_argument('--episodes', default=20, type=int)
    parser.add_argument('--render', default=False, action='store_true')
    return parser.parse_args()
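
# Softmax over a vector of action preferences. Note: np.exp can overflow
# for large preference values; a common safeguard (not needed for this
# small demo) is to subtract np.max(vals) before exponentiating.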
def softmax(val, vals):
    return np.exp(val) / np.sum(np.exp(vals))

class ReinforceTask:
    ALPHA = 10**-2
    GAMMA = .95

    def __init__(self, env, verbose=False):
        self.env = env
        self.verbose = verbose
        self.actions = np.array(range(self.env.action_space.n))
        self.feature_size = 5
        self.theta = np.zeros(self.feature_size)
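
    # State-action feature x(s, a): the 4 raw CartPole observation
    # components with the action index appended as a 5th component.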
    def feature(self, obs, action):
        return np.array(list(obs) + [action])

    def h(self, obs, action):
        return np.dot(self.theta, self.feature(obs, action))

    def pi(self, obs, action):
        return softmax(
            self.h(obs, action),
            np.array([self.h(obs, _action) for _action in self.actions]))
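
    # Gradient of the log-policy for a linear-softmax parameterization
    # (the "eligibility vector" in Sutton & Barto's REINFORCE chapter):
    #   grad ln pi(a | s, theta) = x(s, a) - sum_b pi(b | s, theta) * x(s, b)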
    def grad_ln_pi(self, obs, action):
        expected_feature = sum(
            self.pi(obs, _action) * self.feature(obs, _action)
            for _action in self.actions)
        return self.feature(obs, action) - expected_feature
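
    # Sample an action from the current policy distribution pi(. | obs, theta).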
    def choose_action(self, obs):
        probs = [self.pi(obs, action) for action in self.actions]
        return np.random.choice(self.actions, p=probs)

    def run(self, episode_count, render=True):
        for episode_num in range(episode_count):
            total_reward = self.run_episode(render)
            if self.verbose:
                print(f'Episode {episode_num}: Reward={total_reward}, Theta={list(self.theta)}')
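
    # One episode of REINFORCE: roll out the current policy, compute the
    # discounted return G_t for every time step (working backwards from the
    # end of the episode), then apply the policy-gradient update at each step.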
    def run_episode(self, render):
        '''
        Run a single episode and apply the REINFORCE update.
        Returns the episode's total (undiscounted) reward.
        '''
        ALPHA = self.ALPHA
        GAMMA = self.GAMMA
        # Generate an episode by following the current policy.
        done = False
        obs = self.env.reset()
        actions = np.array([], dtype='i4')
        rewards = np.array([0])  # leading dummy 0 so rewards[t+1] is the reward for actions[t]
        states = [obs]
        while not done:
            if render:
                self.env.render()
            action = self.choose_action(obs)
            actions = np.append(actions, action)
            obs, reward, done, info = self.env.step(action)
            states.append(obs)
            rewards = np.append(rewards, reward)
        # Compute discounted returns, backwards from the end of the episode:
        #   G_T = 0, G_t = rewards[t+1] + GAMMA * G_{t+1}
        G = np.zeros(len(rewards))
        for i in range(len(G) - 1, 0, -1):
            G[i-1] = GAMMA * G[i] + rewards[i]
        # REINFORCE update for each time step:
        #   theta <- theta + ALPHA * GAMMA^t * G_t * grad ln pi(A_t | S_t, theta)
        for t in range(len(G) - 1):
            gain = G[t]
            action = actions[t]
            state = states[t]
            self.theta = self.theta + ALPHA * (GAMMA ** t) * gain * self.grad_ln_pi(state, action)
        return np.sum(rewards)

if __name__ == '__main__':
    args = get_args()
    env = gym.make('CartPole-v0')
    task = ReinforceTask(env, args.verbose)
    task.run(args.episodes, render=args.render)