Applicability: Can be used with some work
git: https://github.com/kengz/SLM-Lab
docs: https://kengz.gitbooks.io/slm-lab/content/
tutorial: https://medium.com/@kengz/pip-module-for-rl-agents-in-slm-lab-50e73872445d
Commentaries:
- Can be used as a Python module, as shown in the tutorial above.
- Everything is configured through a JSON spec file (a rough sketch of the spec structure follows the example below).
- PyTorch
Example:
import os
# NOTE increase if needed. Pytorch thread overusage https://github.com/pytorch/pytorch/issues/975
os.environ['OMP_NUM_THREADS'] = '1'
from slm_lab.agent import Agent
from slm_lab.env import OpenAIEnv
from slm_lab.experiment import analysis
from slm_lab.experiment.monitor import Body, InfoSpace, enable_aeb_space
from slm_lab.lib import logger, util
from slm_lab.spec import spec_util
class Session:
'''The class which initializes the agent, environment, and runs them.'''
def __init__(self, spec, info_space):
self.spec = spec
self.info_space = info_space
self.index = self.info_space.get('session')
# init singleton agent and env
self.env = OpenAIEnv(self.spec)
body = Body(self.env, self.spec['agent'])
self.agent = Agent(self.spec, self.info_space, body=body)
enable_aeb_space(self) # to use lab's data analysis framework
logger.info(f'Initialized session {self.index}')
def run_episode(self):
self.env.clock.tick('epi')
reward, state, done = self.env.reset()
self.agent.reset(state)
while not done:
self.env.clock.tick('t')
action = self.agent.act(state)
reward, state, done = self.env.step(action)
self.agent.update(action, reward, state, done)
self.agent.body.log_summary()
def close(self):
self.agent.close()
self.env.close()
logger.info('Session done and closed.')
def run(self):
while self.env.clock.get('epi') <= self.env.max_episode:
self.run_episode()
self.data = analysis.analyze_session(self) # session fitness
self.close()
return self.data
# To use SLM-Lab's existing spec. Alternatively, you can write one yourself too
spec = spec_util.get(spec_file='ppo.json', spec_name='ppo_mlp_shared_cartpole')
info_space = InfoSpace()
# set proper env variables for the lab
os.environ['PREPATH'] = util.get_prepath(spec, info_space)
os.environ['lab_mode'] = 'dev' # set to 'train' to run at full speed
# inspect the agent spec; edit if you wish to
print(spec['agent'])
# edit the env spec to run for less episodes
spec['env'][0]['max_episode'] = 100
# initialize and run session
sess = Session(spec, info_space)
data = sess.run()
print(f'Data is available at {util.smart_path(os.environ["PREPATH"])}')
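For reference, the .json spec is just a nested mapping; the sketch below (written as a Python dict) shows the rough shape implied by the code above. Only the agent/env keys and max_episode actually appear in the example; the other field names are illustrative assumptions, so consult the spec files shipped with SLM-Lab for the exact schema.
# Rough shape of an SLM-Lab spec, inferred from the example above.
# Field names inside "algorithm"/"net"/"body"/"meta" are assumptions, not the authoritative schema.
spec_sketch = {
    "agent": [{
        "name": "PPO",
        "algorithm": {"name": "PPO", "gamma": 0.99},     # assumed fields
        "net": {"type": "MLPNet", "hid_layers": [64]},   # assumed fields
    }],
    "env": [{
        "name": "CartPole-v0",
        "max_episode": 100,  # the same field the example edits via spec['env'][0]['max_episode']
    }],
    "body": {"product": "outer", "num": 1},              # assumed fields
    "meta": {"max_session": 1, "max_trial": 1},          # assumed fields
}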
Applicability: Usable with much work
git: https://github.com/hill-a/stable-baselines
docs: https://stable-baselines.readthedocs.io/en/master/
Commentaries:
- Requires rewriting the training loop (similar to a runner); note that the example below uses the original OpenAI Baselines API, from which Stable Baselines was forked.
- TensorFlow
Example:
import gym
import itertools
import numpy as np
import tensorflow as tf
import tensorflow.contrib.layers as layers
import baselines.common.tf_util as U
from baselines import logger
from baselines import deepq
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.deepq.utils import ObservationInput
from baselines.common.schedules import LinearSchedule
def model(inpt, num_actions, scope, reuse=False):
"""This model takes as input an observation and returns values of all actions."""
with tf.variable_scope(scope, reuse=reuse):
out = inpt
out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh)
out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
return out
if __name__ == '__main__':
with U.make_session(num_cpu=8):
# Create the environment
env = gym.make("CartPole-v0")
# Create all the functions necessary to train the model
act, train, update_target, debug = deepq.build_train(
make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name),
q_func=model,
num_actions=env.action_space.n,
optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
)
# Create the replay buffer
replay_buffer = ReplayBuffer(50000)
# Create the schedule for exploration starting from 1 (every action is random) down to
# 0.02 (98% of actions are selected according to values predicted by the model).
exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
# Initialize the parameters and copy them to the target network.
U.initialize()
update_target()
episode_rewards = [0.0]
obs = env.reset()
for t in itertools.count():
# Take action and update exploration to the newest value
action = act(obs[None], update_eps=exploration.value(t))[0]
new_obs, rew, done, _ = env.step(action)
# Store transition in the replay buffer.
replay_buffer.add(obs, action, rew, new_obs, float(done))
obs = new_obs
episode_rewards[-1] += rew
if done:
obs = env.reset()
episode_rewards.append(0)
is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200
if is_solved:
# Show off the result
env.render()
else:
# Minimize the error in Bellman's equation on a batch sampled from replay buffer.
if t > 1000:
obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
# Update target network periodically.
if t % 1000 == 0:
update_target()
if done and len(episode_rewards) % 10 == 0:
logger.record_tabular("steps", t)
logger.record_tabular("episodes", len(episode_rewards))
logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1))
logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
logger.dump_tabular()
implementations: https://github.com/Kaixhin/Rainbow and https://github.com/hengyuan-hu/rainbow
Commentaries:
- DQN only
- PyTorch
- Needs work on the train function; the example below is written for Python 2 and an older PyTorch API (xrange, loss.data[0]), so it needs minor porting.
Example:
import os
import time
import torch
import torch.nn
from torch.autograd import Variable
import numpy as np
import utils
from core import samples_to_tensors
from logger import Logger
def update_agent(agent, replay_memory, gamma, optim, batch_size):
samples = replay_memory.sample(batch_size)
states, actions, rewards, next_states, non_ends = samples_to_tensors(samples)
actions = utils.one_hot(actions.unsqueeze(1), agent.num_actions)
targets = agent.compute_targets(rewards, next_states, non_ends, gamma)
states = Variable(states)
actions = Variable(actions)
targets = Variable(targets)
loss = agent.loss(states, actions, targets)
loss.backward()
optim.step()
optim.zero_grad()
return loss.data[0]
def train(agent,
env,
policy,
replay_memory,
gamma,
batch_size,
num_iters,
frames_per_update,
frames_per_sync,
frames_per_eval,
evaluator,
output_dir):
logger = Logger(os.path.join(output_dir, 'train_log.txt'))
optim = torch.optim.Adam(agent.parameters(), lr=6.25e-5, eps=1.5e-4)
action_dist = np.zeros(env.num_actions)
max_epsd_iters = 20000
best_avg_rewards = 0
num_epsd = 0
epsd_iters = 0
epsd_rewards = 0
t = time.time()
for i in xrange(num_iters):
if env.end or epsd_iters > max_epsd_iters:
num_epsd += 1
if num_epsd % 10 == 0:
fps = epsd_iters / (time.time() - t)
logger.write('Episode: %d, Iter: %d, Fps: %.2f'
% (num_epsd, i+1, fps))
logger.write('sum clipped rewards %d' % epsd_rewards)
logger.log()
epsd_iters = 0
epsd_rewards = 0
t = time.time()
state = env.reset()
action = policy.get_action(state)
action_dist[action] += 1
next_state, reward = env.step(action)
replay_memory.append(state, action, reward, next_state, env.end)
state = next_state
epsd_iters += 1
epsd_rewards += reward
if (i+1) % frames_per_update == 0:
loss = update_agent(agent, replay_memory, gamma, optim, batch_size)
logger.append('loss', loss)
policy.decay()
if (i+1) % frames_per_sync == 0:
logger.write('>>>syncing nets, i: %d' % (i+1))
agent.sync_target()
if (i+1) % frames_per_eval == 0:
logger.write('Train Action distribution:')
for act, count in enumerate(action_dist):
prob = float(count) / action_dist.sum()
logger.write('\t action: %d, p: %.4f' % (act, prob))
action_dist = np.zeros(env.num_actions)
avg_rewards = evaluator(logger)
if avg_rewards > best_avg_rewards:
prefix = os.path.join(output_dir, '')
agent.save_q_net(prefix)
best_avg_rewards = avg_rewards
def evaluate(env, policy, num_epsd, logger):
actions = np.zeros(env.num_actions)
total_rewards = np.zeros(num_epsd)
epsd_idx = 0
epsd_iters = 0
max_epsd_iters = 108000
state = env.reset()
while epsd_idx < num_epsd:
action = policy.get_action(state)
actions[action] += 1
state, _ = env.step(action)
epsd_iters += 1
if env.end or epsd_iters >= max_epsd_iters:
total_rewards[epsd_idx] = env.total_reward
logger.write('>>>Eval: [%d/%d], rewards: %s' %
(epsd_idx+1, num_epsd, total_rewards[epsd_idx]))
if epsd_idx < num_epsd - 1: # leave last reset to next run
state = env.reset()
epsd_idx += 1
epsd_iters = 0
avg_rewards = total_rewards.mean()
logger.write('>>>Eval: avg total rewards: %s' % avg_rewards)
logger.write('>>>Eval: actions dist:')
probs = list(actions/actions.sum())
for action, prob in enumerate(probs):
logger.write('\t action: %d, p: %.4f' % (action, prob))
return avg_rewards
Applicability: Can be used with some encapsulation
git: https://github.com/rlcode/reinforcement-learning
Commentaries:
- Can be used as a base for implementing our own agents
- TensorFlow (Keras)
- Supports multiple agents (the example spawns A3C worker threads)
Example:
import threading
import numpy as np
import tensorflow as tf
import pylab
import time
import gym
from keras.layers import Dense, Input
from keras.models import Model
from keras.optimizers import Adam
from keras import backend as K
# global variables for threading
episode = 0
scores = []
EPISODES = 2000
# This is A3C(Asynchronous Advantage Actor Critic) agent(global) for the Cartpole
# In this example, we use A3C algorithm
class A3CAgent:
def __init__(self, state_size, action_size, env_name):
# get size of state and action
self.state_size = state_size
self.action_size = action_size
# get gym environment name
self.env_name = env_name
# these are hyper parameters for the A3C
self.actor_lr = 0.001
self.critic_lr = 0.001
self.discount_factor = .99
self.hidden1, self.hidden2 = 24, 24
self.threads = 8
# create model for actor and critic network
self.actor, self.critic = self.build_model()
# method for training actor and critic network
self.optimizer = [self.actor_optimizer(), self.critic_optimizer()]
self.sess = tf.InteractiveSession()
K.set_session(self.sess)
self.sess.run(tf.global_variables_initializer())
# approximate policy and value using Neural Network
# actor -> state is input and probability of each action is output of network
# critic -> state is input and value of state is output of network
# actor and critic network share first hidden layer
def build_model(self):
state = Input(batch_shape=(None, self.state_size))
shared = Dense(self.hidden1, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform')(state)
actor_hidden = Dense(self.hidden2, activation='relu', kernel_initializer='glorot_uniform')(shared)
action_prob = Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform')(actor_hidden)
value_hidden = Dense(self.hidden2, activation='relu', kernel_initializer='he_uniform')(shared)
state_value = Dense(1, activation='linear', kernel_initializer='he_uniform')(value_hidden)
actor = Model(inputs=state, outputs=action_prob)
critic = Model(inputs=state, outputs=state_value)
actor._make_predict_function()
critic._make_predict_function()
actor.summary()
critic.summary()
return actor, critic
# make loss function for Policy Gradient
# [log(action probability) * advantages] will be input for the back prop
# we add entropy of action probability to loss
def actor_optimizer(self):
action = K.placeholder(shape=(None, self.action_size))
advantages = K.placeholder(shape=(None, ))
policy = self.actor.output
good_prob = K.sum(action * policy, axis=1)
eligibility = K.log(good_prob + 1e-10) * K.stop_gradient(advantages)
loss = -K.sum(eligibility)
entropy = K.sum(policy * K.log(policy + 1e-10), axis=1)
actor_loss = loss + 0.01*entropy
optimizer = Adam(lr=self.actor_lr)
updates = optimizer.get_updates(self.actor.trainable_weights, [], actor_loss)
train = K.function([self.actor.input, action, advantages], [], updates=updates)
return train
# make loss function for Value approximation
def critic_optimizer(self):
discounted_reward = K.placeholder(shape=(None, ))
value = self.critic.output
loss = K.mean(K.square(discounted_reward - value))
optimizer = Adam(lr=self.critic_lr)
updates = optimizer.get_updates(self.critic.trainable_weights, [], loss)
train = K.function([self.critic.input, discounted_reward], [], updates=updates)
return train
# make agents(local) and start training
def train(self):
# self.load_model('./save_model/cartpole_a3c.h5')
agents = [Agent(i, self.actor, self.critic, self.optimizer, self.env_name, self.discount_factor,
self.action_size, self.state_size) for i in range(self.threads)]
for agent in agents:
agent.start()
while True:
time.sleep(20)
plot = scores[:]
pylab.plot(range(len(plot)), plot, 'b')
pylab.savefig("./save_graph/cartpole_a3c.png")
self.save_model('./save_model/cartpole_a3c.h5')
def save_model(self, name):
self.actor.save_weights(name + "_actor.h5")
self.critic.save_weights(name + "_critic.h5")
def load_model(self, name):
self.actor.load_weights(name + "_actor.h5")
self.critic.load_weights(name + "_critic.h5")
# This is Agent(local) class for threading
class Agent(threading.Thread):
def __init__(self, index, actor, critic, optimizer, env_name, discount_factor, action_size, state_size):
threading.Thread.__init__(self)
self.states = []
self.rewards = []
self.actions = []
self.index = index
self.actor = actor
self.critic = critic
self.optimizer = optimizer
self.env_name = env_name
self.discount_factor = discount_factor
self.action_size = action_size
self.state_size = state_size
# Thread interactive with environment
def run(self):
global episode
env = gym.make(self.env_name)
while episode < EPISODES:
state = env.reset()
score = 0
while True:
action = self.get_action(state)
next_state, reward, done, _ = env.step(action)
score += reward
self.memory(state, action, reward)
state = next_state
if done:
episode += 1
print("episode: ", episode, "/ score : ", score)
scores.append(score)
self.train_episode(score != 500)
break
# In Policy Gradient, Q function is not available.
# Instead agent uses sample returns for evaluating policy
def discount_rewards(self, rewards, done=True):
discounted_rewards = np.zeros_like(rewards)
running_add = 0
if not done:
running_add = self.critic.predict(np.reshape(self.states[-1], (1, self.state_size)))[0]
for t in reversed(range(0, len(rewards))):
running_add = running_add * self.discount_factor + rewards[t]
discounted_rewards[t] = running_add
return discounted_rewards
# save <s, a ,r> of each step
# this is used for calculating discounted rewards
def memory(self, state, action, reward):
self.states.append(state)
act = np.zeros(self.action_size)
act[action] = 1
self.actions.append(act)
self.rewards.append(reward)
# update policy network and value network every episode
def train_episode(self, done):
discounted_rewards = self.discount_rewards(self.rewards, done)
values = self.critic.predict(np.array(self.states))
values = np.reshape(values, len(values))
advantages = discounted_rewards - values
self.optimizer[0]([self.states, self.actions, advantages])
self.optimizer[1]([self.states, discounted_rewards])
self.states, self.actions, self.rewards = [], [], []
def get_action(self, state):
policy = self.actor.predict(np.reshape(state, [1, self.state_size]))[0]
return np.random.choice(self.action_size, 1, p=policy)[0]
if __name__ == "__main__":
env_name = 'CartPole-v1'
env = gym.make(env_name)
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
env.close()
global_agent = A3CAgent(state_size, action_size, env_name)
global_agent.train()
Applicability: Needs work on fit
git: https://github.com/HassamSheikh/YARL
Commentaries:
- Keras
- Would need work on the fit method
- Only DQN and DDQN
Example:
import sys
sys.path.append('../')
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import RMSprop, Adam
from keras import losses
from rl.agents.DQN import DQNAgent
from rl.memory.ReplayBuffer import ReplayBuffer
from rl.policy.Policy import DecayingEpsilonGreedyPolicy
import gym
def create_model(state_dim, number_of_actions):
model = Sequential()
model.add(Dense(output_dim=64, activation='relu', input_dim=state_dim))
model.add(Dense(output_dim=number_of_actions, activation='linear'))
return model
env = gym.make('CartPole-v0')
model=create_model(env.observation_space.shape[0], env.action_space.n)
replay_buffer=ReplayBuffer()
policy=DecayingEpsilonGreedyPolicy()
agent=DQNAgent(env, model, policy, replay_buffer, tau=0.99)
agent.compile(RMSprop(lr=0.00025), losses.logcosh)
agent.fit(10000)
Applicability: Highly usable
git: https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow
Commentaries:
- TensorFlow
- Many algorithms
Example:
"""
Deep Q network,
Using:
Tensorflow: 1.0
gym: 0.7.3
"""
import gym
from RL_brain import DeepQNetwork
env = gym.make('CartPole-v0')
env = env.unwrapped
print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)
RL = DeepQNetwork(n_actions=env.action_space.n,
n_features=env.observation_space.shape[0],
learning_rate=0.01, e_greedy=0.9,
replace_target_iter=100, memory_size=2000,
e_greedy_increment=0.001,)
total_steps = 0
for i_episode in range(100):
observation = env.reset()
ep_r = 0
while True:
env.render()
action = RL.choose_action(observation)
observation_, reward, done, info = env.step(action)
# the smaller theta and closer to center the better
x, x_dot, theta, theta_dot = observation_
r1 = (env.x_threshold - abs(x))/env.x_threshold - 0.8
r2 = (env.theta_threshold_radians - abs(theta))/env.theta_threshold_radians - 0.5
reward = r1 + r2
RL.store_transition(observation, action, reward, observation_)
ep_r += reward
if total_steps > 1000:
RL.learn()
if done:
print('episode: ', i_episode,
'ep_r: ', round(ep_r, 2),
' epsilon: ', round(RL.epsilon, 2))
break
observation = observation_
total_steps += 1
RL.plot_cost()
Applicability: Needs some work on train/learn function
git: https://github.com/Officium/RL-Experiments
Commentaries:
- Essentially OpenAI Baselines reimplemented in PyTorch
- Needs work on the train function
- PyTorch
Example:
""" Run script """
import argparse
import os
import torch
from common.util import learn, parse_all_args
""" Some notice """
print("""
Notes:
CUDA usage is depend on `CUDA_VISIBLE_DEVICES`;
Log will be recorded at ../logs/{env}_{algorithm}_{seed}/ by default;
If you need multi-gpu training or other nn specific features, please
modify the default.py file in corresponding algorithm folder.
""")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
""" Parse arguments """
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument('--env', type=str, required=True, help='environment ID')
parser.add_argument('--algorithm', type=str, required=True, help='Algorithm')
parser.add_argument('--nenv', type=int, default=0, help='parrallel number')
parser.add_argument('--seed', type=int, default=0, help='random seed')
parser.add_argument('--number_timesteps', type=float, default=1e6)
parser.add_argument('--reward_scale', type=float, default=1.0)
parser.add_argument('--save_path', type=str, default='../checkpoints')
parser.add_argument('--save_interval', type=int, default=0,
help='save model every x steps (0 = disabled)')
parser.add_argument('--log_path', type=str, default='../logs',
help='save model every x steps (0 = disabled)')
common_options, other_options = parse_all_args(parser)
""" Learn """
if common_options.save_interval:
os.makedirs(common_options.save_path, exist_ok=True)
model = learn(
device=device,
env_id=common_options.env,
nenv=common_options.nenv,
seed=common_options.seed,
number_timesteps=int(common_options.number_timesteps),
save_path=common_options.save_path,
save_interval=common_options.save_interval,
log_path=common_options.log_path,
algorithm=common_options.algorithm,
reward_scale=common_options.reward_scale,
**other_options
)
Applicability: Needs encapsulation
git: https://github.com/higgsfield/RL-Adventure & https://github.com/higgsfield/RL-Adventure-2
Commentaries:
- PyTorch (the snippet's loss.data[0] is the pre-0.4 API; newer versions use loss.item())
- The Jupyter notebook cells need to be encapsulated; see the sketch after the example below.
Example:
num_frames = 15000
batch_size = 32
gamma = 0.99
losses = []
all_rewards = []
episode_reward = 0
state = env.reset()
for frame_idx in range(1, num_frames + 1):
action = current_model.act(state)
next_state, reward, done, _ = env.step(action)
replay_buffer.push(state, action, reward, next_state, done)
state = next_state
episode_reward += reward
if done:
state = env.reset()
all_rewards.append(episode_reward)
episode_reward = 0
if len(replay_buffer) > batch_size:
loss = compute_td_loss(batch_size)
losses.append(loss.data[0])
if frame_idx % 200 == 0:
plot(frame_idx, all_rewards, losses)
if frame_idx % 1000 == 0:
update_target(current_model, target_model)
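A minimal sketch of the encapsulation mentioned above: the notebook cells are wrapped into one training function whose dependencies (env, current_model, target_model, replay_buffer, compute_td_loss, update_target, plot) are the objects defined in the earlier notebook cells and are passed in explicitly. Nothing here is part of the repository's API; it only restructures the loop shown above.
# Hypothetical wrapper around the RL-Adventure notebook cells; parameter names mirror
# the objects created in the earlier cells of the notebook.
def train_dqn(env, current_model, target_model, replay_buffer, compute_td_loss,
              update_target, num_frames=15000, batch_size=32,
              sync_every=1000, plot_every=200, plot_fn=None):
    losses, all_rewards, episode_reward = [], [], 0.0
    state = env.reset()
    for frame_idx in range(1, num_frames + 1):
        action = current_model.act(state)  # exploration is handled inside the model, as above
        next_state, reward, done, _ = env.step(action)
        replay_buffer.push(state, action, reward, next_state, done)
        state, episode_reward = next_state, episode_reward + reward
        if done:
            state = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = 0.0
        if len(replay_buffer) > batch_size:
            losses.append(float(compute_td_loss(batch_size)))
        if plot_fn is not None and frame_idx % plot_every == 0:
            plot_fn(frame_idx, all_rewards, losses)
        if frame_idx % sync_every == 0:
            update_target(current_model, target_model)
    return all_rewards, losses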
Applicability: Can be used with some rework on the Trainer
git: https://github.com/navneet-nmk/pytorch-rl
Commentaries:
- PyTorch
- Many algorithms
- Needs work on the train method or the Trainer object
Example:
# Training script for the DDPG
import torch
# Add this line to get better performance
torch.backends.cudnn.benchmark=True
from Utils import utils
import torch.optim as optim
from models.DDPG import DDPG
import torch.nn.functional as F
use_cuda = torch.cuda.is_available()
from Training.trainer import Trainer
import os
if __name__ == '__main__':
# Specify the environment name and create the appropriate environment
seed = 4240
env = utils.EnvGenerator(name='FetchReach-v1', goal_based=False, seed=seed)
eval_env = utils.EnvGenerator(name='FetchReach-v1', goal_based=False,seed=seed)
action_dim = env.get_action_dim()
observation_dim = env.get_observation_dim()
goal_dim = env.get_goal_dim()
env= env.get_environment()
eval_env = eval_env.get_environment()
# Training constants
her_training=True
# Future framnes to look at
future= 4
buffer_capacity = int(1e3)
q_dim = 1
batch_size = 128
hidden_units = 256
gamma = 0.98 # Discount Factor for future rewards
num_epochs = 50
learning_rate = 0.001
critic_learning_rate = 0.001
polyak_factor = 0.05
# Huber loss to aid small gradients
criterion = F.smooth_l1_loss
# Adam Optimizer
opt = optim.Adam
# Output Folder
output_folder = os.getcwd() + '/output_ddpg/'
# Convert the observation and action dimension to int
print(observation_dim)
observation_dim = int(observation_dim)
action_dim = int(action_dim)
print(action_dim)
goal_dim= int(goal_dim)
# Create the agent
agent = DDPG(num_hidden_units=hidden_units, input_dim=observation_dim+goal_dim,
num_actions=action_dim, num_q_val=q_dim, batch_size=batch_size, random_seed=seed,
use_cuda=use_cuda, gamma=gamma, actor_optimizer=opt, critic_optimizer=optim,
actor_learning_rate=learning_rate, critic_learning_rate=critic_learning_rate,
loss_function=criterion, polyak_constant=polyak_factor, buffer_capacity=buffer_capacity,
goal_dim=goal_dim, observation_dim=observation_dim)
# Train the agent
trainer = Trainer(agent=agent, num_epochs=50, num_rollouts=19*50, num_eval_rollouts=100,
max_episodes_per_epoch=50, env=env, eval_env=None,
nb_train_steps=19*50, multi_gpu_training=False, random_seed=seed, future=future)
if her_training:
trainer.her_training()
else:
trainer.train()
Applicability: Needs rework on Runner
git: https://github.com/rlworkgroup/garage
Commentaries:
- TensorFlow
- No clear API without the runner
- Many algorithms
Example:
#!/usr/bin/env python3
"""
An example to train a task with DQN algorithm.
Here it creates a gym environment CartPole, and trains a DQN with 50k steps.
"""
import gym
from garage.experiment import LocalRunner, run_experiment
from garage.np.exploration_strategies import EpsilonGreedyStrategy
from garage.replay_buffer import SimpleReplayBuffer
from garage.tf.algos import DQN
from garage.tf.envs import TfEnv
from garage.tf.policies import DiscreteQfDerivedPolicy
from garage.tf.q_functions import DiscreteMLPQFunction
def run_task(snapshot_config, *_):
"""Run task."""
with LocalRunner(snapshot_config=snapshot_config) as runner:
n_epochs = 10
n_epoch_cycles = 10
sampler_batch_size = 500
num_timesteps = n_epochs * n_epoch_cycles * sampler_batch_size
env = TfEnv(gym.make('CartPole-v0'))
replay_buffer = SimpleReplayBuffer(
env_spec=env.spec, size_in_transitions=int(1e4), time_horizon=1)
qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf)
epilson_greedy_strategy = EpsilonGreedyStrategy(
env_spec=env.spec,
total_timesteps=num_timesteps,
max_epsilon=1.0,
min_epsilon=0.02,
decay_ratio=0.1)
algo = DQN(
env_spec=env.spec,
policy=policy,
qf=qf,
exploration_strategy=epilson_greedy_strategy,
replay_buffer=replay_buffer,
qf_lr=1e-4,
discount=1.0,
min_buffer_size=int(1e3),
double_q=True,
n_train_steps=500,
n_epoch_cycles=n_epoch_cycles,
target_network_update_freq=1,
buffer_batch_size=32)
runner.setup(algo, env)
runner.train(
n_epochs=n_epochs,
n_epoch_cycles=n_epoch_cycles,
batch_size=sampler_batch_size)
run_experiment(run_task, snapshot_mode='last', seed=1)
#!/usr/bin/env python3
"""
This is an example to train a task with the TRPO algorithm.
Here it runs CartPole-v1 environment with 100 iterations.
Results:
AverageReturn: 100
RiseTime: itr 13
"""
from garage.experiment import LocalRunner, run_experiment
from garage.np.baselines import LinearFeatureBaseline
from garage.tf.algos import TRPO
from garage.tf.envs import TfEnv
from garage.tf.policies import CategoricalMLPPolicy
def run_task(snapshot_config, *_):
with LocalRunner(snapshot_config=snapshot_config) as runner:
env = TfEnv(env_name='CartPole-v1')
policy = CategoricalMLPPolicy(
name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = TRPO(
env_spec=env.spec,
policy=policy,
baseline=baseline,
max_path_length=100,
discount=0.99,
max_kl_step=0.01)
runner.setup(algo, env)
runner.train(n_epochs=100, batch_size=4000)
run_experiment(
run_task,
snapshot_mode='last',
seed=1,
)
Applicability: Highly Usable Independent agent
git: https://github.com/rlgraph/rlgraph
docs: https://rlgraph.readthedocs.io/en/latest/?badge=latest
tutorial: https://rlgraph.github.io/rlgraph/2019/01/04/introducing-rlgraph.html
Commentaries:
- Many algorithms
- Partial PyTorch backend, not yet complete
- TensorFlow and Ray
Example:
from rlgraph.agents import DQNAgent
from rlgraph.environments import OpenAIGymEnv
environment = OpenAIGymEnv('CartPole-v0')
# Create from .json file or dict, see agent API for all
# possible configuration parameters.
agent = DQNAgent.from_file(
"configs/dqn_cartpole.json",
state_space=environment.state_space,
action_space=environment.action_space
)
# Get an action, take a step, observe reward.
state = environment.reset()
action, preprocessed_state = agent.get_action(
states=state,
extra_returns="preprocessed_states"
)
# Execute step in environment.
next_state, reward, terminal, info = environment.step(action)
# Observe result.
agent.observe(
preprocessed_states=preprocessed_state,
actions=action,
internals=[],
next_states=next_state,
rewards=reward,
terminals=terminal
)
# Call update when desired:
loss = agent.update()
Applicability: Needs rework on Core (the runner)
git: https://github.com/AIRLab-POLIMI/mushroom
Commentaries:
- PyTorch
- Medium-sized list of algorithms
Example:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from mushroom.algorithms.value import DQN
from mushroom.core import Core
from mushroom.environments import *
from mushroom.policy import EpsGreedy
from mushroom.approximators.parametric.pytorch_network import *
from mushroom.utils.dataset import compute_J
from mushroom.utils.parameters import Parameter, LinearParameter
class Network(nn.Module):
def __init__(self, input_shape, output_shape, n_features, **kwargs):
super().__init__()
n_input = input_shape[-1]
n_output = output_shape[0]
self._h1 = nn.Linear(n_input, n_features)
self._h2 = nn.Linear(n_features, n_features)
self._h3 = nn.Linear(n_features, n_output)
nn.init.xavier_uniform_(self._h1.weight,
gain=nn.init.calculate_gain('relu'))
nn.init.xavier_uniform_(self._h2.weight,
gain=nn.init.calculate_gain('relu'))
nn.init.xavier_uniform_(self._h3.weight,
gain=nn.init.calculate_gain('linear'))
def forward(self, state, action=None):
features1 = F.relu(self._h1(torch.squeeze(state, 1).float()))
features2 = F.relu(self._h2(features1))
q = self._h3(features2)
if action is None:
return q
else:
action = action.long()
q_acted = torch.squeeze(q.gather(1, action))
return q_acted
def experiment(n_epochs, n_steps, n_steps_test):
np.random.seed()
# MDP
horizon = 1000
gamma = 0.99
gamma_eval = 1.
mdp = Gym('Acrobot-v1', horizon, gamma)
# Policy
epsilon = LinearParameter(value=1., threshold_value=.01, n=5000)
epsilon_test = Parameter(value=0.)
epsilon_random = Parameter(value=1.)
pi = EpsGreedy(epsilon=epsilon_random)
# Settings
initial_replay_size = 500
max_replay_size = 5000
target_update_frequency = 100
batch_size = 200
n_features = 80
train_frequency = 1
# Approximator
input_shape = mdp.info.observation_space.shape
approximator_params = dict(network=Network,
optimizer={'class': optim.Adam,
'params': {'lr': .001}},
loss=F.smooth_l1_loss,
n_features=n_features,
input_shape=input_shape,
output_shape=mdp.info.action_space.size,
n_actions=mdp.info.action_space.n)
# Agent
agent = DQN(PyTorchApproximator, pi, mdp.info,
approximator_params=approximator_params, batch_size=batch_size,
n_approximators=1, initial_replay_size=initial_replay_size,
max_replay_size=max_replay_size,
target_update_frequency=target_update_frequency)
# Algorithm
core = Core(agent, mdp)
core.learn(n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size)
# RUN
pi.set_epsilon(epsilon_test)
dataset = core.evaluate(n_steps=n_steps_test, render=False)
J = compute_J(dataset, gamma_eval)
print('J: ', np.mean(J))
for n in range(n_epochs):
print('Epoch: ', n)
pi.set_epsilon(epsilon)
core.learn(n_steps=n_steps, n_steps_per_fit=train_frequency)
pi.set_epsilon(epsilon_test)
dataset = core.evaluate(n_steps=n_steps_test, render=False)
J = compute_J(dataset, gamma_eval)
print('J: ', np.mean(J))
print('Press a button to visualize acrobot')
input()
core.evaluate(n_episodes=5, render=True)
if __name__ == '__main__':
experiment(n_epochs=20, n_steps=1000, n_steps_test=2000)
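Since the required rework is on Core (the runner), here is a rough, assumption-laden sketch of the loop Core.learn appears to run: collect transitions as (state, action, reward, next_state, absorbing, last) tuples via agent.draw_action and hand them to agent.fit every n_steps_per_fit steps. Verify the tuple layout and method names against the installed mushroom version before relying on this.
# Hypothetical stand-in for mushroom's Core.learn; not part of the library API.
def manual_learn(agent, mdp, n_steps, n_steps_per_fit):
    dataset = []
    state = mdp.reset()
    for _ in range(n_steps):
        action = agent.draw_action(state)
        next_state, reward, absorbing, _ = mdp.step(action)  # assumed Gym-wrapper return order
        last = absorbing  # a full runner would also flag horizon truncation here
        dataset.append((state, action, reward, next_state, absorbing, last))
        state = mdp.reset() if last else next_state
        if len(dataset) >= n_steps_per_fit:
            agent.fit(dataset)  # assumed to accept a list of transition tuples
            dataset = []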
Applicability: Can be used with some coding
git: https://github.com/csxeba/trickster
Commentaries:
- Keras
- Several algorithms: DQN, DDQN, A2C, PPO
- Nice documentation
Example:
import numpy as np
import gym
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from trickster.agent import DQN
from trickster.rollout import Trajectory, RolloutConfig, Rolling
from trickster.experience import Experience
from trickster.utility import visual
env = gym.make("CartPole-v1")
input_shape = env.observation_space.shape
num_actions = env.action_space.n
ann = Sequential([Dense(16, activation="relu", input_shape=input_shape),
Dense(16, activation="relu"),
Dense(num_actions, activation="linear")])
ann.compile(loss="mse", optimizer=Adam(1e-3))
agent = DQN(ann,
action_space=2,
memory=Experience(max_length=10000),
epsilon=1.,
discount_factor_gamma=0.98,
use_target_network=True)
rollout = Rolling(agent, env, config=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, gym.make("CartPole-v1"), config=RolloutConfig())
rewards = []
losses = []
for episode in range(1, 301):
episode_losses = []
for update in range(32):
rollout.roll(steps=4, verbose=0, push_experience=True)
agent_history = agent.fit(batch_size=32, verbose=0)
episode_losses.append(agent_history["loss"])
test_history = test_rollout.rollout(verbose=0, push_experience=False)
rewards.append(test_history["reward_sum"])
losses.append(np.mean(episode_losses))
print("\rEpisode {:>4} RWD {:>3.0f} LOSS {:.4f} EPS {:>6.2%}".format(
episode, np.mean(rewards[-10:]), np.mean(losses[-10:]), agent.epsilon), end="")
agent.epsilon *= 0.992
agent.epsilon = max(agent.epsilon, 0.01)
if episode % 5 == 0:
agent.push_weights()
print(" Pushed weights to target net!")
visual.plot_vectors([rewards, losses],
["Rewards", "Losses"],
smoothing_window_size=10)
Applicability: Needs rework on simulation
git: https://github.com/danaugrs/huskarl
Commentaries:
- TensorFlow 2.0 and Keras
- Many algorithms: DQN, multi-step DQN, DDQN, Dueling DQN, A2C, DDPG
- Needs work on the Simulation class
Example:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt
import gym
import huskarl as hk
if __name__ == '__main__':
# Setup gym environment
create_env = lambda: gym.make('CartPole-v0').unwrapped
dummy_env = create_env()
# Build a simple neural network with 3 fully connected layers as our model
model = Sequential([
Dense(16, activation='relu', input_shape=dummy_env.observation_space.shape),
Dense(16, activation='relu'),
Dense(16, activation='relu'),
])
# Create Deep Q-Learning Network agent
agent = hk.agent.DQN(model, actions=dummy_env.action_space.n, nsteps=2)
def plot_rewards(episode_rewards, episode_steps, done=False):
plt.clf()
plt.xlabel('Step')
plt.ylabel('Reward')
for ed, steps in zip(episode_rewards, episode_steps):
plt.plot(steps, ed)
plt.show() if done else plt.pause(0.001) # Pause a bit so that the graph is updated
# Create simulation, train and then test
sim = hk.Simulation(create_env, agent)
sim.train(max_steps=3000, visualize=True, plot=plot_rewards)
sim.test(max_steps=1000)
Applicability: Highly usable, but needs some encapsulation
git: https://github.com/seungeunrho/minimalRL
Commentaries:
- PyTorch
- Minimal, single-file implementations
- Algorithms: REINFORCE, DQN, PPO, DDPG, A3C, ACER
Example:
import gym
import torch
import torch.optim as optim
# Qnet, ReplayBuffer, train() and learning_rate are defined earlier in minimalRL's dqn.py
def main():
env = gym.make('CartPole-v1')
q = Qnet()
q_target = Qnet()
q_target.load_state_dict(q.state_dict())
memory = ReplayBuffer()
print_interval = 20
score = 0.0
optimizer = optim.Adam(q.parameters(), lr=learning_rate)
for n_epi in range(10000):
epsilon = max(0.01, 0.08 - 0.01*(n_epi/200)) #Linear annealing from 8% to 1%
s = env.reset()
for t in range(600):
a = q.sample_action(torch.from_numpy(s).float(), epsilon)
s_prime, r, done, info = env.step(a)
done_mask = 0.0 if done else 1.0
memory.put((s,a,r/100.0,s_prime, done_mask))
s = s_prime
score += r
if done:
break
if memory.size()>2000:
train(q, q_target, memory, optimizer)
if n_epi%print_interval==0 and n_epi!=0:
q_target.load_state_dict(q.state_dict())
print("# of episode :{}, avg score : {:.1f}, buffer size : {}, epsilon : {:.1f}%".format(
n_epi, score/print_interval, memory.size(), epsilon*100))
score = 0.0
env.close()
if __name__ == '__main__':
main()
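One way to add the missing encapsulation is to hide the Qnet/ReplayBuffer/train pieces behind a small agent interface, so the environment loop above no longer touches them directly. This is only a sketch built on the names from minimalRL's dqn.py (they live at module level there, so in practice you may have to vendor that file); it is not part of the repository.
# Hypothetical wrapper around the minimalRL dqn.py pieces; mirrors the loop in main() above.
class MinimalDQNAgent:
    def __init__(self, lr=0.0005, train_start=2000, sync_interval=20):
        self.q, self.q_target = Qnet(), Qnet()
        self.q_target.load_state_dict(self.q.state_dict())
        self.memory = ReplayBuffer()
        self.optimizer = optim.Adam(self.q.parameters(), lr=lr)
        self.train_start, self.sync_interval = train_start, sync_interval
        self.n_episodes = 0

    def act(self, state, epsilon):
        return self.q.sample_action(torch.from_numpy(state).float(), epsilon)

    def observe(self, transition):
        self.memory.put(transition)  # (s, a, r/100.0, s_prime, done_mask) as in main()

    def end_episode(self):
        self.n_episodes += 1
        if self.memory.size() > self.train_start:
            train(self.q, self.q_target, self.memory, self.optimizer)
        if self.n_episodes % self.sync_interval == 0:
            self.q_target.load_state_dict(self.q.state_dict())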
Applicability: Highly Usable
git: https://github.com/cpnota/autonomous-learning-library
Commentaries:
- PyTorch
- Algorithms: DQN, A2C, VPG, and more
Example:
import argparse
from all.environments import GymEnvironment
from all.experiments import Experiment
from all.presets import classic_control
def run_atari():
parser = argparse.ArgumentParser(
description='Run a classic control benchmark.')
parser.add_argument('env', help='Name of the env (e.g. CartPole-v1)')
parser.add_argument(
'agent', help="Name of the agent (e.g. sarsa). See presets for available agents.")
parser.add_argument('--episodes', type=int, default=1000,
help='The number of training frames')
parser.add_argument(
'--device', default='cuda',
help='The name of the device to run the agent on (e.g. cpu, cuda, cuda:0)'
)
args = parser.parse_args()
env = GymEnvironment(args.env, device=args.device)
agent_name = args.agent
agent = getattr(classic_control, agent_name)
experiment = Experiment(
env,
episodes=args.episodes
)
experiment.run(agent(device=args.device), label=agent_name)
if __name__ == '__main__':
run_atari()
Applicability: Usable
git: https://github.com/nikhilbarhate99/TD3-PyTorch-BipedalWalker-v2 & https://github.com/nikhilbarhate99/PPO-PyTorch
Commentaries:
- PyTorch
- Only two algorithms (TD3 and PPO)
Example:
import torch
import torch.nn as nn
from torch.distributions import Categorical
import gym
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class Memory:
def __init__(self):
self.actions = []
self.states = []
self.logprobs = []
self.rewards = []
def clear_memory(self):
del self.actions[:]
del self.states[:]
del self.logprobs[:]
del self.rewards[:]
class ActorCritic(nn.Module):
def __init__(self, state_dim, action_dim, n_latent_var):
super(ActorCritic, self).__init__()
self.affine = nn.Linear(state_dim, n_latent_var)
# actor
self.action_layer = nn.Sequential(
nn.Linear(state_dim, n_latent_var),
nn.Tanh(),
nn.Linear(n_latent_var, n_latent_var),
nn.Tanh(),
nn.Linear(n_latent_var, action_dim),
nn.Softmax(dim=-1)
)
# critic
self.value_layer = nn.Sequential(
nn.Linear(state_dim, n_latent_var),
nn.Tanh(),
nn.Linear(n_latent_var, n_latent_var),
nn.Tanh(),
nn.Linear(n_latent_var, 1)
)
def forward(self):
raise NotImplementedError
def act(self, state, memory):
state = torch.from_numpy(state).float().to(device)
action_probs = self.action_layer(state)
dist = Categorical(action_probs)
action = dist.sample()
memory.states.append(state)
memory.actions.append(action)
memory.logprobs.append(dist.log_prob(action))
return action.item()
def evaluate(self, state, action):
action_probs = self.action_layer(state)
dist = Categorical(action_probs)
action_logprobs = dist.log_prob(action)
dist_entropy = dist.entropy()
state_value = self.value_layer(state)
return action_logprobs, torch.squeeze(state_value), dist_entropy
class PPO:
def __init__(self, state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip):
self.lr = lr
self.betas = betas
self.gamma = gamma
self.eps_clip = eps_clip
self.K_epochs = K_epochs
self.policy = ActorCritic(state_dim, action_dim, n_latent_var).to(device)
self.optimizer = torch.optim.Adam(self.policy.parameters(),
lr=lr, betas=betas)
self.policy_old = ActorCritic(state_dim, action_dim, n_latent_var).to(device)
self.MseLoss = nn.MSELoss()
def update(self, memory):
# Monte Carlo estimate of state rewards:
rewards = []
discounted_reward = 0
for reward in reversed(memory.rewards):
discounted_reward = reward + (self.gamma * discounted_reward)
rewards.insert(0, discounted_reward)
# Normalizing the rewards:
rewards = torch.tensor(rewards).to(device)
rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)
# convert list to tensor
old_states = torch.stack(memory.states).to(device).detach()
old_actions = torch.stack(memory.actions).to(device).detach()
old_logprobs = torch.stack(memory.logprobs).to(device).detach()
# Optimize policy for K epochs:
for _ in range(self.K_epochs):
# Evaluating old actions and values :
logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)
# Finding the ratio (pi_theta / pi_theta__old):
ratios = torch.exp(logprobs - old_logprobs.detach())
# Finding Surrogate Loss:
advantages = rewards - state_values.detach()
surr1 = ratios * advantages
surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages
loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy
# take gradient step
self.optimizer.zero_grad()
loss.mean().backward()
self.optimizer.step()
# Copy new weights into old policy:
self.policy_old.load_state_dict(self.policy.state_dict())
def main():
############## Hyperparameters ##############
env_name = "LunarLander-v2"
# creating environment
env = gym.make(env_name)
state_dim = env.observation_space.shape[0]
action_dim = 4
render = False
solved_reward = 230 # stop training if avg_reward > solved_reward
log_interval = 20 # print avg reward in the interval
max_episodes = 50000 # max training episodes
max_timesteps = 300 # max timesteps in one episode
n_latent_var = 64 # number of variables in hidden layer
update_timestep = 2000 # update policy every n timesteps
lr = 0.002
betas = (0.9, 0.999)
gamma = 0.99 # discount factor
K_epochs = 4 # update policy for K epochs
eps_clip = 0.2 # clip parameter for PPO
random_seed = None
#############################################
if random_seed:
torch.manual_seed(random_seed)
env.seed(random_seed)
memory = Memory()
ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
print(lr,betas)
# logging variables
running_reward = 0
avg_length = 0
timestep = 0
# training loop
for i_episode in range(1, max_episodes+1):
state = env.reset()
for t in range(max_timesteps):
timestep += 1
# Running policy_old:
action = ppo.policy_old.act(state, memory)
state, reward, done, _ = env.step(action)
# Saving reward:
memory.rewards.append(reward)
# update if its time
if timestep % update_timestep == 0:
ppo.update(memory)
memory.clear_memory()
timestep = 0
running_reward += reward
if render:
env.render()
if done:
break
avg_length += t
# stop training if avg_reward > solved_reward
if running_reward > (log_interval*solved_reward):
print("########## Solved! ##########")
torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
break
# logging
if i_episode % log_interval == 0:
avg_length = int(avg_length/log_interval)
running_reward = int((running_reward/log_interval))
print('Episode {} \t avg length: {} \t reward: {}'.format(i_episode, avg_length, running_reward))
running_reward = 0
avg_length = 0
if __name__ == '__main__':
main()
Applicability: Highly usable and modularized; needs work on the Runner
git: https://github.com/tensorforce/tensorforce
docs: https://tensorforce.readthedocs.io/en/stable/
Commentaries:
- TensorFlow
- Algorithms: DQN, DDQN, REINFORCE, PPO, A3C, TRPO, and more
- Only the Runner needs to be understood and reworked; a sketch of a manual loop follows the example below.
Example:
# Copyright 2017 reinforce.io. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
from tensorforce.agents import PPOAgent
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym
# Create an OpenAIgym environment.
environment = OpenAIGym('CartPole-v0', visualize=False)
# Network as list of layers
# - Embedding layer:
# - For Gym environments utilizing a discrete observation space, an
# "embedding" layer should be inserted at the head of the network spec.
# Such environments are usually identified by either:
# - class ...Env(discrete.DiscreteEnv):
# - self.observation_space = spaces.Discrete(...)
# Note that depending on the following layers used, the embedding layer *may* need a
# flattening layer
network_spec = [
# dict(type='embedding', indices=100, size=32),
# dict(type'flatten'),
dict(type='dense', size=32),
dict(type='dense', size=32)
]
agent = PPOAgent(
states=environment.states,
actions=environment.actions,
network=network_spec,
# Agent
states_preprocessing=None,
actions_exploration=None,
reward_preprocessing=None,
# MemoryModel
update_mode=dict(
unit='episodes',
# 10 episodes per update
batch_size=10,
# Every 10 episodes
frequency=10
),
memory=dict(
type='latest',
include_next_states=False,
capacity=5000
),
# DistributionModel
distributions=None,
entropy_regularization=0.01,
# PGModel
baseline_mode='states',
baseline=dict(
type='mlp',
sizes=[32, 32]
),
baseline_optimizer=dict(
type='multi_step',
optimizer=dict(
type='adam',
learning_rate=1e-3
),
num_steps=5
),
gae_lambda=0.97,
# PGLRModel
likelihood_ratio_clipping=0.2,
# PPOAgent
step_optimizer=dict(
type='adam',
learning_rate=1e-3
),
subsampling_fraction=0.2,
optimization_steps=25,
execution=dict(
type='single',
session_config=None,
distributed_spec=None
)
)
# Create the runner
runner = Runner(agent=agent, environment=environment)
# Callback function printing episode statistics
def episode_finished(r):
print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(ep=r.episode, ts=r.episode_timestep,
reward=r.episode_rewards[-1]))
return True
# Start learning
runner.run(episodes=3000, max_episode_timesteps=200, episode_finished=episode_finished)
runner.close()
# Print statistics
print("Learning finished. Total episodes: {ep}. Average reward of last 100 episodes: {ar}.".format(
ep=runner.episode,
ar=np.mean(runner.episode_rewards[-100:]))
)
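Reworking the Runner mostly means re-implementing its act/observe loop by hand. Below is a minimal sketch against the same 0.x-era API used above, assuming environment.execute(action) returns (next_state, terminal, reward) as the OpenAIGym wrapper of that release does; check the installed version before relying on the exact signatures.
# Hand-rolled replacement for Runner.run, sketched against the Tensorforce 0.x objects above.
def run_manually(agent, environment, episodes=3000, max_episode_timesteps=200):
    episode_rewards = []
    for _ in range(episodes):
        state = environment.reset()
        agent.reset()
        episode_reward = 0.0
        for _ in range(max_episode_timesteps):
            action = agent.act(states=state)
            state, terminal, reward = environment.execute(action)  # assumed return order
            agent.observe(terminal=terminal, reward=reward)
            episode_reward += reward
            if terminal:
                break
        episode_rewards.append(episode_reward)
    return episode_rewards

# Usage with the objects created above:
# rewards = run_manually(agent, environment)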