Last active: October 26, 2017 23:44
RL: softmax policy gradient on CartPole-v0 and linear Q-learning on FrozenLake-v0
import tensorflow as tf
import numpy as np
import gym
import random

env = gym.make('CartPole-v0')
print('State space:', env.observation_space)
print('Action space:', env.action_space)

# Hyper-params
N_EPISODES = 2000
STATE_SIZE = len(env.observation_space.low)
ACTION_SIZE = env.action_space.n
UPDATE_FREQUENCY = 3
BATCH_SIZE = 128
LEARNING_RATE = 0.15
GAMMA = 0.98

def discounted_rewards(rewards):
    # Discounted returns G_t = r_t + GAMMA * G_{t+1}, computed backwards over one episode.
    result = np.zeros(len(rewards))
    running_sum = 0
    for t in reversed(range(len(rewards))):
        running_sum = GAMMA*running_sum + rewards[t]
        result[t] = running_sum
    return result

with tf.Graph().as_default():
    # Placeholders
    state_ph = tf.placeholder(tf.float32, [None, STATE_SIZE])

    # Weights
    W = tf.Variable(
        tf.random_uniform([STATE_SIZE, ACTION_SIZE], minval=.5, maxval=1.5),
        trainable=True
    )

    # Predict: softmax policy over actions
    action_p = tf.nn.softmax(tf.matmul(state_ph, W))

    reward_ph = tf.placeholder(tf.float32, [None])
    selected_action_ph = tf.placeholder(tf.int32, [None])

    # Loss: select the probability of the action actually taken in each step and
    # minimize its negative log-likelihood weighted by the discounted return.
    action_indices = tf.concat([
        tf.expand_dims(tf.range(tf.shape(action_p)[0]), -1),
        tf.expand_dims(selected_action_ph, -1)
    ], 1)
    responsible_actions = tf.gather_nd(action_p, action_indices)
    loss = -tf.reduce_mean(tf.log(responsible_actions) * reward_ph)

    # Update
    optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    train_op = optimizer.minimize(loss)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        total_rewards = []
        samples_buffer = []
        try:
            for e in range(N_EPISODES):
                state = env.reset()
                e_states, e_actions, e_rewards = [], [], []
                while True:
                    # env.render()
                    # Sample an action from the current policy.
                    action_p_val = sess.run(action_p, feed_dict={state_ph: state.reshape(1, -1)})
                    action = np.random.choice(list(range(len(action_p_val[0]))), p=action_p_val[0])
                    new_state, reward, done, _ = env.step(action)

                    e_states.append(state)
                    e_actions.append(action)
                    e_rewards.append(reward)

                    state = new_state
                    if done:
                        break

                running_rewards = discounted_rewards(e_rewards)
                samples_buffer.extend(zip(e_states, e_actions, running_rewards))
                # print('Episode:', e, 'Total reward:', sum(e_rewards))
                total_rewards.append(sum(e_rewards))

                if e % UPDATE_FREQUENCY == 0:
                    # Train on a shuffled batch of collected (state, action, return) samples.
                    random.shuffle(samples_buffer)
                    states, actions, rewards = zip(*samples_buffer[:BATCH_SIZE])
                    samples_buffer = []
                    _, _loss = sess.run([train_op, loss], feed_dict={
                        state_ph: np.vstack(states),
                        selected_action_ph: np.array(actions),
                        reward_ph: np.array(rewards)
                    })

                if e % 100 == 0:
                    print('Reward (%d):' % (e), np.mean(total_rewards[-100:]))
        except KeyboardInterrupt:
            print('Training interrupted.')

        # Greedy rollout with the learned policy; lift the default 200-step episode limit.
        env._max_episode_steps = 10000
        state = env.reset()
        while True:
            env.render()
            action_p_val = sess.run(action_p, feed_dict={state_ph: state.reshape(1, -1)})
            new_state, _, done, _ = env.step(np.argmax(action_p_val[0]))
            state = new_state
            if done:
                break
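For intuition, here is a small NumPy-only sketch (not part of the original gist; the numbers are purely illustrative) of the two pieces that make the update work: the backwards computation of discounted returns, and the indexing that tf.gather_nd performs to pick out the probability of the action that was actually taken.

import numpy as np

GAMMA = 0.98

# Discounted returns for a 3-step episode with reward 1 at every step:
# G_2 = 1, G_1 = 1 + 0.98*1 = 1.98, G_0 = 1 + 0.98*1.98 = 2.9404
rewards = [1.0, 1.0, 1.0]
returns = np.zeros(len(rewards))
running_sum = 0.0
for t in reversed(range(len(rewards))):
    running_sum = GAMMA*running_sum + rewards[t]
    returns[t] = running_sum
print(returns)  # -> [2.9404, 1.98, 1.0]

# NumPy equivalent of the gather_nd indexing above: for each row of the
# softmax output, select the probability of the sampled action.
action_p_val = np.array([[0.7, 0.3],
                         [0.4, 0.6]])
selected_actions = np.array([0, 1])
responsible = action_p_val[np.arange(len(selected_actions)), selected_actions]
print(responsible)  # -> [0.7, 0.6]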
# Q-network implementation using a linear function approximator.
# Estimates action-values Q(a|s,W) from a one-hot state encoding and a weight matrix W.
import gym
import numpy as np
import matplotlib.pyplot as plt
import time

env = gym.make('FrozenLake-v0')
print('Observation space:', env.observation_space)
print('Action space:', env.action_space)

N_STATES, N_ACTIONS = env.observation_space.n, env.action_space.n

# Hyper-params
LEARNING_RATE = 0.15
GAMMA = 0.99
N_EPISODES = 2000
EPS_DECAY = 0.02

W = np.random.uniform(low=.0, high=.1, size=(N_STATES, N_ACTIONS))

def one_hot(idx, size):
    oh = np.zeros(size)
    oh[idx] = 1
    return oh

def predict(state):
    # Q(s, .) = one_hot(s) @ W, i.e. row `state` of W.
    return np.matmul(one_hot(state, N_STATES).reshape(1, N_STATES), W)

def loss(target, predicted):
    return np.sum(np.power(target - predicted, 2))

def gradient(state, predicted, target):
    # Learning-rate-scaled descent step for the squared error; only row `state` of W changes.
    return LEARNING_RATE * np.matmul(one_hot(state, N_STATES).reshape(N_STATES, 1), 2*(target - predicted))

def e_greedy(action_probas, eps):
    # Epsilon-greedy exploration: random action with probability eps, otherwise greedy.
    if np.random.rand(1) < eps:
        return env.action_space.sample()
    return np.argmax(action_probas)

rewards, epsilons, losses = [], [], []
for e in range(N_EPISODES):
    # Decaying exploration rate.
    eps = 1. / (1 + e*EPS_DECAY)
    epsilons.append(eps)
    state = env.reset()
    i, total_reward, mean_loss = 0, 0, 0
    while True:
        i += 1
        action_p = predict(state)
        action = e_greedy(action_p[0], eps)
        new_state, reward, done, _ = env.step(action)
        # Target vector: bootstrap from the greedy value of the next state.
        target = action_p.copy()
        target[0, action] = reward + GAMMA*np.max(predict(new_state)[0])
        # Update weights
        l = loss(target, action_p)
        W += gradient(state, action_p, target)
        # Metrics: incremental mean of the per-step loss.
        total_reward += reward
        mean_loss += (1.0/i)*(l - mean_loss)
        state = new_state
        if done:
            break
    rewards.append(total_reward)
    losses.append(mean_loss)

print('Success rate:', np.mean(rewards))
plt.plot(rewards, label='Episode reward')
plt.plot(losses, label='Loss (MSE)')
plt.plot(epsilons, label='Eps')
plt.legend(loc='upper right')
plt.show()
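Because the state features are one-hot, the linear approximator above behaves exactly like a Q-table: predict(state) reads row `state` of W, and the gradient step only changes that row. Below is a minimal tabular sketch of the same update (illustrative, not part of the original gist; it reuses the hyperparameter values defined above and picks up a factor of 2 from differentiating the squared error).

import numpy as np

N_STATES, N_ACTIONS = 16, 4        # FrozenLake-v0: 16 grid states, 4 actions
LEARNING_RATE, GAMMA = 0.15, 0.99  # same values as above

Q = np.random.uniform(low=.0, high=.1, size=(N_STATES, N_ACTIONS))

def tabular_update(Q, state, action, reward, new_state):
    # Equivalent of `W += gradient(...)` when the features are one-hot:
    # only Q[state, action] moves, by 2 * LEARNING_RATE times the TD error.
    td_error = reward + GAMMA*np.max(Q[new_state]) - Q[state, action]
    Q[state, action] += 2*LEARNING_RATE*td_error
    return Q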