frozenlake-v0 attempt
# This is a modification of **'s algorithm. (Can't find the original now.)
# Some tuning was done: the step cost is now negative, and a cost was added for ending
# the game with zero reward, so the algorithm is pushed towards the goal.
# Also, training was done unmonitored, as other people seem to have done the same.
# Managed to get just as good results with fewer steps, but the stochasticity of the
# problem makes it hard to reproduce.
import gym
import numpy as np

env = gym.make('FrozenLake-v0')

# Initialize Q matrix (optimistically, above zero, to encourage early exploration)
Q = np.zeros((env.observation_space.n, env.action_space.n)) + 0.25

# Probability of random action
epsilon = 0.5
epsilon_decay = 0.9999
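# Note (addition, not in the original gist): with this decay, epsilon after the
# 20000 training episodes is roughly 0.5 * 0.9999**20000 ≈ 0.068, so the policy
# is acting mostly greedily by the end of training.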
# Learning rate
alpha = 0.05
### TRAINING PHASE ###
num_episodes = 20000
for n in range(num_episodes):
    observation = env.reset()
    done = False
    while not done:
        # Current state
        state = observation
        if np.random.rand() > epsilon:
            # Choose best action according to current Q matrix
            action = np.argmax(Q[state, :])
        else:
            # Take a random action
            action = env.action_space.sample()
        # Take action and observe next state and reward
        observation, reward, done, info = env.step(action)
        if done and reward == 0:
            # Penalize ending the episode without reaching the goal
            reward = -0.1
        reward -= 0.001  # cost of life
        # Q-learning update (undiscounted, i.e. an implicit gamma of 1)
        Q[state, action] += alpha * (reward + np.max(Q[observation, :]) - Q[state, action])
    # Decay epsilon after each episode
    epsilon *= epsilon_decay
### TEST PHASE ###
env.monitor.start('recordings', force=True)
num_episodes = 1000
for n in range(num_episodes):
    observation = env.reset()
    done = False
    while not done:
        # Current state
        state = observation
        action = np.argmax(Q[state, :])
        observation, reward, done, info = env.step(action)
env.monitor.close()
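As a quick sanity check, one can also estimate the greedy policy's success rate directly rather than relying on the monitor recording alone. The sketch below is an addition, not part of the original gist; it assumes the trained Q matrix from above and uses the fact that FrozenLake-v0 only gives a reward of 1 on reaching the goal, so the mean episode reward equals the success rate (the environment's solved threshold was around 0.78 average reward).

# Sketch (addition, not in the original gist): estimate the success rate of the
# greedy policy without the monitor.
total_reward = 0.0
eval_episodes = 1000
for n in range(eval_episodes):
    observation = env.reset()
    done = False
    while not done:
        # Always act greedily with respect to the learned Q matrix
        observation, reward, done, info = env.step(np.argmax(Q[observation, :]))
    # The final step's reward is 1 if the goal was reached, 0 otherwise
    total_reward += reward
print('Estimated success rate: %.3f' % (total_reward / eval_episodes))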