frozenlake-v0 attempt
# This is a modification of **'s algorithm (can't find the original now).
# Some tuning was done: the step cost is now negative, and a penalty was added for ending the episode with zero reward, so the algorithm is pushed towards the goal.
# Also, training was done unmonitored, as other people seem to have done.
# This got results just as good with fewer steps, but the stochasticity of the problem makes them hard to reproduce.
import gym
import numpy as np
env = gym.make('FrozenLake-v0')
# Initialize Q matrix
Q = np.zeros((env.observation_space.n, env.action_space.n)) + 0.25
# Probability of random action
epsilon = 0.5
epsilon_decay = 0.9999
# Learning rate
alpha = 0.05
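# The training loop below uses the tabular Q-learning update with learning
# rate alpha and an implicit discount factor of 1:
#   Q[s, a] <- Q[s, a] + alpha * (reward + max_a' Q[s', a'] - Q[s, a])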
### TRAINING PHASE ###
num_episodes = 20000
for n in xrange(num_episodes):
    observation = env.reset()
    done = False
    while not done:
        # Current state
        state = observation
        if np.random.rand() > epsilon:
            # Choose best action according to current Q matrix
            action = np.argmax(Q[state, :])
        else:
            # Take a random action
            action = env.action_space.sample()
        # Take action and observe state and reward
        observation, reward, done, info = env.step(action)
        if done and reward == 0:
            # Penalize ending the episode without reaching the goal
            reward = -0.1
        reward -= 0.001  # cost of life
        # Q-learning update
        Q[state, action] += alpha * (reward + np.max(Q[observation, :]) - Q[state, action])
    # Decay epsilon after each episode
    epsilon *= epsilon_decay
### TEST PHASE ###
env.monitor.start('recordings', force=True)
num_episodes = 1000
for n in xrange(num_episodes):
    observation = env.reset()
    done = False
    while not done:
        # Current state
        state = observation
        action = np.argmax(Q[state, :])
        observation, reward, done, info = env.step(action)
env.monitor.close()
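# A minimal sketch of estimating the greedy policy's success rate directly,
# without the recording monitor, assuming the same env and Q table as above.
# FrozenLake-v0 only gives reward 1 on reaching the goal, so the final reward
# of each episode is 1 for success and 0 otherwise.
eval_episodes = 1000
successes = 0.0
for n in xrange(eval_episodes):
    observation = env.reset()
    done = False
    while not done:
        action = np.argmax(Q[observation, :])
        observation, reward, done, info = env.step(action)
    successes += reward
print("Success rate over %d episodes: %.3f" % (eval_episodes, successes / eval_episodes))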