frozenlake-v0 attempt
# This is a modification of **'s algorithm. (Can't find the original now.)
# Some tuning was done: the step cost is now negative, and a cost was added for ending
# the game with zero reward, so the algorithm is pushed towards the goal.
# Also, training was done unmonitored, as other people seem to have done the same.
# Managed to get just as good results with fewer steps, but the stochasticity of the
# problem makes it hard to reproduce.
import gym
import numpy as np

env = gym.make('FrozenLake-v0')

# Initialize Q matrix (optimistically, above zero, to encourage early exploration)
Q = np.zeros((env.observation_space.n, env.action_space.n)) + 0.25

# Probability of random action
epsilon = 0.5
epsilon_decay = 0.9999
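# Note (addition, not in the original gist): with this decay, epsilon after the
# 20000 training episodes is roughly 0.5 * 0.9999**20000 ≈ 0.068, so the policy
# is acting mostly greedily by the end of training.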
# Learning rate
alpha = 0.05
### TRAINING PHASE ###
num_episodes = 20000
for n in range(num_episodes):
    observation = env.reset()
    done = False
    while not done:
        # Current state
        state = observation
        if np.random.rand() > epsilon:
            # Choose best action according to current Q matrix
            action = np.argmax(Q[state, :])
        else:
            # Take a random action
            action = env.action_space.sample()
        # Take action and observe next state and reward
        observation, reward, done, info = env.step(action)
        if done and reward == 0:
            # Penalize ending the episode without reaching the goal
            reward = -0.1
        reward -= 0.001  # cost of life
        # Q-learning update (undiscounted, i.e. an implicit gamma of 1)
        Q[state, action] += alpha * (reward + np.max(Q[observation, :]) - Q[state, action])
    # Decay epsilon after each episode
    epsilon *= epsilon_decay
### TEST PHASE ###
env.monitor.start('recordings', force=True)
num_episodes = 1000
for n in range(num_episodes):
    observation = env.reset()
    done = False
    while not done:
        # Current state
        state = observation
        action = np.argmax(Q[state, :])
        observation, reward, done, info = env.step(action)
env.monitor.close()
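As a quick sanity check, one can also estimate the greedy policy's success rate directly rather than relying on the monitor recording alone. The sketch below is an addition, not part of the original gist; it assumes the trained Q matrix from above and uses the fact that FrozenLake-v0 only gives a reward of 1 on reaching the goal, so the mean episode reward equals the success rate (the environment's solved threshold was around 0.78 average reward).

# Sketch (addition, not in the original gist): estimate the success rate of the
# greedy policy without the monitor.
total_reward = 0.0
eval_episodes = 1000
for n in range(eval_episodes):
    observation = env.reset()
    done = False
    while not done:
        # Always act greedily with respect to the learned Q matrix
        observation, reward, done, info = env.step(np.argmax(Q[observation, :]))
    # The final step's reward is 1 if the goal was reached, 0 otherwise
    total_reward += reward
print('Estimated success rate: %.3f' % (total_reward / eval_episodes))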