@starhopp3r
Created February 19, 2018 08:29
from keras.models import Sequential # One layer after the other
from keras.layers import Dense, Flatten # Dense layers are fully connected layers, Flatten layers flatten out multidimensional inputs
from collections import deque # For storing moves
import numpy as np
import gym # To train our network
import random # For sampling batches from the observations
env = gym.make('SuperMarioBros-1-1-v0') # Choose game (the 6-button mapping below is specific to the SuperMarioBros environments)
action_space = 14 # One Q-value output per button combination defined in the mapping below
# Create network. Input is two consecutive game states, output is Q-values of the possible moves.
model = Sequential()
model.add(Dense(20, kernel_initializer="uniform", input_shape=(2,) + env.observation_space.shape, activation="relu"))
model.add(Flatten()) # Flatten input so as to have no problems with processing
model.add(Dense(18, kernel_initializer="uniform", activation="relu"))
model.add(Dense(10, kernel_initializer="uniform", activation="relu"))
model.add(Dense(action_space, kernel_initializer="uniform", activation="linear")) # Same number of outputs as possible actions
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
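# Purely informational: print the layer stack and parameter counts so the network can be checked at a glance.
model.summary()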
# Parameters
D = deque() # Replay memory where the observed transitions (state, action, reward, next state, done) will be stored
observetime = 500 # Number of timesteps we will be acting on the game and observing results
epsilon = 0.7 # Probability of doing a random move
gamma = 0.9 # Discounted future reward. How much we care about steps further in time
mb_size = 50 # Learning minibatch size
# Mapping from the network's action index to the 6-button array expected by env.step()
mapping = {
    0: [0, 0, 0, 0, 0, 0],  # NOOP
    1: [1, 0, 0, 0, 0, 0],  # Up
    2: [0, 0, 1, 0, 0, 0],  # Down
    3: [0, 1, 0, 0, 0, 0],  # Left
    4: [0, 1, 0, 0, 1, 0],  # Left + A
    5: [0, 1, 0, 0, 0, 1],  # Left + B
    6: [0, 1, 0, 0, 1, 1],  # Left + A + B
    7: [0, 0, 0, 1, 0, 0],  # Right
    8: [0, 0, 0, 1, 1, 0],  # Right + A
    9: [0, 0, 0, 1, 0, 1],  # Right + B
    10: [0, 0, 0, 1, 1, 1], # Right + A + B
    11: [0, 0, 0, 0, 1, 0], # A
    12: [0, 0, 0, 0, 0, 1], # B
    13: [0, 0, 0, 0, 1, 1], # A + B
}
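# Sanity check: every index the network can output must have an entry in the mapping above,
# otherwise env.step() below would be handed an unknown action.
assert action_space == len(mapping), "action_space must equal the number of mapped button combinations"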
# FIRST STEP: Knowing what each action does (Observing)
observation = env.reset() # Game begins
obs = np.expand_dims(observation, axis=0) # (Formatting issues) Making the observation the first element of a batch of inputs
state = np.stack((obs, obs), axis=1)
done = False
for t in range(observetime):
    # Epsilon-greedy exploration: random move with probability epsilon, otherwise the greedy move
    if np.random.rand() <= epsilon:
        action = np.random.randint(0, action_space, size=1)[0]
    else:
        Q = model.predict(state)  # Q-value predictions for the current state
        action = np.argmax(Q)     # Move with highest Q-value is the chosen one
    # Translate the action index into the 6-button array the environment expects
    observation_new, reward, done, info = env.step(mapping[action])  # See state of the game, reward... after performing the action
    obs_new = np.expand_dims(observation_new, axis=0)  # (Formatting issues)
    state_new = np.append(np.expand_dims(obs_new, axis=0), state[:, :1, :], axis=1)  # Update the input with the new state of the game
    D.append((state, action, reward, state_new, done))  # 'Remember' the action index and its consequence
    state = state_new  # Update state
    if done:
        observation = env.reset()  # Restart game if it's finished
        obs = np.expand_dims(observation, axis=0)  # (Formatting issues) Making the observation the first element of a batch of inputs
        state = np.stack((obs, obs), axis=1)
print('Observing Finished')
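# random.sample() below raises an error if fewer than mb_size transitions were collected;
# with observetime = 500 and mb_size = 50 this always holds, the guard just makes it explicit.
assert len(D) >= mb_size, "not enough observed transitions to fill a learning minibatch"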
# SECOND STEP: Learning from the observations (Experience replay)
minibatch = random.sample(D, mb_size) # Sample some moves
inputs_shape = (mb_size,) + state.shape[1:]
inputs = np.zeros(inputs_shape)
targets = np.zeros((mb_size, action_space))
for i in range(0, mb_size):
    state = minibatch[i][0]
    action = minibatch[i][1]
    reward = minibatch[i][2]
    state_new = minibatch[i][3]
    done = minibatch[i][4]

    # Build the Bellman update for the Q function
    inputs[i:i+1] = np.expand_dims(state, axis=0)
    targets[i] = model.predict(state)
    Q_sa = model.predict(state_new)

    if done:
        targets[i, action] = reward
    else:
        targets[i, action] = reward + gamma * np.max(Q_sa)

    # Train network to output the Q function
    model.train_on_batch(inputs, targets)
    print('Run {} complete'.format(i))
# Learning Finished
print('Learning Finished')
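# Optional: persist the learned weights so a later session can rebuild the same model and skip
# straight to playing. The filename is only an example; save_weights/load_weights are standard Keras calls.
# model.save_weights('mario_dqn_weights.h5')
# model.load_weights('mario_dqn_weights.h5')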
# THIRD STEP: Play!
observation = env.reset()
obs = np.expand_dims(observation, axis=0)
state = np.stack((obs, obs), axis=1)
done = False
tot_reward = 0.0
while not done:
    env.render()  # Comment this out to run without the game window
    Q = model.predict(state)  # Q-value predictions for the current state
    action = np.argmax(Q)     # Always play greedily now that training is done
    observation, reward, done, info = env.step(mapping[action])
    obs = np.expand_dims(observation, axis=0)
    state = np.append(np.expand_dims(obs, axis=0), state[:, :1, :], axis=1)
    tot_reward += reward
    print(reward)
print('Game ended! Total reward: {}'.format(tot_reward))
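# Release the emulator/render window now that the episode is over.
env.close()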