from keras.models import Sequential  # One layer after the other
from keras.layers import Dense, Flatten  # Dense layers are fully connected layers, Flatten layers flatten out multidimensional inputs
from collections import deque  # For storing moves
import numpy as np
import gym  # To train our network
import random  # For sampling batches from the observations
env = gym.make('SuperMarioBros-1-1-v0')  # Choose game (assumes the Super Mario Bros gym environment plugin is installed)
action_space = 14  # Number of discrete actions; must match the number of entries in `mapping` below
# Create network. Input is two consecutive game states, output is Q-values of the possible moves.
model = Sequential()
model.add(Dense(20, kernel_initializer="uniform", input_shape=(2,) + env.observation_space.shape, activation="relu"))  # Dense on a multi-dimensional input acts on the last axis
model.add(Flatten())  # Flatten so the following Dense layers receive a 1-D feature vector
model.add(Dense(18, kernel_initializer="uniform", activation="relu"))
model.add(Dense(10, kernel_initializer="uniform", activation="relu"))
model.add(Dense(action_space, kernel_initializer="uniform", activation="linear"))  # One output (Q-value) per possible action
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
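# A quick, optional way to check the resulting architecture: model.summary()
# prints each layer's output shape, and the final Dense layer should report
# `action_space` units. Not required for training.
# model.summary()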
# Parameters
D = deque()  # Replay memory where observed transitions will be stored
observetime = 500  # Number of timesteps we will act in the game and observe results
epsilon = 0.7  # Probability of doing a random move
gamma = 0.9  # Discount on future reward. How much we care about steps further in time
mb_size = 50  # Learning minibatch size
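# For intuition about gamma: a reward received k steps in the future is weighted
# by gamma ** k, e.g. 0.9 ** 3 = 0.729, so a reward three steps ahead counts for
# roughly 73% of an immediate reward in the Q-value estimate.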
# MAPPING ACTIONS
mapping = {
    0: [0, 0, 0, 0, 0, 0],   # NOOP
    1: [1, 0, 0, 0, 0, 0],   # Up
    2: [0, 0, 1, 0, 0, 0],   # Down
    3: [0, 1, 0, 0, 0, 0],   # Left
    4: [0, 1, 0, 0, 1, 0],   # Left + A
    5: [0, 1, 0, 0, 0, 1],   # Left + B
    6: [0, 1, 0, 0, 1, 1],   # Left + A + B
    7: [0, 0, 0, 1, 0, 0],   # Right
    8: [0, 0, 0, 1, 1, 0],   # Right + A
    9: [0, 0, 0, 1, 0, 1],   # Right + B
    10: [0, 0, 0, 1, 1, 1],  # Right + A + B
    11: [0, 0, 0, 0, 1, 0],  # A
    12: [0, 0, 0, 0, 0, 1],  # B
    13: [0, 0, 0, 0, 1, 1],  # A + B
}
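# Illustrative example of how the mapping is used: an action index of 8 selects
# mapping[8] == [0, 0, 0, 1, 1, 0], i.e. hold Right + A, which is the 6-button
# array format this code passes to env.step().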
# FIRST STEP: Knowing what each action does (Observing)
observation = env.reset()  # Game begins
obs = np.expand_dims(observation, axis=0)  # (Formatting) Make the observation the first element of a batch of inputs
state = np.stack((obs, obs), axis=1)  # Network input is two consecutive frames stacked along axis 1
done = False
for t in range(observetime):
    if np.random.rand() <= epsilon:
        action = np.random.randint(0, action_space)  # Explore: random action index
    else:
        Q = model.predict(state)  # Q-value predictions
        action = np.argmax(Q)  # Exploit: action index with the highest Q-value
    observation_new, reward, done, info = env.step(mapping[action])  # Convert the index to a button array and act
    obs_new = np.expand_dims(observation_new, axis=0)  # (Formatting)
    state_new = np.append(np.expand_dims(obs_new, axis=0), state[:, :1, :], axis=1)  # Update the input with the new game frame
    D.append((state, action, reward, state_new, done))  # 'Remember' the action index and its consequence
    state = state_new  # Update state
    if done:
        observation = env.reset()  # Restart game if it's finished
        obs = np.expand_dims(observation, axis=0)  # (Formatting)
        state = np.stack((obs, obs), axis=1)
print('Observing Finished')
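# Each entry in the replay memory D is a transition tuple of the form
# (state, action index, reward, next state, done). A rough way to inspect one
# (illustrative only, not needed for training):
# s, a, r, s2, d = D[0]
# print(s.shape, a, r, d)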
# SECOND STEP: Learning from the observations (Experience replay)
minibatch = random.sample(D, mb_size)  # Sample some stored transitions
inputs_shape = (mb_size,) + state.shape[1:]
inputs = np.zeros(inputs_shape)
targets = np.zeros((mb_size, action_space))
for i in range(mb_size):
    state, action, reward, state_new, done = minibatch[i]

    # Build Bellman equation for the Q function
    inputs[i:i+1] = state  # state already carries a leading batch dimension of 1
    targets[i] = model.predict(state)
    Q_sa = model.predict(state_new)
    if done:
        targets[i, action] = reward  # Terminal transition: no future reward to bootstrap from
    else:
        targets[i, action] = reward + gamma * np.max(Q_sa)
    print('Run {} complete'.format(i))  # Per-transition progress

# Train network to output the Q function on the whole minibatch of Bellman targets
model.train_on_batch(inputs, targets)
print('Learning Finished')
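# Optionally persist the trained network so the play step can be rerun without
# re-training; the filename here is just an illustrative placeholder.
# model.save('mario_dqn.h5')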
# THIRD STEP: Play!
observation = env.reset()
obs = np.expand_dims(observation, axis=0)
state = np.stack((obs, obs), axis=1)
done = False
tot_reward = 0.0
while not done:
    env.render()  # Render the game (comment out to run without a display)
    Q = model.predict(state)
    action = np.argmax(Q)  # Greedy action index
    observation, reward, done, info = env.step(mapping[action])  # Convert the index to a button array and act
    obs = np.expand_dims(observation, axis=0)
    state = np.append(np.expand_dims(obs, axis=0), state[:, :1, :], axis=1)
    tot_reward += reward
    print(reward)
print('Game ended! Total reward: {}'.format(tot_reward))
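# Optionally release the emulator's resources once the episode is over.
# env.close()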