import numpy as np


class ExperienceReplay(object):
    """
    During gameplay all the experiences <s, a, r, s'> are stored in a replay memory.
    During training, batches of randomly drawn experiences are used to generate
    the inputs and targets for the model.
    """
    def __init__(self, max_memory=100, discount=.9):
        """
        Setup
        max_memory: the maximum number of experiences we want to store
        memory: a list of experiences
        discount: the discount factor for future rewards

        The memory stores, alongside each experience, whether the game ended
        at that state, as a separate flag in a nested list:
        [...
         [experience, game_over]
         [experience, game_over]
         ...]
        """
        self.max_memory = max_memory
        self.memory = list()
        self.discount = discount
    def remember(self, states, game_over):
        # Save an experience and its game_over flag to memory
        self.memory.append([states, game_over])
        # We don't want to store infinite memories, so if we have too many,
        # we delete the oldest one
        if len(self.memory) > self.max_memory:
            del self.memory[0]
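    # For illustration: each entry appended by remember() and later unpacked in
    # get_batch() has the form
    #   [[state_t, action_t, reward_t, state_tp1], game_over]
    # where state_t and state_tp1 are arrays of shape (1, env_dim).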
    def get_batch(self, model, batch_size=10):
        # How many experiences do we have?
        len_memory = len(self.memory)
        # The number of actions that can possibly be taken in the game
        num_actions = model.output_shape[-1]
        # Dimensions of the game field
        env_dim = self.memory[0][0][0].shape[1]
        # We want to return an input and target vector with inputs from an observed state...
        inputs = np.zeros((min(len_memory, batch_size), env_dim))
        # ...and the target r + gamma * max Q(s', a').
        # Note that the target is a matrix, with an entry not only for the action
        # taken but also for the other possible actions. The actions not taken keep
        # the model's own predictions as targets so that training does not affect them.
        targets = np.zeros((inputs.shape[0], num_actions))
        # We draw the states to learn from randomly
        for i, idx in enumerate(np.random.randint(0, len_memory,
                                                  size=inputs.shape[0])):
            """
            Here we load one transition <s, a, r, s'> from memory:
            state_t: initial state s
            action_t: action taken a
            reward_t: reward earned r
            state_tp1: the state that followed, s'
            """
            state_t, action_t, reward_t, state_tp1 = self.memory[idx][0]
            # We also need to know whether the game ended at this state
            game_over = self.memory[idx][1]

            # Add the state s to the inputs
            inputs[i:i+1] = state_t
            # First we fill the target values with the predictions of the model.
            # They will not be affected by training (since the training loss for them is 0).
            targets[i] = model.predict(state_t)[0]

            """
            If the game ended, the expected reward Q(s, a) should be the final reward r.
            Otherwise the target value is r + gamma * max Q(s', a').
            """
            # Here Q_sa is max_a' Q(s', a')
            Q_sa = np.max(model.predict(state_tp1)[0])

            # If the game ended, the reward is the final reward
            if game_over:
                targets[i, action_t] = reward_t
            else:
                # r + gamma * max Q(s', a')
                targets[i, action_t] = reward_t + self.discount * Q_sa
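                # Worked example with hypothetical numbers: if reward_t = 0.0,
                # discount = 0.9 and max_a' Q(s', a') = 0.5, the target for the
                # chosen action becomes 0.0 + 0.9 * 0.5 = 0.45.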
        return inputs, targets
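For context, here is a minimal sketch of how this class is typically wired into a deep Q-learning loop. The environment interface (env.reset, env.act), the exploration rate epsilon, num_actions, and epochs are hypothetical placeholders, not part of the gist; the model is assumed to be a Keras model, as implied by the use of model.output_shape and model.predict above.

# Minimal training-loop sketch (hypothetical env and hyperparameters)
exp_replay = ExperienceReplay(max_memory=500, discount=0.9)

for epoch in range(epochs):
    state_t = env.reset()  # hypothetical env returning an array of shape (1, env_dim)
    game_over = False
    while not game_over:
        state_tm1 = state_t
        # Epsilon-greedy action selection
        if np.random.rand() <= epsilon:
            action = np.random.randint(0, num_actions)
        else:
            q = model.predict(state_tm1)[0]
            action = np.argmax(q)
        # Apply the action; the hypothetical env returns the next state,
        # the reward and whether the game ended
        state_t, reward, game_over = env.act(action)

        # Store the experience <s, a, r, s'> and train on a random mini-batch
        exp_replay.remember([state_tm1, action, reward, state_t], game_over)
        inputs, targets = exp_replay.get_batch(model, batch_size=32)
        model.train_on_batch(inputs, targets)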