import gym
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from collections import deque

# Create the Cart-Pole game environment
env = gym.make('CartPole-v0')


class QNetwork:
    def __init__(self, learning_rate=0.01, state_size=4,
                 action_size=2, hidden_size=10):
        # state inputs to the Q-network
        self.model = Sequential()
        self.model.add(Dense(hidden_size, activation='relu',
                             input_dim=state_size))
        self.model.add(Dense(hidden_size, activation='relu'))
        self.model.add(Dense(action_size, activation='linear'))
        self.optimizer = Adam(lr=learning_rate)
        self.model.compile(loss='mse', optimizer=self.optimizer)


class Memory():
    def __init__(self, max_size=1000):
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        idx = np.random.choice(np.arange(len(self.buffer)),
                               size=batch_size,
                               replace=False)
        return [self.buffer[ii] for ii in idx]


train_episodes = 1000         # max number of episodes to learn from
max_steps = 200               # max steps in an episode
gamma = 0.99                  # future reward discount

# Exploration parameters
explore_start = 1.0           # exploration probability at start
explore_stop = 0.01           # minimum exploration probability
decay_rate = 0.0001           # exponential decay rate for exploration prob

# Network parameters
hidden_size = 16              # number of units in each Q-network hidden layer
learning_rate = 0.001         # Q-network learning rate

# Memory parameters
memory_size = 10000           # memory capacity
batch_size = 32               # experience mini-batch size
pretrain_length = batch_size  # number of experiences to pretrain the memory

mainQN = QNetwork(hidden_size=hidden_size, learning_rate=learning_rate)

###################################
## Populate the experience memory
###################################
# Initialize the simulation
env.reset()
# Take one random step to get the pole and cart moving
state, reward, done, _ = env.step(env.action_space.sample())
state = np.reshape(state, [1, 4])

memory = Memory(max_size=memory_size)

# Make a bunch of random actions and store the experiences
for ii in range(pretrain_length):
    # Uncomment the line below to watch the simulation
    # env.render()

    # Make a random action
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)
    next_state = np.reshape(next_state, [1, 4])

    if done:
        # The simulation fails, so there is no next state
        next_state = np.zeros(state.shape)
        # Add experience to memory
        memory.add((state, action, reward, next_state))

        # Start new episode
        env.reset()
        # Take one random step to get the pole and cart moving
        state, reward, done, _ = env.step(env.action_space.sample())
        state = np.reshape(state, [1, 4])
    else:
        # Add experience to memory
        memory.add((state, action, reward, next_state))
        state = next_state

#############
## Training
#############
step = 0
for ep in range(1, train_episodes):
    total_reward = 0
    t = 0
    while t < max_steps:
        step += 1
        # Uncomment this next line to watch the training
        # env.render()

        # Explore or exploit
        explore_p = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * step)
        if explore_p > np.random.rand():
            # Make a random action
            action = env.action_space.sample()
        else:
            # Get action from Q-network
            Qs = mainQN.model.predict(state)[0]
            action = np.argmax(Qs)

        # Take action, get new state and reward
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, 4])
        total_reward += reward

        if done:
            # The episode ends, so there is no next state
            next_state = np.zeros(state.shape)
            t = max_steps

            print('Episode: {}'.format(ep),
                  'Total reward: {}'.format(total_reward),
                  'Explore P: {:.4f}'.format(explore_p))

            # Add experience to memory
            memory.add((state, action, reward, next_state))

            # Start new episode
            env.reset()
            # Take one random step to get the pole and cart moving
            state, reward, done, _ = env.step(env.action_space.sample())
            state = np.reshape(state, [1, 4])
        else:
            # Add experience to memory
            memory.add((state, action, reward, next_state))
            state = next_state
            t += 1

        # Replay
        inputs = np.zeros((batch_size, 4))
        targets = np.zeros((batch_size, 2))

        minibatch = memory.sample(batch_size)
        for i, (state_b, action_b, reward_b, next_state_b) in enumerate(minibatch):
            inputs[i:i+1] = state_b
            target = reward_b

            if not (next_state_b == np.zeros(state_b.shape)).all():
                # Non-terminal transition: bootstrap from the next state's Q-values
                target_Q = mainQN.model.predict(next_state_b)[0]
                target = reward_b + gamma * np.amax(target_Q)

            targets[i] = mainQN.model.predict(state_b)
            targets[i][action_b] = target

        mainQN.model.fit(inputs, targets, epochs=1, verbose=0)
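For reference, the replay block at the end of the listing implements the standard one-step Q-learning target. For each sampled transition (s, a, r, s'), with terminal next states stored as the all-zero vector:

y = \begin{cases} r & \text{if } s' \text{ is terminal} \\ r + \gamma \max_{a'} Q(s', a') & \text{otherwise} \end{cases}

The fit call then minimizes the MSE against a target vector in which only the entry for the action actually taken is replaced by y; the other action keeps the network's own current prediction, which is exactly what the targets[i] = mainQN.model.predict(state_b) / targets[i][action_b] = target pair does.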
Episode: 169 Total reward: 199.0 Explore P: 0.5193
Episode: 170 Total reward: 55.0 Explore P: 0.5165
Episode: 171 Total reward: 138.0 Explore P: 0.5095
Episode: 172 Total reward: 174.0 Explore P: 0.5009
Episode: 173 Total reward: 199.0 Explore P: 0.4913
Episode: 174 Total reward: 31.0 Explore P: 0.4898
Episode: 175 Total reward: 199.0 Explore P: 0.4803
Episode: 176 Total reward: 50.0 Explore P: 0.4780
Episode: 177 Total reward: 124.0 Explore P: 0.4722
Episode: 178 Total reward: 187.0 Explore P: 0.4636
Episode: 246 Total reward: 199.0 Explore P: 0.2822
Episode: 247 Total reward: 199.0 Explore P: 0.2769
Episode: 248 Total reward: 199.0 Explore P: 0.2716
Episode: 249 Total reward: 199.0 Explore P: 0.2665
Episode: 250 Total reward: 199.0 Explore P: 0.2614
Episode: 251 Total reward: 199.0 Explore P: 0.2565
Episode: 252 Total reward: 199.0 Explore P: 0.2516
Episode: 253 Total reward: 199.0 Explore P: 0.2468
Could you tell me whether it works robustly or only converges sometimes, and what hyperparameters you used?
I tried to execute this code and it never learns anything; I don't know whether the problem is the code itself or my Keras version (see the version-check sketch after the log below).
('Episode: 580', 'Total reward: 12.0', 'Explore P: 0.4601')
('Episode: 581', 'Total reward: 13.0', 'Explore P: 0.4595')
('Episode: 582', 'Total reward: 10.0', 'Explore P: 0.4591')
('Episode: 583', 'Total reward: 8.0', 'Explore P: 0.4587')
('Episode: 584', 'Total reward: 10.0', 'Explore P: 0.4583')
('Episode: 585', 'Total reward: 8.0', 'Explore P: 0.4579')
('Episode: 586', 'Total reward: 9.0', 'Explore P: 0.4575')
('Episode: 587', 'Total reward: 15.0', 'Explore P: 0.4568')
('Episode: 588', 'Total reward: 9.0', 'Explore P: 0.4564')
('Episode: 589', 'Total reward: 9.0', 'Explore P: 0.4560')
('Episode: 590', 'Total reward: 8.0', 'Explore P: 0.4557')
('Episode: 591', 'Total reward: 13.0', 'Explore P: 0.4551')
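One low-risk first step for the version question above is to rule out an environment mismatch before debugging the algorithm: the tuple-style lines in this log are what Python 2 prints for a multi-argument print(...) call, whereas the output further up is space-separated (Python 3 style) and the listing uses the Keras 2 fit(..., epochs=...) argument. A minimal check, as a sketch only (the backend line assumes standard Keras and is not stated anywhere in the gist):

# Sketch: print the interpreter and library versions before debugging further.
import sys
import numpy as np
import gym
import keras

print('Python :', sys.version.split()[0])
print('numpy  :', np.__version__)
print('gym    :', gym.__version__)
print('keras  :', keras.__version__)
print('backend:', keras.backend.backend())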
Can you please tell me why you aren't using a separate target network, as mentioned in the DQN paper? (See the sketch after the log below.)
Episode: 22 Total reward: 48.0 Explore P: 0.9598
Episode: 23 Total reward: 28.0 Explore P: 0.9571
Episode: 24 Total reward: 20.0 Explore P: 0.9552
Episode: 25 Total reward: 16.0 Explore P: 0.9537
Episode: 26 Total reward: 14.0 Explore P: 0.9524
Episode: 27 Total reward: 13.0 Explore P: 0.9512
Episode: 28 Total reward: 15.0 Explore P: 0.9497
Episode: 29 Total reward: 16.0 Explore P: 0.9482
Episode: 30 Total reward: 11.0 Explore P: 0.9472
Episode: 31 Total reward: 21.0 Explore P: 0.9452
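Regarding the target-network question above: below is a minimal sketch (not part of the original gist) of what that change could look like on top of the QNetwork class and training loop defined earlier. The names targetQN and update_target_every are illustrative; the idea is simply to bootstrap the TD target from a periodically synced copy of the online network, using Keras get_weights()/set_weights().

# Sketch only -- adds the separate target network from the DQN paper (Mnih et al., 2015)
# on top of the gist's QNetwork class. targetQN and update_target_every are illustrative names.

targetQN = QNetwork(hidden_size=hidden_size, learning_rate=learning_rate)
targetQN.model.set_weights(mainQN.model.get_weights())  # start both networks identical

update_target_every = 1000  # how often (in environment steps) to sync the target network

# ...inside the training loop, right after `step += 1`:
if step % update_target_every == 0:
    targetQN.model.set_weights(mainQN.model.get_weights())

# ...and in the replay loop, bootstrap from the frozen target network
# instead of mainQN when computing the TD target:
#     target_Q = targetQN.model.predict(next_state_b)[0]
#     target = reward_b + gamma * np.amax(target_Q)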