3. Code inside "A formal introduction to Deep Reinforcement Learning": pseudocode of the Deep Q-Learning (DQN) training loop.
state = get_first_state()
while not done:
    # epsilon-greedy policy
    if random() < epsilon: action = random_action()  # random() returns a number in [0, 1]
    else: action = argmax(nn.predict(state))
    next_state, reward, done = step(action)  # perform the action
    replay_memory.put(state, action, reward, done, next_state)  # save the transition in the replay memory
    if len(replay_memory) >= batch_size:  # train only once there are enough transitions
        # sample a mini-batch from the replay memory
        for state_exp, action_exp, reward_exp, done_exp, next_state_exp in replay_memory.get_random(batch_size):
            q_values = nn.predict(state_exp)  # current Q-values
            q_values_target = copy(q_values)
            # compute the target Q-value with the target network
            if done_exp: q_values_target[action_exp] = reward_exp
            else: q_values_target[action_exp] = reward_exp + discount_factor * max(target_nn.predict(next_state_exp))
            nn.train(state_exp, q_values_target, learning_rate, momentum)  # optimization step
    steps += 1
    if steps % steps_to_sync_target_nn == 0:
        self._sync_target_nn_weights()  # sync the target nn with the main nn
    epsilon *= epsilon_decay  # epsilon-decay schedule
    if epsilon < min_epsilon: epsilon = min_epsilon
    state = next_state  # set the new current state
self._sync_target_nn_weights()  # sync the target nn with the main nn at the end as well
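Below is a minimal, self-contained sketch of the same loop in plain Python/NumPy, so it can actually be run. It is not the article's implementation: ToyEnv, LinearQNet and the hyperparameter values are hypothetical stand-ins (a tiny chain environment and a linear Q-function instead of a neural network), kept only to illustrate epsilon-greedy exploration, the replay memory, the Bellman target computed with the frozen target network, and the periodic weight sync.

    # Minimal DQN-style sketch under the assumptions stated above.
    import random
    from collections import deque
    import numpy as np

    class LinearQNet:
        """Q(s, .) = W @ s, trained with a plain gradient step on the squared error."""
        def __init__(self, n_states, n_actions, lr=0.05):
            self.W = np.zeros((n_actions, n_states))
            self.lr = lr
        def predict(self, state):
            return self.W @ state  # vector of Q-values, one per action
        def train(self, state, q_target):
            error = self.predict(state) - q_target       # per-action TD error
            self.W -= self.lr * np.outer(error, state)   # gradient of 0.5 * error^2
        def copy_weights_from(self, other):
            self.W = other.W.copy()

    class ToyEnv:
        """5-state chain: action 1 moves right, action 0 moves left; reward 1 at the end."""
        def __init__(self):
            self.n_states, self.n_actions = 5, 2
        def reset(self):
            self.pos = 0
            return self._obs()
        def step(self, action):
            self.pos = max(0, min(self.n_states - 1, self.pos + (1 if action == 1 else -1)))
            done = self.pos == self.n_states - 1
            return self._obs(), (1.0 if done else 0.0), done
        def _obs(self):
            return np.eye(self.n_states)[self.pos]  # one-hot state vector

    env = ToyEnv()
    nn = LinearQNet(env.n_states, env.n_actions)
    target_nn = LinearQNet(env.n_states, env.n_actions)
    target_nn.copy_weights_from(nn)

    replay_memory = deque(maxlen=1000)
    epsilon, epsilon_decay, min_epsilon = 1.0, 0.99, 0.05
    discount_factor, batch_size, steps_to_sync_target_nn = 0.9, 16, 50
    steps = 0

    for episode in range(200):
        state, done = env.reset(), False
        while not done:
            # epsilon-greedy policy
            if random.random() < epsilon:
                action = random.randrange(env.n_actions)
            else:
                action = int(np.argmax(nn.predict(state)))
            next_state, reward, done = env.step(action)
            replay_memory.append((state, action, reward, done, next_state))

            if len(replay_memory) >= batch_size:
                for s, a, r, d, s2 in random.sample(list(replay_memory), batch_size):
                    q_target = nn.predict(s).copy()
                    # Bellman target computed with the frozen target network
                    q_target[a] = r if d else r + discount_factor * np.max(target_nn.predict(s2))
                    nn.train(s, q_target)

            steps += 1
            if steps % steps_to_sync_target_nn == 0:
                target_nn.copy_weights_from(nn)  # sync target nn with the main nn
            epsilon = max(min_epsilon, epsilon * epsilon_decay)
            state = next_state

    target_nn.copy_weights_from(nn)  # final sync, as in the pseudocode
    print(np.argmax(nn.W, axis=0))   # learned greedy action per state (expect 1, "move right", for non-terminal states)

Running the script prints the greedy action chosen in each of the five chain states; on this toy problem the non-terminal states should converge to action 1 (move right), which is the optimal policy.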