NMZivkovic · July 20, 2019 10:09
diff --git a/Agent.py b/Agent.py
 class Agent(object):
    """Deep Q-learning agent with an online Q-network and a target network.

    Transitions are kept in a bounded replay buffer. ``retrain`` samples
    transitions from that buffer and fits the online network towards targets
    bootstrapped from the target network.
    """

    def __init__(self, enviroment, optimizer, image_shape,
                 gamma=0.6, epsilon=0.1, epsilon_min=0.0, epsilon_decay=0.0):
        """Set up hyper-parameters, the replay buffer and both networks.

        Args:
            enviroment: Environment exposing ``action_space.n`` and
                ``action_space.sample()``.
            optimizer: Optimizer passed to ``model.compile`` for both networks.
            image_shape: Input shape handed to the first convolutional layer.
            gamma: Discount factor applied to the bootstrapped next-state value.
            epsilon: Probability of taking a random action in ``act``.
            epsilon_min: Lower bound enforced by ``_update_epsilon``.
            epsilon_decay: Amount subtracted from ``epsilon`` on every
                ``_update_epsilon`` call. The default of 0.0 keeps the
                exploration rate constant.
        """
        # Initialize attributes
        self._action_size = enviroment.action_space.n
        self._optimizer = optimizer
        self._image_shape = image_shape
        self.enviroment = enviroment

        # Bounded buffer: the oldest transitions are dropped once it is full.
        self.expirience_replay = deque(maxlen=100000)

        # Initialize discount and exploration rate
        self.gamma = gamma
        self.epsilon = epsilon
        # These two must exist before _update_epsilon is called; leaving them
        # unset made that method fail with AttributeError.
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

        # Build networks; the target network starts as a copy of the Q-network.
        self.q_network = self._build_compile_model()
        self.target_network = self._build_compile_model()
        self.alighn_target_model()

    @staticmethod
    def _as_batch(frame):
        """Return ``frame`` as a float64 array with a leading batch axis of 1."""
        return np.expand_dims(np.asarray(frame).astype(np.float64), axis=0)

    def store(self, state, action, reward, next_state, terminated):
        """Append one transition tuple to the replay buffer."""
        self.expirience_replay.append(
            (state, action, reward, next_state, terminated))

    def _update_epsilon(self):
        """Decrease ``epsilon`` by ``epsilon_decay``, not below ``epsilon_min``."""
        self.epsilon -= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)

    def _build_compile_model(self):
        """Build and compile the convolutional Q-value network.

        Returns:
            The compiled ``Sequential`` model; its last layer has
            ``self._action_size`` units and no activation argument.
        """
        model = Sequential()
        # Only the first layer of a Sequential model needs the input shape;
        # later layers take their input from the previous layer's output.
        model.add(Conv2D(32, 8, strides=(4, 4), padding="valid",
                         activation="relu", input_shape=self._image_shape))
        model.add(Conv2D(64, 4, strides=(2, 2), padding="valid",
                         activation="relu"))
        model.add(Conv2D(64, 3, strides=(1, 1), padding="valid",
                         activation="relu"))
        model.add(Flatten())
        model.add(Dense(512, activation="relu"))
        model.add(Dense(self._action_size))
        # NOTE(review): "accuracy" is kept for compatibility with existing
        # callers, but it is unlikely to be informative for Q-value targets.
        model.compile(loss=Huber(),
                      optimizer=self._optimizer,
                      metrics=["accuracy"])
        return model

    def alighn_target_model(self):
        """Copy the Q-network weights into the target network."""
        self.target_network.set_weights(self.q_network.get_weights())

    def act(self, frame):
        """Choose an action for ``frame`` using an epsilon-greedy policy.

        Args:
            frame: A single observation without the batch axis.

        Returns:
            A random action sampled from the environment's action space with
            probability ``epsilon``, otherwise the index of the largest
            Q-value predicted by the Q-network.
        """
        if np.random.rand() <= self.epsilon:
            return self.enviroment.action_space.sample()

        q_values = self.q_network.predict(self._as_batch(frame))
        return np.argmax(q_values[0])

    def retrain(self, batch_size):
        """Fit the Q-network on a random minibatch of stored transitions.

        Does nothing while fewer than ``batch_size`` transitions are stored,
        because ``random.sample`` cannot draw more items than the buffer holds.

        Args:
            batch_size: Number of transitions to sample from the replay buffer.
        """
        if len(self.expirience_replay) < batch_size:
            return

        minibatch = random.sample(self.expirience_replay, batch_size)

        for state, action, reward, next_state, terminated in minibatch:
            state = self._as_batch(state)

            # Start from the current predictions so that only the entry of the
            # action actually taken contributes to the loss.
            target = self.q_network.predict(state)

            if terminated:
                # No future reward after a terminal transition.
                target[0][action] = reward
            else:
                next_q = self.target_network.predict(self._as_batch(next_state))
                target[0][action] = reward + self.gamma * np.amax(next_q)

            self.q_network.fit(state, target, epochs=1, verbose=0)
	class Agent(object):
	    """Deep Q-learning agent with an online Q-network and a target network.

	    Transitions are kept in a bounded replay buffer. ``retrain`` samples
	    transitions from that buffer and fits the online network towards targets
	    bootstrapped from the target network.
	    """

	    def __init__(self, enviroment, optimizer, image_shape,
	                 gamma=0.6, epsilon=0.1, epsilon_min=0.0, epsilon_decay=0.0):
	        """Set up hyper-parameters, the replay buffer and both networks.

	        Args:
	            enviroment: Environment exposing ``action_space.n`` and
	                ``action_space.sample()``.
	            optimizer: Optimizer passed to ``model.compile`` for both networks.
	            image_shape: Input shape handed to the first convolutional layer.
	            gamma: Discount factor applied to the bootstrapped next-state value.
	            epsilon: Probability of taking a random action in ``act``.
	            epsilon_min: Lower bound enforced by ``_update_epsilon``.
	            epsilon_decay: Amount subtracted from ``epsilon`` on every
	                ``_update_epsilon`` call. The default of 0.0 keeps the
	                exploration rate constant.
	        """
	        # Initialize attributes
	        self._action_size = enviroment.action_space.n
	        self._optimizer = optimizer
	        self._image_shape = image_shape
	        self.enviroment = enviroment

	        # Bounded buffer: the oldest transitions are dropped once it is full.
	        self.expirience_replay = deque(maxlen=100000)

	        # Initialize discount and exploration rate
	        self.gamma = gamma
	        self.epsilon = epsilon
	        # These two must exist before _update_epsilon is called; leaving them
	        # unset made that method fail with AttributeError.
	        self.epsilon_min = epsilon_min
	        self.epsilon_decay = epsilon_decay

	        # Build networks; the target network starts as a copy of the Q-network.
	        self.q_network = self._build_compile_model()
	        self.target_network = self._build_compile_model()
	        self.alighn_target_model()

	    @staticmethod
	    def _as_batch(frame):
	        """Return ``frame`` as a float64 array with a leading batch axis of 1."""
	        return np.expand_dims(np.asarray(frame).astype(np.float64), axis=0)

	    def store(self, state, action, reward, next_state, terminated):
	        """Append one transition tuple to the replay buffer."""
	        self.expirience_replay.append(
	            (state, action, reward, next_state, terminated))

	    def _update_epsilon(self):
	        """Decrease ``epsilon`` by ``epsilon_decay``, not below ``epsilon_min``."""
	        self.epsilon -= self.epsilon_decay
	        self.epsilon = max(self.epsilon_min, self.epsilon)

	    def _build_compile_model(self):
	        """Build and compile the convolutional Q-value network.

	        Returns:
	            The compiled ``Sequential`` model; its last layer has
	            ``self._action_size`` units and no activation argument.
	        """
	        model = Sequential()
	        # Only the first layer of a Sequential model needs the input shape;
	        # later layers take their input from the previous layer's output.
	        model.add(Conv2D(32, 8, strides=(4, 4), padding="valid",
	                         activation="relu", input_shape=self._image_shape))
	        model.add(Conv2D(64, 4, strides=(2, 2), padding="valid",
	                         activation="relu"))
	        model.add(Conv2D(64, 3, strides=(1, 1), padding="valid",
	                         activation="relu"))
	        model.add(Flatten())
	        model.add(Dense(512, activation="relu"))
	        model.add(Dense(self._action_size))
	        # NOTE(review): "accuracy" is kept for compatibility with existing
	        # callers, but it is unlikely to be informative for Q-value targets.
	        model.compile(loss=Huber(),
	                      optimizer=self._optimizer,
	                      metrics=["accuracy"])
	        return model

	    def alighn_target_model(self):
	        """Copy the Q-network weights into the target network."""
	        self.target_network.set_weights(self.q_network.get_weights())

	    def act(self, frame):
	        """Choose an action for ``frame`` using an epsilon-greedy policy.

	        Args:
	            frame: A single observation without the batch axis.

	        Returns:
	            A random action sampled from the environment's action space with
	            probability ``epsilon``, otherwise the index of the largest
	            Q-value predicted by the Q-network.
	        """
	        if np.random.rand() <= self.epsilon:
	            return self.enviroment.action_space.sample()

	        q_values = self.q_network.predict(self._as_batch(frame))
	        return np.argmax(q_values[0])

	    def retrain(self, batch_size):
	        """Fit the Q-network on a random minibatch of stored transitions.

	        Does nothing while fewer than ``batch_size`` transitions are stored,
	        because ``random.sample`` cannot draw more items than the buffer holds.

	        Args:
	            batch_size: Number of transitions to sample from the replay buffer.
	        """
	        if len(self.expirience_replay) < batch_size:
	            return

	        minibatch = random.sample(self.expirience_replay, batch_size)

	        for state, action, reward, next_state, terminated in minibatch:
	            state = self._as_batch(state)

	            # Start from the current predictions so that only the entry of the
	            # action actually taken contributes to the loss.
	            target = self.q_network.predict(state)

	            if terminated:
	                # No future reward after a terminal transition.
	                target[0][action] = reward
	            else:
	                next_q = self.target_network.predict(self._as_batch(next_state))
	                target[0][action] = reward + self.gamma * np.amax(next_q)

	            self.q_network.fit(state, target, epochs=1, verbose=0)