qgallouedec · April 26, 2020 12:35
diff --git a/value_iteration_loop.py b/value_iteration_loop.py
 gamma = 0.8  # discount factor
 # Initialize V to zero for every state, stored as a column vector.
 new_V = np.zeros((nb_states, 1))
 # Repeat Bellman optimality backups until the value function stops changing.
 # NOTE(review): assumes P is indexed (s, a, s') and R_SA is (s, a), as the
 # update formula below suggests - confirm against where they are built.
 while True:
     old_V = new_V
     # V(s) <- \max_{a}( R(s, a) + gamma * \sum_{s'} P(s, a, s')*V(s') )
     # np.dot(P, old_V) always ends with a length-1 axis because old_V is a
     # column vector. Squeeze only that axis: a bare squeeze would also drop a
     # singleton state or action axis and break broadcasting against R_SA.
     new_V = np.max(
         R_SA + gamma * np.squeeze(np.dot(P, old_V), axis=-1),
         axis=1,
         keepdims=True,
     )
     # Stop once the largest absolute change is small; we then take new_V as
     # V_*. The absolute value matters: a signed difference is <= 0 whenever
     # values decrease (e.g. negative rewards), which would end the loop after
     # a single sweep.
     if np.max(np.abs(new_V - old_V)) < 0.01:
         V = new_V
         break
	gamma = 0.8  # discount factor
	# Initialize V to zero for every state, stored as a column vector.
	new_V = np.zeros((nb_states, 1))
	# Repeat Bellman optimality backups until the value function stops changing.
	# NOTE(review): assumes P is indexed (s, a, s') and R_SA is (s, a), as the
	# update formula below suggests - confirm against where they are built.
	while True:
		old_V = new_V
		# V(s) <- \max_{a}( R(s, a) + gamma * \sum_{s'} P(s, a, s')*V(s') )
		# np.dot(P, old_V) always ends with a length-1 axis because old_V is a
		# column vector. Squeeze only that axis: a bare squeeze would also drop
		# a singleton state or action axis and break broadcasting against R_SA.
		new_V = np.max(
			R_SA + gamma * np.squeeze(np.dot(P, old_V), axis=-1),
			axis=1,
			keepdims=True,
		)
		# Stop once the largest absolute change is small; we then take new_V
		# as V_*. The absolute value matters: a signed difference is <= 0
		# whenever values decrease (e.g. negative rewards), which would end
		# the loop after a single sweep.
		if np.max(np.abs(new_V - old_V)) < 0.01:
			V = new_V
			break
No results found