Policy optimization on FrozenLake using policy iteration and value iteration.
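Both files below work on the tabular FrozenLake MDP: the first alternates policy evaluation with greedy improvement (policy iteration), while the second folds the maximization over actions directly into the value sweep (value iteration). As a quick reference in standard textbook notation (the symbols below are not defined anywhere in the code), the two backups they implement are

V_{k+1}(s) = \sum_{s'} P(s' \mid s, \pi(s)) \, \left[ R(s, \pi(s), s') + \gamma V_k(s') \right]    (policy evaluation)

V_{k+1}(s) = \max_{a} \sum_{s'} P(s' \mid s, a) \, \left[ R(s, a, s') + \gamma V_k(s') \right]    (value iteration)

where gamma corresponds to discount_factor and each sweep stops once the largest per-state change falls below theta.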
""" | |
Author : Aditya Jain | |
Contact: https://adityajn105.github.io | |
""" | |
import gym | |
import numpy as np | |
#create game | |
game = gym.make('FrozenLake-v0') | |
#get game env object | |
env = game.env | |
policy_to_action = {0:"L",1:"D",2:"R",3:"U"} | |
def policy_iterations(env, theta=1e-10, discount_factor=0.9):
    """
    Args:
        env = the game environment
            env.P[s][a] returns the list of (prob, next_state, reward, done) transition tuples
            env.nS returns the total number of states
            env.nA returns the total number of actions
        theta = stop iterating once the largest value change falls below theta
        discount_factor = gamma value
    Returns: best policy, value function
    """
    def policy_evaluation(policy, V, env=env, theta=theta, discount_factor=discount_factor):
        """Helper function that returns the value function corresponding to a policy"""
        while True:
            temp_V = V.copy()
            biggest_change = 0
            for s in range(env.nS):
                action = int(policy[s])
                v = 0
                for (prob, next_state, reward, _) in env.P[s][action]:
                    v += prob * (reward + discount_factor * V[next_state])
                temp_V[s] = v
                change = abs(v - V[s])
                if biggest_change < change:
                    biggest_change = change
            V = temp_V
            if biggest_change < theta:
                return V
    def policy_improvement(V, env=env, discount_factor=discount_factor):
        """Helper function that greedily extracts the best policy for a given value function"""
        npolicy = np.zeros(env.nS)
        for s in range(env.nS):
            A = dict()
            for a in env.P[s].keys():
                A[a] = 0
                for (prob, next_state, reward, _) in env.P[s][a]:
                    A[a] += prob * (reward + discount_factor * V[next_state])
            best_action = 0
            best_value = float('-inf')
            for a, v in A.items():
                if best_value < v:
                    best_action = a
                    best_value = v
            npolicy[s] = best_action
        return npolicy
    # initialize the policy to always move right (action 2)
    policy = np.zeros(env.nS) + 2
    # initialize the value function
    V = np.zeros(env.nS)
    # policy iteration: alternate evaluation and greedy improvement until the policy is stable
    while True:
        V = policy_evaluation(policy, V)
        npolicy = policy_improvement(V)
        if np.array_equal(npolicy, policy):
            break
        policy = npolicy
    return policy, V

policy, value = policy_iterations(env)
gpolicy = list(map(lambda a: policy_to_action[a], policy))
print("Optimal Policy :\n {} ".format(np.reshape(gpolicy, (4, 4))))
print("Optimal Values :\n {}".format(np.reshape(value, (4, 4))))
""" | |
Lets see our success rate | |
""" | |
games = 1000 | |
won = 0 | |
for _ in range(games): | |
state = game.reset() | |
while True: | |
action = int(policy[state]) | |
(state,reward,is_done,_) = game.step(action) | |
if is_done: | |
if reward>0: | |
won+=1 | |
game.close() | |
break | |
print("Success Rate : {}".format(won/games)) |
""" | |
Author : Aditya Jain | |
Contact: https://adityajn105.github.io | |
""" | |
import gym | |
import numpy as np | |
#create game | |
game = gym.make('FrozenLake-v0') | |
#get game env object | |
env = game.env | |
policy_to_action = {0:"L",1:"D",2:"R",3:"U"} | |
def value_iterations(env, theta=0.00001, discount_factor=0.9):
    """
    Args:
        env = the game environment
            env.P[s][a] returns the list of (prob, next_state, reward, done) transition tuples
            env.nS returns the total number of states
            env.nA returns the total number of actions
        theta = stop iterating once the largest value change falls below theta
        discount_factor = gamma value
    Returns: best policy, value function
    """
    def one_step_lookahead(s, V, env=env, discount_factor=discount_factor):
        """Helper function that returns the best action and its value for a state"""
        A = dict()
        for a in env.P[s].keys():
            A[a] = 0
            for (prob, next_state, reward, is_done) in env.P[s][a]:
                A[a] += prob * (reward + discount_factor * V[next_state])
        best_action = 0
        best_value = float('-inf')
        for a, v in A.items():
            if v > best_value:
                best_value = v
                best_action = a
        return best_action, best_value
    # value iteration: sweep all states with the Bellman optimality backup until convergence
    V = np.zeros(env.nS)
    while True:
        biggest_change = 0
        for s in range(env.nS):
            _, new_v = one_step_lookahead(s, V)
            old_v = V[s]
            V[s] = new_v
            change = abs(old_v - new_v)
            if biggest_change < change:
                biggest_change = change
        if biggest_change < theta:
            break
    # policy extraction: pick the greedy action in every state
    policy = np.zeros(env.nS)
    for s in range(env.nS):
        best_a, _ = one_step_lookahead(s, V)
        policy[s] = best_a
    return policy, V
policy, value = value_iterations(env)
gpolicy = list(map(lambda a: policy_to_action[a], policy))
print("Optimal Policy :\n {} ".format(np.reshape(gpolicy, (4, 4))))
print("Optimal Values :\n {}".format(np.reshape(value, (4, 4))))
""" | |
Lets see our success rate | |
""" | |
games = 1000 | |
won = 0 | |
for _ in range(games): | |
state = game.reset() | |
while True: | |
action = int(policy[state]) | |
(state,reward,is_done,_) = game.step(action) | |
if is_done: | |
if reward>0: | |
won+=1 | |
game.close() | |
break | |
print("Success Rate : {}".format(won/games)) |