Policy Optimization using policy iteration and value iteration.
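Both scripts solve the 4x4 FrozenLake-v0 grid with classic dynamic programming. Policy iteration alternates policy evaluation, which sweeps V(s) <- sum over (prob, s', r) in env.P[s][policy[s]] of prob * (r + gamma * V(s')) until the largest change drops below theta, with greedy policy improvement, which replaces policy[s] by the action that maximizes the same one-step lookahead. Value iteration folds the two steps into the single update V(s) <- max over a of sum of prob * (r + gamma * V(s')), then extracts the greedy policy once at the end.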
"""
Author : Aditya Jain
Contact: https://adityajn105.github.io
"""
import gym
import numpy as np
#create the game
game = gym.make('FrozenLake-v0')
#get the underlying (unwrapped) env object, which exposes P, nS and nA
env = game.env
policy_to_action = {0:"L",1:"D",2:"R",3:"U"}
def policy_iterations(env,theta=1e-10, discount_factor=0.9):
"""
Args:
env = the game env
env.P returns all and their corresponsing action
env.nS returns total no of states
env.nA return total no of actions
theta = stop iteration if change become less than theta
discount_factor = Gamma value
Returns: best_policy, value function
"""
    def policy_evaluation(policy, V, env=env, theta=theta, discount_factor=discount_factor):
        """Helper function that returns new value function corresponding to a policy"""
        while True:
            temp_V = V.copy()
            biggest_change = 0
            for s in range(env.nS):
                action = policy[s]
                v = 0
                for (prob, next_state, reward, _) in env.P[s][action]:
                    v += prob * (reward + discount_factor * V[next_state])
                temp_V[s] = v
                change = abs(v - V[s])
                if biggest_change < change:
                    biggest_change = change
            V = temp_V
            if biggest_change < theta:
                return V
    def policy_improvement(V, env=env, discount_factor=discount_factor):
        """Helper function that returns best policy corresponding to value function using greedy method"""
        npolicy = np.zeros(env.nS)
        for s in range(env.nS):
            A = dict()
            for a in env.P[s].keys():
                A[a] = 0
                for (prob, next_state, reward, _) in env.P[s][a]:
                    A[a] += prob * (reward + V[next_state] * discount_factor)
            best_action = 0
            best_value = float('-inf')
            for a, v in A.items():
                if best_value < v:
                    best_action = a
                    best_value = v
            npolicy[s] = best_action
        return npolicy
    #initializing policy which says always move right
    policy = np.zeros(env.nS) + 2
    #initializing V
    V = np.zeros(env.nS)
    #policy iterations: evaluate the current policy, improve it greedily, repeat until the policy stops changing
    while True:
        V = policy_evaluation(policy, V)
        npolicy = policy_improvement(V)
        change = False
        for _p, p in zip(npolicy, policy):
            if _p != p:
                change = True
                break
        if not change:
            break
        else:
            policy = npolicy
    return policy, V
policy, value = policy_iterations(env)
gpolicy = list(map(lambda a: policy_to_action[a],policy))
print("Optimal Policy :\n {} ".format(np.reshape(gpolicy,(4,4))))
print("Optimal Values :\n {}".format(np.reshape(value,(4,4))))
"""
Lets see our success rate
"""
games = 1000
won = 0
for _ in range(games):
    state = game.reset()
    while True:
        action = int(policy[state])
        (state, reward, is_done, _) = game.step(action)
        if is_done:
            if reward > 0:
                won += 1
            break
game.close()
print("Success Rate : {}".format(won/games))
"""
Author : Aditya Jain
Contact: https://adityajn105.github.io
"""
import gym
import numpy as np
#create the game
game = gym.make('FrozenLake-v0')
#get the underlying (unwrapped) env object, which exposes P, nS and nA
env = game.env
policy_to_action = {0:"L",1:"D",2:"R",3:"U"}
def value_iterations(env, theta = 0.00001, discount_factor = 0.9):
"""
Args:
env = the game env
env.P returns all and their corresponsing action
env.nS returns total no of states
env.nA return total no of actions
theta = stop iteration if change become less than theta
discount_factor = Gamma value
Returns: best_policy, value function
"""
    def one_step_lookahead(s, V, env=env, discount_factor=discount_factor):
        """Helper function to get the best action and its value for a state"""
        A = dict()
        for a in env.P[s].keys():
            A[a] = 0
            for (action_prob, next_state, reward, is_done) in env.P[s][a]:
                A[a] += action_prob * (reward + discount_factor * V[next_state])
        best_action = 0
        best_value = float('-inf')
        for a, v in A.items():
            if v > best_value:
                best_value = v
                best_action = a
        return best_action, best_value
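    # The loop below repeatedly applies the Bellman optimality update
    #   V(s) <- max_a sum of prob * (reward + discount_factor * V(next_state))
    # via one_step_lookahead, stopping once the largest per-state change falls below theta.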
    #value optimization
    V = np.zeros(env.nS)
    while True:
        biggest_change = 0
        for s in range(env.nS):
            _, new_v = one_step_lookahead(s, V)
            old_v = V[s]
            V[s] = new_v
            change = abs(old_v - new_v)
            if biggest_change < change:
                biggest_change = change
        if biggest_change < theta:
            break
    #policy extraction
    policy = np.zeros(env.nS)
    for s in range(env.nS):
        best_a, _ = one_step_lookahead(s, V)
        policy[s] = best_a
    return policy, V
policy, value = value_iterations(env)
gpolicy = list(map(lambda a: policy_to_action[a],policy))
print("Optimal Policy :\n {} ".format(np.reshape(gpolicy,(4,4))))
print("Optimal Values :\n {}".format(np.reshape(value,(4,4))))
"""
Lets see our success rate
"""
games = 1000
won = 0
for _ in range(games):
    state = game.reset()
    while True:
        action = int(policy[state])
        (state, reward, is_done, _) = game.step(action)
        if is_done:
            if reward > 0:
                won += 1
            break
game.close()
print("Success Rate : {}".format(won/games))