Policy optimization on FrozenLake using policy iteration and value iteration.
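Both files below work on the tabular FrozenLake MDP: the first alternates policy evaluation with greedy improvement (policy iteration), while the second folds the maximization over actions directly into the value sweep (value iteration). As a quick reference in standard textbook notation (the symbols below are not defined anywhere in the code), the two backups they implement are

V_{k+1}(s) = \sum_{s'} P(s' \mid s, \pi(s)) \, \left[ R(s, \pi(s), s') + \gamma V_k(s') \right]    (policy evaluation)

V_{k+1}(s) = \max_{a} \sum_{s'} P(s' \mid s, a) \, \left[ R(s, a, s') + \gamma V_k(s') \right]    (value iteration)

where gamma corresponds to discount_factor and each sweep stops once the largest per-state change falls below theta.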
""" | |
Author : Aditya Jain | |
Contact: https://adityajn105.github.io | |
""" | |
import gym | |
import numpy as np | |
#create game | |
game = gym.make('FrozenLake-v0') | |
#get game env object | |
env = game.env | |
policy_to_action = {0:"L",1:"D",2:"R",3:"U"} | |
def policy_iterations(env, theta=1e-10, discount_factor=0.9):
    """
    Args:
        env = the game environment
            env.P[s][a] returns the list of (prob, next_state, reward, done) transition tuples
            env.nS returns the total number of states
            env.nA returns the total number of actions
        theta = stop iterating once the largest value change falls below theta
        discount_factor = gamma value
    Returns: best policy, value function
    """
    def policy_evaluation(policy, V, env=env, theta=theta, discount_factor=discount_factor):
        """Helper function that returns the value function corresponding to a policy"""
        while True:
            temp_V = V.copy()
            biggest_change = 0
            for s in range(env.nS):
                action = int(policy[s])
                v = 0
                for (prob, next_state, reward, _) in env.P[s][action]:
                    v += prob * (reward + discount_factor * V[next_state])
                temp_V[s] = v
                change = abs(v - V[s])
                if biggest_change < change:
                    biggest_change = change
            V = temp_V
            if biggest_change < theta:
                return V
    def policy_improvement(V, env=env, discount_factor=discount_factor):
        """Helper function that greedily extracts the best policy for a given value function"""
        npolicy = np.zeros(env.nS)
        for s in range(env.nS):
            A = dict()
            for a in env.P[s].keys():
                A[a] = 0
                for (prob, next_state, reward, _) in env.P[s][a]:
                    A[a] += prob * (reward + discount_factor * V[next_state])
            best_action = 0
            best_value = float('-inf')
            for a, v in A.items():
                if best_value < v:
                    best_action = a
                    best_value = v
            npolicy[s] = best_action
        return npolicy
    # initialize the policy to always move right (action 2)
    policy = np.zeros(env.nS) + 2
    # initialize the value function
    V = np.zeros(env.nS)
    # policy iteration: alternate evaluation and greedy improvement until the policy is stable
    while True:
        V = policy_evaluation(policy, V)
        npolicy = policy_improvement(V)
        if np.array_equal(npolicy, policy):
            break
        policy = npolicy
    return policy, V

policy, value = policy_iterations(env)
gpolicy = list(map(lambda a: policy_to_action[a], policy))
print("Optimal Policy :\n {} ".format(np.reshape(gpolicy, (4, 4))))
print("Optimal Values :\n {}".format(np.reshape(value, (4, 4))))
""" | |
Lets see our success rate | |
""" | |
games = 1000 | |
won = 0 | |
for _ in range(games): | |
state = game.reset() | |
while True: | |
action = int(policy[state]) | |
(state,reward,is_done,_) = game.step(action) | |
if is_done: | |
if reward>0: | |
won+=1 | |
game.close() | |
break | |
print("Success Rate : {}".format(won/games)) |
""" | |
Author : Aditya Jain | |
Contact: https://adityajn105.github.io | |
""" | |
import gym | |
import numpy as np | |
#create game | |
game = gym.make('FrozenLake-v0') | |
#get game env object | |
env = game.env | |
policy_to_action = {0:"L",1:"D",2:"R",3:"U"} | |
def value_iterations(env, theta=0.00001, discount_factor=0.9):
    """
    Args:
        env = the game environment
            env.P[s][a] returns the list of (prob, next_state, reward, done) transition tuples
            env.nS returns the total number of states
            env.nA returns the total number of actions
        theta = stop iterating once the largest value change falls below theta
        discount_factor = gamma value
    Returns: best policy, value function
    """
    def one_step_lookahead(s, V, env=env, discount_factor=discount_factor):
        """Helper function that returns the best action and its value for a state"""
        A = dict()
        for a in env.P[s].keys():
            A[a] = 0
            for (prob, next_state, reward, is_done) in env.P[s][a]:
                A[a] += prob * (reward + discount_factor * V[next_state])
        best_action = 0
        best_value = float('-inf')
        for a, v in A.items():
            if v > best_value:
                best_value = v
                best_action = a
        return best_action, best_value
    # value iteration: sweep all states with the Bellman optimality backup until convergence
    V = np.zeros(env.nS)
    while True:
        biggest_change = 0
        for s in range(env.nS):
            _, new_v = one_step_lookahead(s, V)
            old_v = V[s]
            V[s] = new_v
            change = abs(old_v - new_v)
            if biggest_change < change:
                biggest_change = change
        if biggest_change < theta:
            break
    # policy extraction: pick the greedy action in every state
    policy = np.zeros(env.nS)
    for s in range(env.nS):
        best_a, _ = one_step_lookahead(s, V)
        policy[s] = best_a
    return policy, V
policy, value = value_iterations(env)
gpolicy = list(map(lambda a: policy_to_action[a], policy))
print("Optimal Policy :\n {} ".format(np.reshape(gpolicy, (4, 4))))
print("Optimal Values :\n {}".format(np.reshape(value, (4, 4))))
""" | |
Lets see our success rate | |
""" | |
games = 1000 | |
won = 0 | |
for _ in range(games): | |
state = game.reset() | |
while True: | |
action = int(policy[state]) | |
(state,reward,is_done,_) = game.step(action) | |
if is_done: | |
if reward>0: | |
won+=1 | |
game.close() | |
break | |
print("Success Rate : {}".format(won/games)) |