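"""
Inverse reinforcement learning (apprenticeship learning) on CartPole-v0.

The loop below alternates between (a) solving a max-margin style QP over
feature-expectation differences to obtain reward weights W and (b) training a
new RL policy under those weights, in the spirit of Abbeel & Ng's projection
method. The RL agent and the expert feature extractor live in the local
modules poleCart_RL and poleCart_manual (not shown in this gist), so the exact
feature definition is assumed here to be a 5-dimensional vector.
"""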
import gym
import numpy as np
import cv2, math
import logging
import os
import scipy
from numpy import linalg as LA
from matplotlib import pyplot as plt
# %matplotlib inline  # IPython/Jupyter magic; uncomment only when running in a notebook
from poleCart_RL import EpisodicAgent  # the RL agent that learns a policy for given reward weights
from poleCart_manual import expertFeatures  # expert (human-generated) feature expectations
from cvxopt import matrix
from cvxopt import solvers
class irlAgent:
    def __init__(self, gymEnv, rlEpisodes, rlMaxSteps):  # constructor
        self.env = gymEnv
        self.episodesRL = rlEpisodes
        self.maxStepsRL = rlMaxSteps
        self.randomPolicy = [6.33159868, 22.18457058, 24.07697606, 64.68426447, 15.92186349]  # feature expectations of a randomly initialized policy
        self.expertPolicy = [1.40327044, 12.06541251, 1.39011785, 15.70455323, 19.99994606]  # expert FE, human demonstration: keep the pole straight
        #self.expertPolicy = [24.90898925, 66.21544503, 28.48223649, 80.00899435, 19.67509372]  # human demonstration: maximize displacement
        #self.expertPolicy = [1.38076217, 3.6306461, 0.79024451, 3.27657669, 20.99918233]  # machine-generated demonstration: keep the pole straight
        self.epsilon = 1.0  # convergence threshold on the margin t
        self.policiesFE = {np.linalg.norm(np.asarray(self.expertPolicy) - np.asarray(self.randomPolicy)): self.randomPolicy}
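        # policiesFE maps a scalar "distance" to the feature-expectation vector of each
        # policy seen so far: initially the Euclidean distance to the random policy,
        # later the margin |W . (mu_expert - mu_policy)| once weights W exist
        # (see policyListUpdater below).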
    def getRLAgentFE(self, W):  # get the feature expectations of a new policy using the RL agent
        agent = EpisodicAgent(self.env.action_space)
        return agent.reinforce(self.env, W, self.episodesRL, self.maxStepsRL)  # train under reward weights W, return feature expectations
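    # Feature expectations mu(pi) are the (discounted) expected sums of state features
    # under a policy pi; here they are assumed to be the 5-vector returned by
    # poleCart_RL's reinforce(), matching the length of expertPolicy above.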
    def policyListUpdater(self, W):  # re-key policiesFE by margin and add a new policy on arrival of new weights W
        for i in list(self.policiesFE.keys()):  # iterate over a copy so the dict can be modified safely
            temp = np.abs(np.dot(W, (np.asarray(self.expertPolicy) - np.asarray(self.policiesFE[i]))))
            if temp != i:
                self.policiesFE[temp] = self.policiesFE[i]
                del self.policiesFE[i]
        tempFE = self.getRLAgentFE(W)  # feature expectations of the policy trained under the new weights
        self.policiesFE[np.abs(np.dot(W, (np.asarray(self.expertPolicy) - np.asarray(tempFE))))] = tempFE
        #self.policiesFE[np.linalg.norm(np.asarray(self.expertPolicy) - np.asarray(tempFE))] = tempFE
        #self.policiesFE[np.dot(W, (np.asarray(self.expertPolicy) - np.asarray(tempFE)))] = tempFE
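    # Main IRL loop: repeatedly solve the QP for weights W against the closest policy
    # found so far, then train a new policy under W; stop once the margin
    # t = W . (mu_expert - mu_closest) falls within 1 + epsilon.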
    def optimalWeightFinder(self):
        t_prev = 0
        while True:
            W = self.optimization(np.matrix(self.expertPolicy) - np.matrix(self.policiesFE[min(self.policiesFE)]))
            print " minimum key in policiesFE : ", min(self.policiesFE)
            print " feature expectations sent to the QP : ", np.matrix(self.policiesFE[min(self.policiesFE)])
            #t = np.abs(np.dot(W, np.asarray(self.expertPolicy) - np.asarray(self.policiesFE[min(self.policiesFE)])))
            t = np.dot(W, np.asarray(self.expertPolicy) - np.asarray(self.policiesFE[min(self.policiesFE)]))  # margin for the current closest policy
            #print np.squeeze(np.asarray(np.matrix(self.expertPolicy) - np.matrix(self.policiesFE[min(self.policiesFE)])))
            if np.abs(t) <= 1.0 + self.epsilon:  # converged: the closest policy (nearly) matches the expert
                break
            #if np.abs(t - t_prev) < self.epsilon:
                #break
            self.policyListUpdater(W)
            t_prev = t
            print " the t value (margin) :: ", np.abs(t)
            print "policiesFE keys ::", self.policiesFE.keys()
            print "weights ", W
        return W
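    # The QP below reads as the max-margin step: minimize ||w||^2 subject to
    # w . (mu_expert - mu_policy) >= 1. In cvxopt's standard form
    # (min 1/2 x'Px + q'x  s.t.  Gx <= h) this gives P = 2I, q = 0,
    # G = -(mu_expert - mu_policy), h = -1; the solution is then normalized
    # to unit length.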
    def optimization(self, difference):  # solve the QP for the weight vector given mu_expert - mu_policy
        P = matrix(2.0 * np.eye(5), tc='d')
        q = matrix(np.zeros(5), tc='d')
        #G = matrix((np.matrix(self.expertPolicy) - np.matrix(self.randomPolicy)), tc='d')
        G = matrix(-difference, tc='d')
        h = matrix(np.array([-1.0]), tc='d')
        sol = solvers.qp(P, q, G, h)
        #print sol['status']
        #return sol['x']
        weights = np.squeeze(np.asarray(sol['x']))
        norm = np.linalg.norm(weights)
        weights = weights / norm  # normalize to a unit-length weight vector
        return weights
if __name__ == '__main__':
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    rlEpisodes = 100
    rlMaxSteps = 250
    #W = [-0.9, -0.9, -0.9, -0.9, 1]
    env = gym.make('CartPole-v0')
    irlearner = irlAgent(env, rlEpisodes, rlMaxSteps)
    #print irlearner.policiesFE
    #irlearner.policyListUpdater(W)
    #print irlearner.rlAgentFeatureExpecs(W)
    #print irlearner.expertFeatureExpecs()
    print irlearner.optimalWeightFinder()
    #print irlearner.optimization(20)
    #np.squeeze(np.asarray(M))