Skip to content

Instantly share code, notes, and snippets.

@vihari
Created August 12, 2017 08:12
Show Gist options
  • Save vihari/3e6b87348fe08dc400c8c5f6fdf5b33d to your computer and use it in GitHub Desktop.
Save vihari/3e6b87348fe08dc400c8c5f6fdf5b33d to your computer and use it in GitHub Desktop.
#!/usr/bin/python
"""
I wrote this script to check whether the optimal policy solution obtained by Linear Programming is the same as the one obtained by a Gradient-based method.
The Gradient-based approach is implemented with TensorFlow and the LP with a SciPy routine.
I found that n of the n*k (greater-than-or-equal) constraints become equalities in the solution obtained by linprog, which is not so for the Gradient-based approach. This shows that the constraints do not sufficiently specify the solution, but it works out in the LP case because of the way LP finds its solution.
TF - Tensorflow's Gradient Descent
Run: python <script> [LP|TF]
"""
import tensorflow as tf
import numpy as np
import sys
# Problem size: n states, k actions per state.
n, k = 3, 3
np.random.seed(2)  # fixed seed so both solvers see the same MDP
# T[i][a][j]: probability of landing in state j from state i under action a.
# R[i][a][j]: reward collected on that transition.
# NOTE: T is drawn first, then R, preserving the RNG consumption order.
T = np.reshape([np.random.sample() for i in range(n * k * n)], [n, k, n])
R = np.random.standard_normal([n, k, n])
# Normalize so each T[i][a] is a valid probability distribution over next states.
T = T / np.expand_dims(np.sum(T, axis=2), axis=2)
assert np.all(np.sum(T, axis=2) > 0.99), "T array does not have valid probabilities"
g = .7  # discount factor
# Pick the solver from the command line; fall back to LP on a missing/unknown arg.
alg = str(sys.argv[1]) if len(sys.argv) > 1 else ''
algs = set(['LP', 'TF'])
solve_by = 'LP' if alg not in algs else alg
if solve_by=='TF':
    # One scalar value variable per state. We maximize l = sum_i v_i while a
    # big-M penalty pushes every Bellman constraint toward feasibility:
    # sm = E[R + g*V(next)] - V(i) must end up <= 0 for each (state, action).
    vs = [tf.get_variable("v%d" % i, shape=[], initializer=tf.random_normal_initializer) for i in range(n)]
    l = sum(vs)
    BIG = 1000  # penalty weight applied to violated constraints
    sms = []
    for i in range(n):
        for a in range(k):
            # Expected one-step return of action a in state i, minus v_i.
            sm = 0
            for j in range(n):
                sm += T[i][a][j]*(R[i][a][j]+g*vs[j])
            sm -= vs[i]
            sms.append(sm)
            l -= tf.maximum(0., BIG*sm)  # only penalize violations (sm > 0)
    lr=0.1
    opt = tf.train.GradientDescentOptimizer(lr)
    op = opt.minimize(-l)  # minimizing -l == gradient ascent on l
    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())
    for step in range(100):
        x = sess.run([op, l] + sms)
        if step % 10 == 0:
            print(x)  # objective and per-constraint values every 10 steps
    print(sess.run(vs))  # final state values
elif solve_by=='LP':
    from scipy.optimize import linprog as lp
    # Each row of A encodes one (state i, action a) constraint in A v <= b form:
    #   g * T[i][a] . v - v_i <= -T[i][a] . R[i][a]
    # i.e. v_i >= sum_j T[i][a][j] * (R[i][a][j] + g * v_j).
    c, A, b = [], [], []
    c = [-1]*n
    for i in range(n):
        for a in range(k):
            a_coeffs, b_sum = [0]*n, 0
            for j in range(n):
                a_coeffs[j] = T[i][a][j]*g
                b_sum -= T[i][a][j]*R[i][a][j]
            a_coeffs[i] -= 1
            A.append(a_coeffs)
            b.append(b_sum)
    res = lp(c, A_ub=A, b_ub=b)
    print(res)
    x = res.x
    # Rows are grouped k at a time: the first k constraints are the k actions
    # of state 0, and so on. Print the residual A x - b for each state's group
    # (a zero residual means that constraint is tight, i.e. an equality).
    for s in range(n):
        print([sum([A[i][j]*x[j] for j in range(n)])-b[i] for i in range(k*s, s*k+k)])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment