-
-
Save Mahdisadjadi/f1ec09780f78dd23310a to your computer and use it in GitHub Desktop.
Pure Python implementation of some numerical optimizers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
''' | |
Pure Python implementation of some numerical optimizers | |
Created on Jan 21, 2011 | |
@author Jiahao Chen | |
''' | |
from scipy import dot, eye, finfo, asarray_chkfinite, sqrt, \ | |
outer, zeros, isinf, arccos | |
from scipy.linalg import inv, norm, pinv, solve, LinAlgError | |
from copy import deepcopy | |
_epsilon = sqrt(finfo(float).eps) | |
################# | |
# Line searches # | |
################# | |
def MinpackLineSearch(f, df, x, p, df_x = None, f_x = None, | |
args = (), c1 = 1e-4, c2 = 0.4, amax = 50, xtol = _epsilon): | |
"Modified shamelessly from scipy.optimize" | |
if df_x == None: df_x = df(x) | |
if f_x == None: f_x = f(x) | |
from scipy.optimize import line_search | |
stp, _, _, _, _, _ = line_search(f, df, x, p, df_x, f_x, f_x, args, c1, c2, amax) | |
if stp == None: #Fall back to backtracking linesearch | |
return BacktrackingLineSearch(f, df, x, p, df_x, f_x, args) | |
else: | |
return stp | |
def MoreThuenteLinesearchIterate(delta, alpha_l, alpha_u, alpha_t, alpha_min, | |
alpha_max, f_l, g_l, f_t, g_t, f_u, g_u, isBracketed, Verbose): | |
#Section 4 of More and Thuente: Pick new trial alpha | |
if Verbose: print 'Section 4 of More and Thuente' | |
assert not (isBracketed and not min(alpha_l, alpha_u) < alpha_t < max(alpha_l, alpha_u)), 'Bracketing fail' | |
#if g_t * (alpha_t - alpha_l) < 0.0: | |
# if Verbose: print 'Going uphill; reverting to last good point' | |
# return isBracketed, alpha_l, alpha_l, alpha_t | |
if alpha_min == alpha_max: | |
print 'Squeezed!' | |
return isBracketed, alpha_min, alpha_min, alpha_min | |
assert alpha_min < alpha_max | |
sgnd = dot(g_t, g_l / abs(g_l)) | |
isBounded = False | |
if f_t > f_l: | |
if Verbose: | |
print """Case 1: a higher function value. \ | |
The minimum is bracketed by [a_l, a_t]. If the cubic step is closer to \ | |
lower bound then take it, else take average of quadratic and cubic steps.""" | |
#Calculate cubic and quadratic steps | |
da = alpha_t - alpha_l | |
d1 = g_l + g_t + 3 * (f_l - f_t) / da | |
s = max(g_l, g_t, d1) | |
d2 = s * ((d1 / s) ** 2 - (g_l / s * g_t / s)) ** 0.5 | |
if alpha_t < alpha_l: | |
d2 = -d2 | |
cubp = d2 - g_l + d1 | |
cubq = ((d2 - g_l) + d2) + g_t | |
if abs(cubq) > _epsilon: | |
r = cubp / cubq | |
#Minimizer of cubic that interpolates f_l, f_t, g_l, g_t | |
alpha_c = alpha_l + r * da | |
#Minimizer of quadratic that interpolates f_l, f_t, g_l | |
alpha_q = alpha_l + ((g_l / ((f_l - f_t) / da + g_l)) * 0.5 * da) | |
else: | |
#print 'WARNING: VERY SMALL INTERVAL' | |
alpha_c = alpha_q = 0.5 * (alpha_l + alpha_t) #Invalid, choose midpoint | |
#Calculate new trial | |
if Verbose: | |
print 'Interpolating guesses:', alpha_c, alpha_q | |
if abs(alpha_c - alpha_l) < abs(alpha_q - alpha_l): | |
alpha_tt = alpha_c | |
else: | |
alpha_tt = 0.5 * (alpha_c + alpha_q) | |
isBracketed = True | |
isBounded = True | |
elif sgnd < 0: #f_t <= f_l assumed by here | |
if Verbose: | |
print """Case 2: lower function value and derivatives are of opposite sign. \ | |
The minimum is bracketed by [a_l, a_t]. If the cubic step is closer to lower \ | |
bound then take it, else take quadratic step.""" | |
#Calculate cubic and quadratic steps | |
da = alpha_t - alpha_l | |
d1 = g_l + g_t + 3 * (f_l - f_t) / da | |
s = max(g_l, g_t, d1) | |
d2 = s * ((d1 / s) ** 2 - (g_l / s * g_t / s)) ** 0.5 | |
if alpha_t < alpha_l: | |
d2 = -d2 | |
cubp = d2 - g_t + d1 | |
cubq = ((d2 - g_t) + d2) + g_l | |
if abs(cubq) > _epsilon: | |
r = cubp / cubq #Minimizer of cubic that interpolates f_l, f_t, g_l, g_t | |
alpha_c = alpha_t - r * da #Minimizer of quadratic that interpolates f_l, f_t, g_l | |
alpha_s = alpha_t - (g_l / (g_l - g_t)) * da | |
else: | |
#print 'WARNING: VERY SMALL INTERVAL' | |
alpha_s = alpha_c = 0.5 * (alpha_l + alpha_t) #Invalid, choose midpoint | |
#Calculate new trial | |
if Verbose: | |
print 'Interpolating guesses:', alpha_c, alpha_s | |
if abs(alpha_c - alpha_t) >= abs(alpha_s - alpha_t): | |
alpha_tt = alpha_c | |
else: | |
alpha_tt = alpha_s | |
isBracketed = True | |
isBounded = False | |
elif norm(g_t) <= norm(g_l): #g_t * g_l >= 0 and f_t <= f_l assumed by here | |
if Verbose: | |
print """Case 3: lower function value and derivatives are of same \ | |
sign, and magnitude of derivative decreased.\n Cubic step used only if it tends \ | |
to infinity in direction of step or if minimum of cubic lies beyond step. \ | |
Otherwise cubic step is defined to be either stpmin or stpmax. Quadratic step is \ | |
taken if it is closer to alpha_t, else take the farthest step.""" | |
#Calculate cubic and quadratic steps | |
da = alpha_t - alpha_l | |
d1 = g_l + g_t + 3 * (f_l - f_t) / da | |
s = max(g_l, g_t, d1) | |
d2 = s * ((d1 / s) ** 2 - (g_l / s * g_t / s)) ** 0.5 | |
if alpha_t < alpha_l: | |
d2 = -d2 | |
cubp = d2 - g_t + d1 | |
cubq = ((d2 - g_t) + d2) + g_l | |
r = cubp / cubq | |
#The case d2 == 0 arises only if cubic does not tend to infinity in | |
#direction of step | |
if (r < 0. and abs(d2) < _epsilon): | |
alpha_c = alpha_t - r * da | |
elif alpha_t > alpha_l: | |
alpha_c = alpha_max | |
else: | |
alpha_c = alpha_min | |
#Minimizer of quadratic that interpolates f_l, f_t, g_l | |
alpha_s = -alpha_t - (g_l / (g_l - g_t)) * da | |
if isBracketed: | |
if abs(alpha_c - alpha_t) < abs(alpha_s - alpha_t): | |
alpha_tt = alpha_c | |
else: | |
alpha_tt = alpha_s | |
else: | |
if abs(alpha_c - alpha_t) > abs(alpha_s - alpha_t): | |
alpha_tt = alpha_c | |
else: | |
alpha_tt = alpha_s | |
else: | |
if Verbose: | |
print """Case 4: lower function value and derivatives \ | |
are of same sign, and magnitude of derivative did not decrease. | |
The minimum is NOT NECESSARILY bracketed. \ | |
If bracketed, take cubic step, otherwise take alpha_min or alpha_max.""" | |
#Minimizer of cubic that interpolates f_u, f_t, g_u, g_t | |
#Formula from Nocedal and Wright 2/e, Eq. 3.59, p 59 | |
if isBracketed: | |
da = alpha_t - alpha_u | |
d1 = g_u + g_t + 3 * (f_u - f_t) / da | |
s = max(g_l, g_t, d1) | |
d2 = s * ((d1 / s) ** 2 - (g_l / s * g_t / s)) ** 0.5 | |
if alpha_t < alpha_u: | |
d2 = -d2 | |
cubp = d2 - g_t + d1 | |
cubq = ((d2 - g_t) + d2) + g_u | |
if abs(cubq) > _epsilon: | |
r = cubp / cubq | |
alpha_tt = alpha_t - r * da | |
else: | |
#print 'WARNING: VERY SMALL INTERVAL' | |
alpha_tt = 0.5 * (alpha_l + alpha_t) #Invalid, choose midpoint | |
else: | |
if alpha_t > alpha_l: | |
alpha_tt = alpha_max | |
else: | |
alpha_tt = alpha_min | |
isBounded = False #abs(g_t) > abs(g_l) | |
if Verbose: print 'Now proceeding to update' | |
#Update interval of uncertainty | |
if f_t > f_l: | |
if Verbose: print 'Update case 1' | |
alpha_u, f_u, g_u = alpha_t, f_t, g_t | |
else: | |
if sgnd < 0: | |
if Verbose: print 'Update case 2' | |
alpha_u, f_u, g_u = alpha_l, f_l, g_l | |
else: | |
if Verbose: print 'Update case 3' | |
alpha_l, f_l, g_l = alpha_t, f_t, g_t | |
"""Refine trial value if it lies outside [alpha_t, alpha_u] or | |
is otherwise too close to alpha_u.""" | |
alpha_tt = min(alpha_max, alpha_tt) | |
alpha_tt = max(alpha_min, alpha_tt) #Update | |
alpha_t = alpha_tt | |
if isBracketed and isBounded: | |
if alpha_u > alpha_l: | |
alpha_t = min(alpha_l + delta * (alpha_u - alpha_l), alpha_t) | |
else: | |
alpha_t = max(alpha_l + delta * (alpha_u - alpha_l), alpha_t) | |
if Verbose: print 'Refined trial value = ', alpha_t | |
return isBracketed, alpha_t, alpha_l, alpha_u | |
def MoreThuenteLineSearch(f, df, x, p, df_x = None, f_x = None, args = (), | |
mu = 0.01, eta = 0.5, ftol = _epsilon, gtol = _epsilon, rtol = _epsilon, | |
maxiter = 9, fmin = 0.0, alpha_min0 = 0.001, alpha_max0 = 100.0, alpha_t = 1.0, | |
p5 = 0.5, xtrapf = 4.0, delta = 0.66, Verbose = False): | |
"""Implements the line search algorithm of Mor\'e and Thuente, ACM TOMS 1994 | |
"Line search algorithms with guaranteed sufficient decrease" | |
doi: 10.1145/192115.192132 | |
Inputs | |
------ | |
f | |
df | |
x | |
p | |
df_x = None | |
f_x = None | |
args = () | |
mu = 0.01 | |
mu is maximum required decrease | |
mu >= |f(x+dx) - f(x)| / |df(x).p| | |
mu in (0,1) | |
eta = 0.5 | |
eta is maximum permissible value of |df(x+dx).p|/|df(x).p| | |
eta in (0,1) | |
ftol = _epsilon | |
gtol = _epsilon | |
rtol = _epsilon | |
rtol is relative tolerance for acceptable step | |
maxiter: Maximum number of iterations. Default: 9 | |
fmin = 0.0 | |
alpha_min0 = 0.001 | |
alpha_max0 = 100.0 | |
alpha_t = 1.0 | |
p5 = 0.5 | |
xtrapf: An extrapolation parameter. Recommended: between [1.1, 4.0]. | |
Default: 4.0 | |
delta = 0.66 | |
Verbose = True | |
""" | |
################################## | |
# Evaluate function and derivative | |
################################## | |
if f_x == None: f_0 = f(x, *args) | |
else: f_0 = f_x | |
if df_x == None: df_0 = df(x, *args) | |
else: df_0 = df_x | |
g_0 = dot(df_0, p) | |
############################################################ | |
# Check that initial gradient in search direction is downhill | |
############################################################# | |
assert g_0.shape == (1, 1) or g_0.shape == (), 'Scipy screwed up the calculation.' | |
assert g_0 < 0, 'Attempted to linesearch uphill' | |
############ | |
# Initialize | |
############ | |
FoundGoodPoint = isBracketed = False | |
gtest = ftol * g_0 | |
width = alpha_max0 - alpha_min0 | |
width1 = width / p5 | |
f_t = f(x + alpha_t * p, *args) | |
df_t = df(x + alpha_t * p, *args) | |
g_t = dot(df_t, p) | |
# alpha = step size | |
# f = function value | |
# g = gradient value in descent direction | |
# _l = best step so far | |
# _u = other endpoint of interval of uncertainty | |
# _t = current step iterate | |
alpha_l = alpha_u = 0.0 | |
f_l = f_u = f_0 | |
g_l = g_u = g_0 | |
if alpha_l == alpha_min0: alpha_l += _epsilon | |
if alpha_u == alpha_max0: alpha_u -= _epsilon | |
assert alpha_min0 != alpha_max0 | |
for k in range(maxiter): | |
if Verbose: print 'Iteration', k, ':', alpha_t, 'in (', alpha_l, ',', alpha_u, ')' | |
############################################ | |
# Initialize current interval of uncertainty | |
############################################ | |
if isBracketed: | |
alpha_min = min(alpha_l, alpha_u) | |
alpha_max = max(alpha_l, alpha_u) | |
else: | |
alpha_min = alpha_l | |
alpha_max = alpha_t + xtrapf * abs(alpha_t - alpha_l) | |
assert alpha_min <= alpha_max | |
################################################## | |
# Safeguard trial step within pre-specified bounds | |
################################################## | |
alpha_t = max(alpha_t, alpha_min0) | |
alpha_t = min(alpha_t, alpha_max0) | |
#If something funny happened, set it to lowest point obtained thus far | |
if ((isBracketed and (alpha_t <= alpha_min or alpha_t >= alpha_max)) \ | |
or (isBracketed and (alpha_max - alpha_min <= rtol * alpha_max)) \ | |
): | |
if Verbose: print 'Something funny happened' | |
return alpha_l | |
#Evaluate function and gradient at new point | |
f_t = f(x + alpha_t * p, *args) | |
df_t = df(x + alpha_t * p, *args) | |
g_t = dot(df_t, p) | |
ftest1 = f_0 + alpha_t * gtest | |
###################### | |
# Test for convergence | |
###################### | |
if (alpha_l == alpha_max0 and f_t <= ftest1 and g_t <= gtest): | |
if Verbose: print 'Stuck on upper bound' | |
break | |
if (alpha_l == alpha_min0 and f_t > ftest1 and g_t >= gtest): | |
if Verbose: print 'Stuck on lower bound' | |
break | |
#Check for a) sufficient decrease AND b) strong curvature criterion | |
if ((f_t <= f_0 + mu * g_0 * alpha_t) \ | |
and abs(g_t) <= eta * abs(g_0)): | |
if Verbose: print 'Decrease criterion satisfied' | |
break | |
#Check for bad stuff | |
if isBracketed: | |
if (abs(alpha_max - alpha_min) >= delta * width1): | |
print 'Warning: Could not satisfy curvature conditions' | |
alpha_t = alpha_min + p5 * (alpha_max - alpha_min) | |
width1 = width | |
width = abs(alpha_max - alpha_min) | |
#If interval is bracketed to sufficient precision, break | |
if k > 0 and abs(alpha_l - alpha_u) < rtol: | |
if Verbose: print 'Interval bracketed: alpha = ', alpha_t | |
print 'Line search stuck. Emergency abort.' | |
break | |
######################################################## | |
# Nocedal's modification of the More-Thuente line search | |
######################################################## | |
ftest1 = f_0 + alpha_l * gtest | |
# Seek a step for which the modified function has a nonpositive value | |
# and a nonnegative derivative | |
if (not FoundGoodPoint) and f_t <= ftest1 and g_t >= g_0 * min(ftol, gtol): | |
FoundGoodPoint = True | |
# The modified function is used only if we don't have a step where the | |
# previous condition is attained, and if a lower function value has been | |
# obtained but it is not low enough | |
if (not FoundGoodPoint) and f_t <= f_l and f_t > ftest1: | |
if Verbose: print "Performing Nocedal's modification" | |
f_mt = f_t - alpha_t * gtest | |
f_ml = f_l - alpha_l * gtest | |
f_mu = f_u - alpha_u * gtest | |
g_mt = g_t - gtest | |
g_ml = g_l - gtest | |
g_mu = g_u - gtest | |
isBracketed, alpha_t, alpha_l, alpha_u = MoreThuenteLinesearchIterate\ | |
(delta, alpha_l, alpha_u, alpha_t, alpha_min, alpha_max, f_ml, \ | |
g_ml, f_mt, g_mt, f_mu, g_mu, isBracketed, Verbose) | |
f_l = f_ml + alpha_l * gtest | |
f_u = f_mu + alpha_u * gtest | |
g_l = g_ml + gtest | |
g_u = g_mu + gtest | |
else: | |
if Verbose: print 'Performing original More-Thuente line search' | |
isBracketed, alpha_t, alpha_l, alpha_u = MoreThuenteLinesearchIterate\ | |
(delta, alpha_l, alpha_u, alpha_t, alpha_min, alpha_max, f_l, g_l, \ | |
f_t, g_t, f_u, g_u, isBracketed, Verbose) | |
########################### | |
# Force sufficient decrease | |
########################### | |
if isBracketed: | |
if Verbose: print 'Force sufficient decrease' | |
if abs(alpha_u - alpha_l) >= delta * width1: | |
alpha_t = alpha_l + p5 * (alpha_u - alpha_l) | |
width1 = width | |
width = abs(alpha_u - alpha_l) | |
if Verbose: print k + 1, 'iterations in More-Thuente line search' | |
return alpha_t | |
def CubicLineSearch(f, df, x, p, df_x, f_x = None, args = (), | |
alpha = 0.0001, beta = 0.9, eps = 0.0001, Verbose = False): | |
if f_x is None: | |
f_x = f(x, *args) | |
assert df_x.T.shape == p.shape | |
assert 0 < alpha < 1, 'Invalid value of alpha in backtracking linesearch' | |
assert 0 < beta < 1, 'Invalid value of beta in backtracking linesearch' | |
phi = lambda c: f(x + (c * p), *args) | |
phi0 = f_x | |
derphi0 = dot(df_x, p) | |
assert derphi0.shape == (1, 1) or derphi0.shape == () | |
assert derphi0 < 0, 'Attempted to linesearch uphill' | |
stp = 1.0 | |
#Loop until Armijo condition is satisfied | |
while phi(stp) > phi0 + alpha * stp * derphi0: | |
#Quadratic interpolant | |
stp_q = -derphi0 * alpha ** 2 / (2 * phi(stp) - phi0 - derphi0 * stp) | |
if phi(stp_q) > f_x + alpha * stp_q * derphi0: | |
return stp_q | |
#Quadratic interpolant bad, use cubic interpolant | |
A = zeros((2, 2)) | |
b = zeros((2,)) | |
A[0, 0] = stp * stp | |
A[0, 1] = -stp_q ** 2 | |
A[1, 0] = -stp ** 3 | |
A[1, 1] = stp_q ** 3 | |
b[0] = phi(stp_q) - phi0 - derphi0 * stp_q | |
b[1] = phi(stp) - phi0 - derphi0 * stp | |
xx = dot(A, b) / ((stp * stp_q) ** 2 * (stp_q - stp)) | |
stp_c = (-xx[1] + (xx[1] ** 2 - 3 * xx[0] * derphi0) ** 0.5) / (3 * xx[0]) | |
#Safeguard: if new alpha too close to predecessor or too small | |
if abs(stp_c - stp) < 0.001 or stp_c < 0.001: | |
stp *= beta | |
else: | |
stp = stp_c | |
#print stp | |
#print stp | |
return stp | |
def BacktrackingLineSearch(f, df, x, p, df_x = None, f_x = None, args = (), | |
alpha = 0.0001, beta = 0.9, eps = _epsilon, Verbose = False): | |
""" | |
Backtracking linesearch | |
f: function | |
x: current point | |
p: direction of search | |
df_x: gradient at x | |
f_x = f(x) (Optional) | |
args: optional arguments to f (optional) | |
alpha, beta: backtracking parameters | |
eps: (Optional) quit if norm of step produced is less than this | |
Verbose: (Optional) Print lots of info about progress | |
Reference: Nocedal and Wright 2/e (2006), p. 37 | |
Usage notes: | |
----------- | |
Recommended for Newton methods; less appropriate for quasi-Newton or conjugate gradients | |
""" | |
if f_x is None: | |
f_x = f(x, *args) | |
if df_x is None: | |
df_x = df(x, *args) | |
assert df_x.T.shape == p.shape | |
assert 0 < alpha < 1, 'Invalid value of alpha in backtracking linesearch' | |
assert 0 < beta < 1, 'Invalid value of beta in backtracking linesearch' | |
derphi = dot(df_x, p) | |
assert derphi.shape == (1, 1) or derphi.shape == () | |
assert derphi < 0, 'Attempted to linesearch uphill' | |
stp = 1.0 | |
fc = 0 | |
len_p = norm(p) | |
#Loop until Armijo condition is satisfied | |
while f(x + stp * p, *args) > f_x + alpha * stp * derphi: | |
stp *= beta | |
fc += 1 | |
if Verbose: print 'linesearch iteration', fc, ':', stp, f(x + stp * p, *args), f_x + alpha * stp * derphi | |
if stp * len_p < eps: | |
print 'Step is too small, stop' | |
break | |
#if Verbose: print 'linesearch iteration 0 :', stp, f_x, f_x | |
if Verbose: print 'linesearch done' | |
#print fc, 'iterations in linesearch' | |
return stp | |
####################### | |
# Trust region models # | |
####################### | |
class DogLeg: | |
def __init__(self, f, df, B, x, g, f_x = None, df_x = None, TrustRadius = 1.0): | |
self.TrustRadius = TrustRadius | |
self.f = f #Function | |
self.df = df #Gradient function | |
self.B = B #Approximate Hessian | |
self.x = x #Current point | |
self.g = g #Search direction | |
def solve(self): | |
g = self.g | |
B = self.B | |
#Step 1: Find unconstrained solution | |
pB = g | |
#Step 2: Find Cauchy point | |
pU = -dot(g, g) / dot(g, dot(B, g)) * g | |
#INCOMPLETE | |
################ | |
# Extrapolator # | |
################ | |
class DIISExtrapolator: | |
def __init__(self, max = 300): | |
self.maxvectors = max | |
self.Reset() | |
def Reset(self): | |
self.errorvectors = [] | |
self.coordvectors = [] | |
def AddData(self, error, coord): | |
#Check max | |
while self.GetNumVectors() - self.maxvectors >= 0: | |
self.errorvectors.pop() | |
self.coordvectors.pop() | |
from numpy import asarray | |
#print asarray(error).flatten().shape, asarray(coord).flatten().shape | |
self.errorvectors.append(deepcopy(asarray(error).flatten())) | |
self.coordvectors.append(deepcopy(asarray(coord).flatten())) | |
def GetNumVectors(self): | |
assert len(self.errorvectors) == len(self.coordvectors) | |
return len(self.errorvectors) | |
def Extrapolate(self): | |
#Construct Overlap matrix (aka Gram matrix) | |
N = self.GetNumVectors() | |
if N == 0: return None | |
B = zeros((N + 1, N + 1)) | |
for i in range(N): | |
for j in range(N): | |
B[i, j] = dot(self.errorvectors[i], self.errorvectors[j]) | |
B[N, :] = -1 | |
B[:, N] = -1 | |
B[N, N] = 0 | |
v = zeros(N + 1) | |
v[N] = -1 | |
#Solve for linear combination | |
xex = self.coordvectors[0] * 0 | |
try: | |
c = solve(B, v) | |
assert c.shape == v.shape | |
#Generate interpolated coordinate | |
for i in range(N): | |
xex += c[i] * self.coordvectors[i] | |
except: #Linear dependency detected; trigger automatic reset | |
c = dot(pinv(B), v) | |
#Generate interpolated coordinate | |
for i in range(N): | |
xex += c[i] * self.coordvectors[i] | |
print 'Linear dependency detected in DIIS; resetting' | |
self.Reset() | |
return xex | |
####################### | |
# Quasi-Newton driver # | |
####################### | |
class ApproxHessianBase: | |
def __init__(self, N = None): | |
if N is None: | |
self.M = None | |
else: | |
self.M = eye(N) | |
def Reset(self): | |
self.SetToIdentity() | |
def SetToIdentity(self): | |
self.M = eye(self.M.shape[0]) | |
def SetToScaledIdentity(self, c = 1): | |
self.M = c * eye(self.M.shape[0]) | |
def SetToMatrix(self, K): | |
self.M = K | |
def GetHessian(self): | |
return self.M | |
def Operate(self, g): | |
assert False, 'Should never run this' | |
def Update(self, *args): | |
assert False, 'Should never run this' | |
def __rshift__(self, g): #Overload >> operator | |
"""If self.B is defined, solves for x in B x = -g | |
""" | |
try: | |
dx = self.Operate(g) | |
except ValueError, e: | |
print e | |
print g.shape, self.M.shape | |
exit() | |
try: | |
return asarray_chkfinite(dx) | |
except (LinAlgError, ValueError): | |
print 'Restarting approximate Hessian' | |
'Trigger restart, fall back to steepest descent' | |
self.SetToIdentity() | |
return g | |
class ApproxHessian(ApproxHessianBase): | |
def __init__(self, N = None): | |
ApproxHessianBase.__init__(self, N) | |
def SetToNumericalHessian(self, df, x, step = 0.001, UseStencil = None): | |
if UseStencil is None: | |
import Stencil | |
UseStencil = Stencil.FirstOrderCentralDifferenceStencil() | |
#Initialize Hessian as numerical Hessian | |
self.M = UseStencil.ApplyToFunction(df, x, step) | |
def Operate(self, g): | |
"""If self.B is defined, solves for x in B x = -g | |
""" | |
try: | |
dx = -solve(self.M, g) | |
except LinAlgError: | |
print 'Warning, using pseudoinverse' | |
dx = -pinv(self.M, g) | |
return dx | |
class ApproxInvHessian(ApproxHessianBase): | |
def __init__(self, N = None): | |
ApproxHessianBase.__init__(self, N) | |
def SetToNumericalHessian(self, df, x, step = 0.001, UseStencil = None): | |
if UseStencil is None: | |
import Stencil | |
UseStencil = Stencil.FirstOrderCentralDifferenceStencil() | |
#Initialize Hessian as numerical Hessian | |
K = UseStencil.ApplyToFunction(df, x, step) | |
try: | |
self.M = inv(K) | |
except LinAlgError: | |
self.M = pinv(K) | |
def GetHessian(self): | |
try: | |
return inv(self.M) | |
except LinAlgError: | |
print 'Warning, using pseudoinverse' | |
return pinv(self.M) | |
def Operate(self, g): | |
"""If self.H is defined, returns x = H g | |
""" | |
return - dot(self.M, g) | |
class BFGSInvHessian(ApproxInvHessian): | |
def Update(self, s, y, s_dot_y = None): | |
if s_dot_y is None: s_dot_y = dot(s, y) | |
rhok = 1.0 / s_dot_y | |
I = eye(self.M.shape[0]) | |
A1 = I - outer(s, y) * rhok | |
A2 = I - outer(y, s) * rhok | |
self.M = dot(A1, dot(self.M, A2)) | |
self.M += +rhok * outer(s, s) | |
class LBFGSInvHessian(ApproxInvHessian): | |
def __init__(self, M = 20): | |
self.maxnumvecs = int(M) | |
self.rho = [] | |
self.gamma = 1.0 | |
self.Reset() | |
print 'Initialized L-BFGS Hessian approximation with M =', self.maxnumvecs | |
def Reset(self): | |
self.s = [] | |
self.y = [] | |
self.rho = [] | |
def SetToIdentity(self): #Reset | |
self.Reset() | |
def SetToScaledIdentity(self, c = None): #Reset | |
self.Reset() | |
if c is not None: self.gamma = c | |
def Update(self, s, y, s_dot_y = None): | |
if len(self.s) == self.maxnumvecs: | |
self.s.pop(0) | |
self.y.pop(0) | |
self.rho.pop(0) | |
if s_dot_y is None: s_dot_y = dot(s, y) | |
rho = 1.0 / s_dot_y | |
if isinf(rho): | |
print 'Warning, s . y = 0; ignoring pair' | |
else: | |
self.s.append(s) | |
self.y.append(y) | |
self.rho.append(rho) | |
def GetNumVecs(self): | |
assert len(self.s) == len(self.y) | |
return len(self.s) | |
def __rshift__(self, g): #Overload >> as multiply -H g | |
q = g | |
m = self.GetNumVecs() | |
s = self.s | |
y = self.y | |
a = zeros(m) | |
for i in range(m - 1, -1, -1): | |
a[i] = self.rho[i] * dot(s[i], q) | |
q = q - a[i] * y[i] | |
#r = Hguess * q | |
gamma = self.gamma | |
if m > 0: gamma = dot(s[-1], y[-1]) / dot(y[-1], y[-1]) | |
#if m > 0: gamma = 0.5 * gamma + 0.5 * dot(s[-1], y[-1]) / dot(y[-1], y[-1]) | |
self.gamma = gamma | |
print 'gamma=',gamma | |
r = gamma * q | |
for i in range(m): | |
b = self.rho[i] * dot(y[i], r) | |
r = r + s[i] * (a[i] - b) | |
return - r | |
def Optimizer(f, x0, df = None, xtol = 0.1*_epsilon**0.5, | |
gtol = 0.1*_epsilon**0.5, maxstep = 1, maxiter = None, | |
line_search = BacktrackingLineSearch, DIISTrigger = 0): | |
#Hijack code with call to L-BFGS-B | |
#from scipy.optimize import fmin_l_bfgs_b | |
#return fmin_l_bfgs_b(f, x0, df, m = len(x0), pgtol = gtol, factr = 1.0/xtol, iprint=1)[0] | |
#Do LBFGS | |
#If true, will fall back on BFGS automatically | |
DoLBFGS = True | |
DoLineSearch = True | |
TrustRadius = 0.5 | |
MaxTrustRadius = 10.0 | |
#Cosine squared of uphill angle | |
UphillAngleThresh = 0.0 | |
UphillFThresh = 0.001 | |
#Massage x0 | |
x0 = asarray_chkfinite(x0).ravel() | |
if x0.ndim == 0: x0.shape = (1,) | |
#Set default number of maxiters | |
if maxiter is None: maxiter = max(20000, len(x0) * 4) | |
########### | |
#Initialize | |
########### | |
x = x0 | |
f_x, df_x = f(x0), df(x0) | |
norm_g = norm(df_x) | |
N = len(x0) | |
if DoLBFGS: | |
#Dimension of 3 --- 20 is normal | |
#H = LBFGSInvHessian(2 + N ** 0.2) | |
H = LBFGSInvHessian(N) | |
else: | |
H = BFGSInvHessian(N) | |
H.SetToNumericalHessian(df, x) | |
if DIISTrigger != 0: DIIS = DIISExtrapolator() | |
for k in range(maxiter): | |
####################################################################### | |
# Solve for unconstrained minimization direction p such that H p = -x # | |
####################################################################### | |
p = H >> df_x #overloaded | |
AcceptStep = True | |
########################################################## | |
# Safeguard: perform linesearch or trust-region limiting # | |
# to determine actual step s to take # | |
########################################################## | |
if DoLineSearch: | |
#################### | |
# Perform linesearch | |
#################### | |
alpha = line_search(f, df, x, p, df_x, f_x) | |
try: | |
alpha = line_search(f, df, x, p, df_x, f_x) | |
assert alpha > 0 | |
except AssertionError: | |
print 'Line search failed; falling back to exact line search' | |
alpha = MinpackLineSearch(f, df, x, p, df_x, f_x) | |
assert alpha > 0 | |
s = alpha * p | |
norm_s = norm(s) | |
else: | |
############## | |
# Do dog-leg # | |
############## | |
#First do line search along steepest descent direction | |
alpha = line_search(f, df, x, -df_x, df_x, f_x) | |
#Cauchy point is x - alpha * df_x | |
#Next, move toward unconstrained solution from Cauchy point | |
dogleg = p + alpha * df_x | |
alpha2 = line_search(f, df, x - alpha * df_x, dogleg) | |
s = -alpha * df_x + alpha2 * dogleg | |
#Stupidly enforce simple trust radius | |
norm_s = norm(s) | |
while norm_s > TrustRadius: | |
alpha2 *= 0.9 | |
s = -alpha * df_x + alpha2 * dogleg | |
norm_s = norm(s) | |
###################################### | |
# Do DIIS Extrapolation if requested # | |
###################################### | |
DidDIIS = False | |
if DIISTrigger != 0: | |
data = zeros(len(df_x) + 1) | |
data[1:] = df_x | |
data[0] = f_x | |
DIIS.AddData(error = data, coord = x) | |
if DIIS.GetNumVectors() % DIISTrigger == 0: | |
mindf = min([norm(xx[1:]) for xx in DIIS.errorvectors]) | |
xdnew = DIIS.Extrapolate() | |
norm_df_xdnew = norm(df(xdnew)) | |
f_xdnew = f(xdnew) | |
if norm_df_xdnew < mindf and f_xdnew < f_x: | |
#Accept only if there is actually a reduction in |df| | |
print 'Accepted DIIS extrapolation' | |
DidDIIS = True | |
s = xdnew - x | |
#if DIISTrigger > 1: DIISTrigger -= 1 | |
#print 'Improvement ratios', (f_x - f_xdnew) / f_x, (mindf - norm_df_xdnew) / mindf | |
if (f_x - f_xdnew) < 0.000001 * f_x and (mindf - norm_df_xdnew) < 0.00001 * mindf : | |
print 'DIIS reaching diminishing returns, resetting' | |
DIIS.Reset() | |
#DIISTrigger += 1 | |
else: | |
print 'DIIS wanted to go to', f_xdnew, norm_df_xdnew | |
print "Rejected DIIS extrapolation" | |
DIIS.Reset() | |
DIISTrigger += 1 | |
AcceptStep = False | |
if DIISTrigger > 50: | |
print 'Turning off DIIS' | |
DIISTrigger = 0 | |
#################################################### | |
# Take step; compute function value and derivative # | |
#################################################### | |
xnew = x + s | |
f_xnew = f(xnew) | |
df_xnew = df(xnew) | |
y = df_xnew - df_x | |
norm_y = norm(y) | |
norm_gnew = norm(df_xnew) | |
s_y = dot(s, y) | |
############################################# | |
# If doing line search, update trust radius # | |
############################################# | |
if True: | |
ReductionRatio = 2 * (f_xnew - f_x) / dot(s, df_x) | |
print '\nReduction ratio:', ReductionRatio | |
if ReductionRatio <= 0: | |
print 'Iterations are producing a WORSE answer!\nBailing.' | |
print 'Rejecting step and restarting BFGS' | |
AcceptStep = False | |
if DoLBFGS: | |
H.Reset() | |
H.Update(s, y) | |
else: | |
gamma = s_y / norm_y ** 2 | |
H.SetToScaledIdentity(gamma) | |
if not DoLineSearch: #ie do trust region | |
#Compute model quality? | |
ReductionRatio = 2 * (f_xnew - f_x) / dot(s, df_x) | |
print '\nReduction ratio:', ReductionRatio | |
if ReductionRatio > 0.75: | |
if norm(s) == TrustRadius: | |
TrustRadius = min (2 * TrustRadius, MaxTrustRadius) | |
elif ReductionRatio < 0.25: | |
TrustRadius *= 0.25 | |
################### | |
# Check convergence | |
################### | |
if norm_g < gtol: | |
break | |
################################# | |
# Make sure we are going downhill | |
################################# | |
NegCosDescentAngle = s_y / (norm_s * norm_y) | |
DescentAngle = arccos(-NegCosDescentAngle) | |
DescentRatio = (f_x - f_xnew) / f_x | |
isGoingUphill = NegCosDescentAngle < UphillAngleThresh | |
isIncreasing = -DescentRatio > UphillFThresh | |
if isGoingUphill or isIncreasing: | |
if isGoingUphill: | |
print '\nIteration %d: WARNING: Going uphill with angle %f' % \ | |
(k, -DescentAngle) | |
if isIncreasing: | |
print '\nIteration %d: WARNING: function increased by ratio %f' % \ | |
(k, -DescentRatio) | |
print 'Rejecting step and restarting BFGS' | |
AcceptStep = False | |
if DoLBFGS: | |
H.Reset() | |
H.Update(s, y) | |
else: | |
gamma = s_y / norm_y ** 2 | |
H.SetToScaledIdentity(gamma) | |
else: | |
##################################### | |
# Accept step, update quasi-Hessian # | |
##################################### | |
if not DidDIIS: | |
H.Update(s, y) | |
##################################################### | |
# EXPERIMENTAL: Heuristics for switching algorithms # | |
##################################################### | |
if False and DoLineSearch and DescentRatio < 1e-2: | |
print '\nTurning off linesearch' | |
DoLineSearch = False | |
if False and DescentRatio < 1e-2 and DIISTrigger==0: | |
print 'Do DIIS' | |
DIISTrigger = 2 | |
DIIS = DIISExtrapolator() | |
#print 'Switching to BFGS' | |
#H = BFGSInvHessian(N) | |
#gamma = dot(s, y) / dot(y, y) | |
#H.SetToScaledIdentity(gamma) | |
################################################### | |
# Accept step: update function value and gradient # | |
################################################### | |
if AcceptStep: | |
x = xnew | |
df_x = df_xnew | |
f_x = f_xnew | |
norm_g = norm_gnew | |
print "Iteration %d: f(x) = %f (%fx), ||f'(x)|| = %f, || dx || = %f, descent angle = %f" \ | |
% (k, f_x, DescentRatio, norm_g, norm_s, DescentAngle) | |
if k == maxiter - 1: print 'Maximum number of iterations reached' | |
print 'Quasi-Newton done' | |
""" | |
W = ApproxHessian() | |
W.SetToNumericalHessian(df, x) | |
Hess = W.GetHessian() | |
from numpy.linalg import svd | |
for eigval in svd(Hess)[1]: | |
print eigval | |
""" | |
exit() | |
return x |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment