@rmcgibbo
Last active August 12, 2017 03:42
BFGS optimization using only information about the function gradient (no knowledge of the function value). I've coded it in C++ (Boost uBLAS) and Python (NumPy). Most of the code is copied directly from scipy.optimize._minimize_bfgs, with only minor modifications.
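For a quick sense of the API, here is a minimal usage sketch of the Python version defined below; it simply mirrors the test at the bottom of the Python file, and only the call pattern is being illustrated:

import scipy.optimize

# minimize the 2D Rosenbrock function given only its gradient
result = fmin_bfgs(scipy.optimize.rosen_der, [-50, 20], gtol=1e-5)
print result.xopt          # converges to approximately [1.0, 1.0]
print result.n_grad_calls  # total number of gradient evaluations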
/* Copyright (c) 2013, SciPy Developers, Robert McGibbon
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*/
#include <iostream>
#include <cmath>      // fabs, std::isnan, std::isinf
#include <vector>     // std::vector, used in main()
#include <algorithm>  // std::copy, std::min
#include <cassert>
#include "boost/tuple/tuple.hpp"
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/vector.hpp>
#include <boost/numeric/ublas/matrix.hpp>
using namespace boost::numeric::ublas;
typedef vector<double> dvec;
typedef matrix<double> dmat;
using std::cout;
using std::endl;
/*
* Derivative of the Rosenbrock function of two variables. This is just used for
* testing, but it illustrates the API that the functions need to implement for
* the optimizer. They need to take a const reference x as the first parameter,
* and a const void* pointing to any additional parameters. They "return"
* their result by writing it into the grad vector, which has the same size
* as x.
*/
void rosen_der(const dvec& x, const void* params, vector<double>& grad)
{
    double x0 = x[0];
    double x1 = x[1];
    grad[0] = -400*x0*(x1 - x0*x0) - 2*(1 - x0);
    grad[1] = 200*(x1 - x0*x0);
    return;
}
/*
* Inexact line search with only the function gradient
*
* The step size is chosen only to satisfy the strong curvature condition (i.e.,
* the second of the strong Wolfe conditions).
*
* Parameters
* ----------
* fprime : callable f(x, args, &grad)
* gradient of the objective function to be minimized
* xk : vector
* current value of x
* gk : vector
* current value of the gradient
* pk : vector
* search direction
* args : tuple, optional
* Extra arguments to be passed to `fprime`
*
* Returns
* -------
* alpha : float
* The step length
*
* Side Effects: these are returned through output parameters
* ------------
* n_iters : int
* The number of times fprime() was evaluated
* gfkp1 : vector
* The gradient value at alpha, `fprime(xk + alpha*pk)`
*
* Other Parameters
* -----------------
* alpha_guess : float, default = 1.0
* initial guess for the step size
* curvature_condition : float, default = 0.9
* strength of the curvature condition. This is the constant c_2 described at
* http://en.wikipedia.org/wiki/Wolfe_conditions, where 0.9 is the recommended
* value. It should be between 0 and 1.
* update_rate : float, default=0.5
* Factor by which alpha is multiplied at each iteration until the curvature
* condition is satisfied. It should be between 0 and 1.
* maxiters : int, default=5
* Maximum number of iterations. The line search is meant to be cheap, so only
* a few adjustments of alpha are attempted.
*
*/
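/*
 * Concretely, in the notation of the parameters above, a trial step length
 * alpha is accepted as soon as
 *
 *     | fprime(xk + alpha*pk) . pk |  <  curvature_condition * | fprime(xk) . pk |
 *
 * otherwise alpha is shrunk by update_rate and the test is repeated, at most
 * maxiters times.
 */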
double line_search(void (*fprime)(const dvec &, const void*, dvec&),
                   const dvec& xk, const dvec& gk, const dvec& pk,
                   const void* args, dvec& gfkp1, int& n_iters,
                   const double alpha_guess=1.0, const double curvature_condition=0.9,
                   const double update_rate=0.5, const int maxiters=5)
{
    int j;
    double alpha = alpha_guess;
    double initial_slope = inner_prod(gk, pk);
    for (j = 0; j < maxiters; j++) {
        // evaluate the gradient at the trial point xk + alpha*pk
        fprime(xk + alpha * pk, args, gfkp1);
        if (fabs(inner_prod(gfkp1, pk)) < fabs(curvature_condition * initial_slope)) {
            // strong curvature condition satisfied: accept this alpha
            break;
        } else {
            alpha *= update_rate;
        }
    }
    // report how many times fprime() was actually called
    n_iters = std::min(j + 1, maxiters);
    return alpha;
}
/*
* Minimize a function, via only information about its gradient, using BFGS
*
* The difference between this and the "standard" BFGS algorithm is that the
* line search component uses a weaker criterion, because it can't check
* for sure that the function value actually decreased.
*
* Parameters
* ----------
* fprime : callable f(x, args, &grad)
* gradient of the objective function to be minimized. `fprime` should take
* the input `x` and a void* of parameters (args) as the first two arguments,
* and then write its result into the referenced grad vector (third argument).
* x0 : vector
* Initial guess
* args : void*
* Extra arguments to be passed to `fprime`
* gtol : float, optional
* gradient norm must be less than `gtol` before successful termination
* maxiter : int, optional
* Maximum number of iterations to perform.
*
* Returns
* -------
* xopt : vector
* Parameters which minimize `f`, and which are a root of the gradient,
* `fprime`
* gopt : vector
* value of the gradient at `xopt`, which should be near zero
* Hopt : matrix
* final estimate of the inverse Hessian matrix at `xopt`
* n_grad_calls : int
* number of gradient calls made
*/
boost::tuple< dvec, dvec, dmat, int> fmin_bfgs(void (*fprime)(const dvec &, const void*, dvec&),
                                               const dvec& x0, const void* args, const double gtol=1e-5, int maxiter=-1)
{
    int N = x0.size();          // degrees of freedom
    int k = 0;                  // iteration counter
    dvec gfk(N);                // container for the current gradient
    dvec gfkp1(N);              // container for the next gradient
    dmat Hk(N, N);              // container for the inverse Hessian approximation
    identity_matrix<double> I(N);
    Hk = I;
    if (maxiter < 0) {
        maxiter = 200*N;
    }
    fprime(x0, args, gfk);      // set the initial gradient
    int n_grad_calls = 1;       // we want to keep a record, for reporting performance
    double gnorm = norm_2(gfk); // initial norm
    dvec xk(x0);

    while ((gnorm > gtol) and (k < maxiter)) {
        // search direction
        dvec pk(-prod(Hk, gfk));
        int grad_calls;
        double alpha_k = line_search(fprime, xk, gfk, pk, args, gfkp1, grad_calls);
        n_grad_calls += grad_calls;

        // advance in the direction of the step
        dvec xkp1(xk + (alpha_k * pk));
        dvec sk(xkp1 - xk);
        xk = xkp1;
        dvec yk(gfkp1 - gfk);
        gfk = gfkp1;
        k++;

        gnorm = norm_2(gfk);
        if (gnorm < gtol) {
            break;
        }

        double rhok = 1.0 / inner_prod(yk, sk);
        if (std::isnan(rhok) || std::isinf(rhok)) {
            cout << "Divide-by-zero encountered: rhok assumed large" << endl;
            rhok = 1000.0;
        }
        // main BFGS update to the inverse Hessian approximation
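        // For reference, the update computed below (with rho = 1/(y.s)) is
        //     H_{k+1} = (I - rho*s*y^T) * H_k * (I - rho*y*s^T) + rho*s*s^T
        // i.e. the standard BFGS update of the inverse-Hessian approximation.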
        dmat A1 = I - outer_prod(sk, yk) * rhok;
        dmat A2 = I - outer_prod(yk, sk) * rhok;
        dmat Hk_A2 = prod(Hk, A2);
        dmat new_Hk = prod(A1, Hk_A2) + rhok * outer_prod(sk, sk);
        Hk = new_Hk;
    }

    if (k >= maxiter) {
        cout << "Warning: " << maxiter << " iterations exceeded" << endl;
        cout << "         Current gnorm: " << gnorm << endl;
        cout << "         grad calls: " << n_grad_calls << endl;
        cout << "         iterations: " << k << endl;
    } else if (gnorm < gtol) {
        cout << "Optimization terminated successfully." << endl;
        cout << "         Current gnorm: " << gnorm << endl;
        cout << "         grad calls: " << n_grad_calls << endl;
        cout << "         iterations: " << k << endl;
    }
    return boost::make_tuple(xk, gfk, Hk, n_grad_calls);
}
struct Params {
    int a;
};
// test function
int main(void) {
    std::vector<double> x0(2);
    Params p = {1};
    x0[0] = -50;
    x0[1] = 20.0;

    // copy the initial values into ublas::vector containers
    dvec ublas_x0(2);
    std::copy(x0.begin(), x0.end(), ublas_x0.begin());

    boost::tuple< dvec, dvec, dmat, int> result;
    result = fmin_bfgs(&rosen_der, ublas_x0, &p);

    // the Rosenbrock function has its minimum at (1, 1)
    dvec xopt = result.get<0>();
    assert(fabs(xopt[0] - 1) < 1e-5);
    assert(fabs(xopt[1] - 1) < 1e-5);
}
# Copyright (c) 2013, SciPy Developers, Robert McGibbon
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
import numpy as np
from collections import namedtuple
Result = namedtuple('Result', ['xopt', 'gopt', 'Hopt', 'n_grad_calls'])
def fmin_bfgs(fprime, x0, args=(), gtol=1e-5, callback=None, maxiter=None):
"""Minimize a function, via only information about its gradient, using BFGS
The difference between this and the "standard" BFGS algorithm is that the
line search component uses a weaker criterion, because it can't check
for sure that the function value actually decreased.
Parameters
----------
fprime : callable f(x, *args)
gradient of the objective function to be minimized
x0 : ndarray
Initial guess
args : tuple, optional
Extra arguments to be passed to `fprime`
gtol : float, optional
gradient norm must be less than `gtol` before succesful termination
callback : callable, optional
An optional user-supplied function to call after each iteration.
Called as `callback(xk)`, where `xk` is the current parameter vector.
maxiter : int, optional
Maximum number of iterations to perform.
Returns
-------
xopt : ndarray
Parameters which minimize `f`, and which are a root of the gradient,
`fprime`
gopt : ndrarray
value of the gradient at `xopt`, which should be near zero
Hopt : ndarray
final estimate of the hessian matrix at `xopt`
n_grad_calls : int
number of gradient calls made
"""
    x0 = np.asarray(x0).flatten()
    if maxiter is None:
        maxiter = len(x0)*200

    gfk = fprime(x0, *args)   # initial gradient
    n_grad_calls = 1          # number of calls to fprime()

    k = 0                     # iteration counter
    N = len(x0)               # degrees of freedom
    I = np.eye(N, dtype=int)
    Hk = I                    # initial guess of the (inverse) Hessian
    xk = x0
    sk = [2*gtol]             # not used below
    gnorm = np.linalg.norm(gfk)
    while (gnorm > gtol) and (k < maxiter):
        # search direction
        pk = -np.dot(Hk, gfk)

        alpha_k, gfkp1, ls_grad_calls = _line_search(fprime, xk, gfk, pk, args)
        n_grad_calls += ls_grad_calls

        # advance in the direction of the step
        xkp1 = xk + alpha_k * pk
        sk = xkp1 - xk
        xk = xkp1

        if gfkp1 is None:
            gfkp1 = fprime(xkp1, *args)
            n_grad_calls += 1

        yk = gfkp1 - gfk
        gfk = gfkp1

        if callback is not None:
            callback(xk)
        k += 1

        gnorm = np.linalg.norm(gfk)
        if gnorm < gtol:
            break

        try:  # this was handled in Numeric; keep it here for extra safety
            rhok = 1.0 / (np.dot(yk, sk))
        except ZeroDivisionError:
            rhok = 1000.0
            print "Divide-by-zero encountered: rhok assumed large"
        if np.isinf(rhok):  # this is a patch for numpy
            rhok = 1000.0
            print "Divide-by-zero encountered: rhok assumed large"

        # main bfgs update here. this is copied straight from
        # scipy.optimize._minimize_bfgs
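        # For reference, with rho = 1/(y.s) the lines below compute
        #   H_{k+1} = (I - rho s y^T) H_k (I - rho y s^T) + rho s s^T
        # i.e. the standard BFGS update of the inverse-Hessian approximation.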
        A1 = I - sk[:, np.newaxis] * yk[np.newaxis, :] * rhok
        A2 = I - yk[:, np.newaxis] * sk[np.newaxis, :] * rhok
        Hk = np.dot(A1, np.dot(Hk, A2)) + rhok * sk[:, np.newaxis] \
            * sk[np.newaxis, :]
    if k >= maxiter:
        print "Warning: %d iterations exceeded" % maxiter
        print "         Current gnorm: %f" % gnorm
        print "         grad calls: %d" % n_grad_calls
        print "         iterations: %d" % k
    elif gnorm < gtol:
        print "Optimization terminated successfully."
        print "         Current gnorm: %f" % gnorm
        print "         grad calls: %d" % n_grad_calls
        print "         iterations: %d" % k

    return Result(xopt=xk, gopt=gfk, Hopt=Hk, n_grad_calls=n_grad_calls)
def test_fmin_bfgs_1():
    import scipy.optimize

    #def callback(xk):
    #    pass

    print fmin_bfgs(scipy.optimize.rosen_der, [-50, 20]).xopt
    #scipy.optimize.fmin_bfgs(scipy.optimize.rosen, [10, 20], scipy.optimize.rosen_der)
def _line_search(fprime, xk, gk, pk, args=(), alpha_guess=1.0, curvature_condition=0.9,
                 update_rate=0.5, maxiters=5):
"""Inexact line search with only the function gradient
The step size is found only to satisfy the strong curvature condition, (i.e
the second of the strong Wolfe conditions)
Parameters
----------
fprime : callable f(x, *args)
gradient of the objective function to be minimized
xk : ndarray
current value of x
gk : ndarray
current value of the gradient
pk : ndarray
search direction
args : tuple, optional
Extra arguments to be passed to `fprime`
Returns
-------
alpha : float
The step length
n_evaluations : int
The number of times fprime() was evaluated
gk : ndarray
The gradient value at the alpha, `fprime(xk + alpha*pk)`
Other Parameters
-----------------
alpha_guess : float, default = 1.0
initial guess for the step size
curvature_condition : float, default = 0.9
strength of the curvature condition. this is the c_2 on the wikipedia
http://en.wikipedia.org/wiki/Wolfe_conditions, and is recommended to be
0.9 according to that page. It should be between 0 and 1.
update_rate : float, default=0.5
Basically, we keep decreasing the alpha by multiplying by this constant
every iteration. It should be between 0 and 1.
maxiters : int, default=5
Maximum number of iterations. The goal is that the line search step is
really quick, so we don't want to spend too much time fiddling with alpha
"""
    alpha = alpha_guess
    initial_slope = np.dot(gk, pk)

    for j in xrange(maxiters):
        gk = fprime(xk + alpha * pk, *args)
        if np.abs(np.dot(gk, pk)) < np.abs(curvature_condition * initial_slope):
            break
        else:
            alpha *= update_rate

    # j+1 is the final number of calls to fprime()
    return alpha, gk, j+1
if __name__ == '__main__':
    test_fmin_bfgs_1()