BFGS optimization with only information about the function gradient (no knowledge of the function value). I've coded it in C++ (Boost uBLAS) and Python (NumPy). Most of the code is copied directly from scipy.optimize._minimize_bfgs, with only minor modifications.
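For reference, both implementations below do the same two things each iteration: the backtracking line search accepts a step length alpha once the strong-Wolfe curvature condition holds, and the inverse-Hessian approximation H_k (used to form the search direction p_k = -H_k grad f_k) is then updated with the standard BFGS formula, which is exactly what the A1/A2/Hk lines in the code compute:

$$ \left| \nabla f(x_k + \alpha p_k)^T p_k \right| \le c_2 \left| \nabla f(x_k)^T p_k \right|, \qquad c_2 = 0.9 \text{ by default} $$

$$ H_{k+1} = \left(I - \rho_k s_k y_k^T\right) H_k \left(I - \rho_k y_k s_k^T\right) + \rho_k s_k s_k^T, \qquad \rho_k = \frac{1}{y_k^T s_k}, \quad s_k = x_{k+1} - x_k, \quad y_k = \nabla f_{k+1} - \nabla f_k $$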
/* Copyright (c) 2013, SciPy Developers, Robert McGibbon
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 */
#include <iostream>
#include <vector>     // std::vector in main()
#include <algorithm>  // std::copy
#include <cmath>      // fabs, std::isnan, std::isinf (C++11)
#include <assert.h>
#include "boost/tuple/tuple.hpp"
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/vector.hpp>
#include <boost/numeric/ublas/matrix.hpp>

using namespace boost::numeric::ublas;
typedef vector<double> dvec;
typedef matrix<double> dmat;
using std::cout;
using std::endl;
/*
 * Derivative of the Rosenbrock function of two variables. This is just used for
 * testing, but it illustrates the API that the functions need to implement for
 * the optimizer. They need to take a const reference x as the first parameter,
 * and a const void* that contains any additional parameters. They "return"
 * their result by writing it into the grad vector, which is of the same size
 * as x.
 */
void rosen_der(const dvec& x, const void* params, vector<double>& grad)
{
    double x0 = x[0];
    double x1 = x[1];
    grad[0] = -400*x0*(x1 - x0*x0) - 2*(1-x0);
    grad[1] = 200*(x1 - x0*x0);
    return;
}
/*
 * Inexact line search with only the function gradient
 *
 * The step size is chosen only to satisfy the strong curvature condition
 * (i.e. the second of the strong Wolfe conditions).
 *
 * Parameters
 * ----------
 * fprime : callable f(x, args, &grad)
 *     gradient of the objective function to be minimized
 * xk : vector
 *     current value of x
 * gk : vector
 *     current value of the gradient
 * pk : vector
 *     search direction
 * args : tuple, optional
 *     Extra arguments to be passed to `fprime`
 *
 * Returns
 * -------
 * alpha : float
 *     The step length
 *
 * Side Effects: these are set via output parameters
 * --------------------------------------------------
 * n_iters : int
 *     The number of times fprime() was evaluated
 * gfkp1 : vector
 *     The gradient value at the accepted alpha, `fprime(xk + alpha*pk)`
 *
 * Other Parameters
 * ----------------
 * alpha_guess : float, default = 1.0
 *     initial guess for the step size
 * curvature_condition : float, default = 0.9
 *     strength of the curvature condition. This is the c_2 in the Wolfe
 *     conditions article on Wikipedia
 *     (http://en.wikipedia.org/wiki/Wolfe_conditions), which recommends a
 *     value of 0.9. It should be between 0 and 1.
 * update_rate : float, default = 0.5
 *     alpha is multiplied by this constant after every unsuccessful trial,
 *     so it should be between 0 and 1.
 * maxiters : int, default = 5
 *     Maximum number of iterations. The line search is meant to be cheap,
 *     so only a few trial values of alpha are attempted.
 */
double line_search(void (*fprime)(const dvec &, const void*, dvec&),
                   const dvec& xk, const dvec& gk, const dvec& pk,
                   const void* args, dvec& gfkp1, int& n_iters,
                   const double alpha_guess=1.0, const double curvature_condition=0.9,
                   const double update_rate=0.5, const int maxiters=5)
{
    double alpha = alpha_guess;
    double initial_slope = inner_prod(gk, pk);

    n_iters = 0;
    for (int j = 0; j < maxiters; j++) {
        fprime(xk + alpha * pk, args, gfkp1);
        n_iters++;  // count every gradient evaluation
        if (fabs(inner_prod(gfkp1, pk)) < fabs(curvature_condition * initial_slope)) {
            break;
        } else {
            alpha *= update_rate;
        }
    }
    return alpha;
}
/*
 * Minimize a function, via only information about its gradient, using BFGS
 *
 * The difference between this and the "standard" BFGS algorithm is that the
 * line search component uses a weaker criterion, because it can't check
 * for sure that the function value actually decreased.
 *
 * Parameters
 * ----------
 * fprime : callable f(x, args, grad)
 *     gradient of the objective function to be minimized. `fprime` should take
 *     the input `x` and a void* of parameters (args) as the first two arguments,
 *     and then place its result in the `grad` vector passed by reference as the
 *     third argument.
 * x0 : vector
 *     Initial guess
 * args : void*
 *     Extra arguments to be passed to `fprime`
 * gtol : float, optional
 *     gradient norm must be less than `gtol` before successful termination
 * maxiter : int, optional
 *     Maximum number of iterations to perform.
 *
 * Returns
 * -------
 * xopt : vector
 *     Parameters which minimize `f`, and which are a root of the gradient,
 *     `fprime`
 * gopt : vector
 *     value of the gradient at `xopt`, which should be near zero
 * Hopt : matrix
 *     final estimate of the inverse Hessian at `xopt`
 * n_grad_calls : int
 *     number of gradient calls made
 */
boost::tuple< dvec, dvec, dmat, int> fmin_bfgs(void (*fprime)(const dvec &, const void*, dvec&),
    const dvec& x0, const void* args, const double gtol=1e-5, int maxiter=-1)
{
    int N = x0.size();           // degrees of freedom
    int k = 0;                   // iteration counter
    dvec gfk(N);                 // container for the current gradient
    dvec gfkp1(N);               // container for the next gradient
    dmat Hk(N, N);               // approximation of the inverse Hessian
    identity_matrix<double> I(N);
    Hk = I;

    if (maxiter < 0) {
        maxiter = 200*N;
    }

    fprime(x0, args, gfk);       // set the initial gradient
    int n_grad_calls = 1;        // we want to keep a record, for reporting performance
    double gnorm = norm_2(gfk);  // initial norm
    dvec xk(x0);

    while ((gnorm > gtol) and (k < maxiter)) {
        dvec pk(-prod(Hk, gfk)); // search direction

        int grad_calls;
        double alpha_k = line_search(fprime, xk, gfk, pk, args, gfkp1, grad_calls);
        n_grad_calls += grad_calls;

        // advance in the direction of the step
        dvec xkp1(xk + (alpha_k * pk));
        dvec sk(xkp1 - xk);
        xk = xkp1;

        dvec yk(gfkp1 - gfk);
        gfk = gfkp1;
        k++;

        gnorm = norm_2(gfk);
        if (gnorm < gtol) {
            break;
        }

        double rhok = 1.0 / inner_prod(yk, sk);
        if (std::isnan(rhok) || std::isinf(rhok)) {
            cout << "Divide-by-zero encountered: rhok assumed large" << endl;
            rhok = 1000.0;
        }

        // main BFGS update to the inverse Hessian
        dmat A1 = I - outer_prod(sk, yk) * rhok;
        dmat A2 = I - outer_prod(yk, sk) * rhok;
        dmat Hk_A2 = prod(Hk, A2);
        dmat new_Hk = prod(A1, Hk_A2) + rhok * outer_prod(sk, sk);
        Hk = new_Hk;
    }

    if (k >= maxiter) {
        cout << "Warning: " << maxiter << " iterations exceeded" << endl;
        cout << "    Current gnorm: " << gnorm << endl;
        cout << "    grad calls: " << n_grad_calls << endl;
        cout << "    iterations: " << k << endl;
    } else if (gnorm < gtol) {
        cout << "Optimization terminated successfully." << endl;
        cout << "    Current gnorm: " << gnorm << endl;
        cout << "    grad calls: " << n_grad_calls << endl;
        cout << "    iterations: " << k << endl;
    }

    return boost::make_tuple(xk, gfk, Hk, n_grad_calls);
}
struct Params {
    int a;
};

// test function
int main(void) {
    std::vector<double> x0(2);
    Params p = {1};
    x0[0] = -50;
    x0[1] = 20.0;

    // copy the initial values into ublas::vector containers
    dvec ublas_x0(2);
    std::copy(x0.begin(), x0.end(), ublas_x0.begin());

    boost::tuple< dvec, dvec, dmat, int> result;
    result = fmin_bfgs(&rosen_der, ublas_x0, &p);

    dvec xopt = result.get<0>();
    assert(fabs(xopt[0] - 1) < 1e-5);
    assert(fabs(xopt[1] - 1) < 1e-5);
}
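A note on building (my assumption, since the gist does not ship a build command): with the Boost headers on the include path, the C++ file above should compile with an ordinary invocation along the lines of `g++ -std=c++11 -O2 bfgs.cpp -o bfgs`, where the file name bfgs.cpp is hypothetical. The -std=c++11 flag is needed for std::isnan and std::isinf. Running the resulting binary exercises the Rosenbrock test in main().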
# Copyright (c) 2013, SciPy Developers, Robert McGibbon
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
import numpy as np
from collections import namedtuple

Result = namedtuple('Result', ['xopt', 'gopt', 'Hopt', 'n_grad_calls'])
def fmin_bfgs(fprime, x0, args=(), gtol=1e-5, callback=None, maxiter=None):
    """Minimize a function, via only information about its gradient, using BFGS

    The difference between this and the "standard" BFGS algorithm is that the
    line search component uses a weaker criterion, because it can't check
    for sure that the function value actually decreased.

    Parameters
    ----------
    fprime : callable f(x, *args)
        gradient of the objective function to be minimized
    x0 : ndarray
        Initial guess
    args : tuple, optional
        Extra arguments to be passed to `fprime`
    gtol : float, optional
        gradient norm must be less than `gtol` before successful termination
    callback : callable, optional
        An optional user-supplied function to call after each iteration.
        Called as `callback(xk)`, where `xk` is the current parameter vector.
    maxiter : int, optional
        Maximum number of iterations to perform.

    Returns
    -------
    xopt : ndarray
        Parameters which minimize `f`, and which are a root of the gradient,
        `fprime`
    gopt : ndarray
        value of the gradient at `xopt`, which should be near zero
    Hopt : ndarray
        final estimate of the inverse Hessian at `xopt`
    n_grad_calls : int
        number of gradient calls made
    """
    x0 = np.asarray(x0).flatten()
    if maxiter is None:
        maxiter = len(x0) * 200

    gfk = fprime(x0, *args)      # initial gradient
    n_grad_calls = 1             # number of calls to fprime()
    k = 0                        # iteration counter
    N = len(x0)                  # degrees of freedom
    I = np.eye(N, dtype=int)
    Hk = I                       # initial guess of the inverse Hessian
    xk = x0
    sk = [2 * gtol]
    gnorm = np.linalg.norm(gfk)

    while (gnorm > gtol) and (k < maxiter):
        # search direction
        pk = -np.dot(Hk, gfk)
        alpha_k, gfkp1, ls_grad_calls = _line_search(fprime, xk, gfk, pk, args)
        n_grad_calls += ls_grad_calls

        # advance in the direction of the step
        xkp1 = xk + alpha_k * pk
        sk = xkp1 - xk
        xk = xkp1

        if gfkp1 is None:
            gfkp1 = fprime(xkp1, *args)
            n_grad_calls += 1

        yk = gfkp1 - gfk
        gfk = gfkp1
        if callback is not None:
            callback(xk)
        k += 1

        gnorm = np.linalg.norm(gfk)
        if gnorm < gtol:
            break

        try:
            # this was handled in Numeric; keep it here for extra safety
            rhok = 1.0 / (np.dot(yk, sk))
        except ZeroDivisionError:
            rhok = 1000.0
            print("Divide-by-zero encountered: rhok assumed large")
        if np.isinf(rhok):  # numpy returns inf rather than raising, so patch it here
            rhok = 1000.0
            print("Divide-by-zero encountered: rhok assumed large")

        # main bfgs update here. this is copied straight from
        # scipy.optimize._minimize_bfgs
        A1 = I - sk[:, np.newaxis] * yk[np.newaxis, :] * rhok
        A2 = I - yk[:, np.newaxis] * sk[np.newaxis, :] * rhok
        Hk = np.dot(A1, np.dot(Hk, A2)) + rhok * sk[:, np.newaxis] \
            * sk[np.newaxis, :]
    if k >= maxiter:
        print("Warning: %d iterations exceeded" % maxiter)
        print("    Current gnorm: %f" % gnorm)
        print("    grad calls: %d" % n_grad_calls)
        print("    iterations: %d" % k)
    elif gnorm < gtol:
        print("Optimization terminated successfully.")
        print("    Current gnorm: %f" % gnorm)
        print("    grad calls: %d" % n_grad_calls)
        print("    iterations: %d" % k)

    return Result(xopt=xk, gopt=gfk, Hopt=Hk, n_grad_calls=n_grad_calls)
def test_fmin_bfgs_1():
    import scipy.optimize
    #def callback(xk):
    #    pass
    print(fmin_bfgs(scipy.optimize.rosen_der, [-50, 20]).xopt)
    #scipy.optimize.fmin_bfgs(scipy.optimize.rosen, [10, 20], scipy.optimize.rosen_der)
def _line_search(fprime, xk, gk, pk, args=(), alpha_guess=1.0, curvature_condition=0.9,
                 update_rate=0.5, maxiters=5):
    """Inexact line search with only the function gradient

    The step size is chosen only to satisfy the strong curvature condition
    (i.e. the second of the strong Wolfe conditions).

    Parameters
    ----------
    fprime : callable f(x, *args)
        gradient of the objective function to be minimized
    xk : ndarray
        current value of x
    gk : ndarray
        current value of the gradient
    pk : ndarray
        search direction
    args : tuple, optional
        Extra arguments to be passed to `fprime`

    Returns
    -------
    alpha : float
        The step length
    gk : ndarray
        The gradient value at the accepted alpha, `fprime(xk + alpha*pk)`
    n_evaluations : int
        The number of times fprime() was evaluated

    Other Parameters
    ----------------
    alpha_guess : float, default = 1.0
        initial guess for the step size
    curvature_condition : float, default = 0.9
        strength of the curvature condition. This is the c_2 in the Wolfe
        conditions article on Wikipedia
        (http://en.wikipedia.org/wiki/Wolfe_conditions), which recommends a
        value of 0.9. It should be between 0 and 1.
    update_rate : float, default = 0.5
        alpha is multiplied by this constant after every unsuccessful trial,
        so it should be between 0 and 1.
    maxiters : int, default = 5
        Maximum number of iterations. The line search is meant to be cheap,
        so only a few trial values of alpha are attempted.
    """
    alpha = alpha_guess
    initial_slope = np.dot(gk, pk)

    for j in range(maxiters):
        gk = fprime(xk + alpha * pk, *args)
        if np.abs(np.dot(gk, pk)) < np.abs(curvature_condition * initial_slope):
            break
        else:
            alpha *= update_rate

    # j+1 is the final number of calls to fprime()
    return alpha, gk, j + 1


if __name__ == '__main__':
    test_fmin_bfgs_1()
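To round things out, here is a minimal usage sketch for the Python version under my own assumptions: quad_grad, A, and c are hypothetical names that are not part of the gist, while fmin_bfgs and Result refer to the definitions above. It minimizes a simple quadratic f(x) = 0.5 (x - c)^T A (x - c) using only its gradient, so the optimizer should recover x = c.

# Hypothetical usage sketch (not part of the original gist).
def quad_grad(x, A, c):
    # gradient of the quadratic f(x) = 0.5 * (x - c)^T A (x - c)
    return np.dot(A, x - c)

A = np.array([[3.0, 0.5],
              [0.5, 1.0]])   # symmetric positive-definite
c = np.array([1.0, -2.0])    # the minimizer we expect to recover

result = fmin_bfgs(quad_grad, x0=[0.0, 0.0], args=(A, c))
print(result.xopt)           # should be close to c
print(result.n_grad_calls)   # total number of gradient evaluations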