@rmcgibbo
Last active August 12, 2017 03:42
BFGS optimization using only information about the function gradient (no knowledge of the function value). I've coded it in C++ (Boost uBLAS) and Python (NumPy). Most of the code is copied directly from scipy.optimize._minimize_bfgs, with only minor modifications.
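For a quick sense of the API, here is a minimal usage sketch of the Python version defined below; it simply mirrors the test at the bottom of the Python file, and only the call pattern is being illustrated:

import scipy.optimize

# minimize the 2D Rosenbrock function given only its gradient
result = fmin_bfgs(scipy.optimize.rosen_der, [-50, 20], gtol=1e-5)
print result.xopt          # converges to approximately [1.0, 1.0]
print result.n_grad_calls  # total number of gradient evaluations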
/* Copyright (c) 2013, SciPy Developers, Robert McGibbon
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*/
#include <iostream>
#include <cmath>      // fabs, std::isnan, std::isinf
#include <vector>     // std::vector, used in main()
#include <algorithm>  // std::copy, std::min
#include <cassert>
#include "boost/tuple/tuple.hpp"
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/vector.hpp>
#include <boost/numeric/ublas/matrix.hpp>
using namespace boost::numeric::ublas;
typedef vector<double> dvec;
typedef matrix<double> dmat;
using std::cout;
using std::endl;
/*
* Derivative of the Rosenbrock function of two variables. This is just used for
* testing, but it illustrates the API that the functions need to implement for
* the optimizer. They need to take a const reference x as the first parameter,
* and a const void* pointing to any additional parameters. They "return"
* their result by writing it into the grad vector, which has the same size
* as x.
*/
void rosen_der(const dvec& x, const void* params, vector<double>& grad)
{
    double x0 = x[0];
    double x1 = x[1];
    grad[0] = -400*x0*(x1 - x0*x0) - 2*(1 - x0);
    grad[1] = 200*(x1 - x0*x0);
    return;
}
/*
* Inexact line search with only the function gradient
*
* The step size is chosen only to satisfy the strong curvature condition (i.e.,
* the second of the strong Wolfe conditions).
*
* Parameters
* ----------
* fprime : callable f(x, args, &grad)
* gradient of the objective function to be minimized
* xk : vector
* current value of x
* gk : vector
* current value of the gradient
* pk : vector
* search direction
* args : tuple, optional
* Extra arguments to be passed to `fprime`
*
* Returns
* -------
* alpha : float
* The step length
*
* Side Effects: these are returned through output parameters
* ------------
* n_iters : int
* The number of times fprime() was evaluated
* gfkp1 : vector
* The gradient value at alpha, `fprime(xk + alpha*pk)`
*
* Other Parameters
* -----------------
* alpha_guess : float, default = 1.0
* initial guess for the step size
* curvature_condition : float, default = 0.9
* strength of the curvature condition. This is the constant c_2 described at
* http://en.wikipedia.org/wiki/Wolfe_conditions, where 0.9 is the recommended
* value. It should be between 0 and 1.
* update_rate : float, default=0.5
* Factor by which alpha is multiplied at each iteration until the curvature
* condition is satisfied. It should be between 0 and 1.
* maxiters : int, default=5
* Maximum number of iterations. The line search is meant to be cheap, so only
* a few adjustments of alpha are attempted.
*
*/
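/*
 * Concretely, in the notation of the parameters above, a trial step length
 * alpha is accepted as soon as
 *
 *     | fprime(xk + alpha*pk) . pk |  <  curvature_condition * | fprime(xk) . pk |
 *
 * otherwise alpha is shrunk by update_rate and the test is repeated, at most
 * maxiters times.
 */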
double line_search(void (*fprime)(const dvec &, const void*, dvec&),
                   const dvec& xk, const dvec& gk, const dvec& pk,
                   const void* args, dvec& gfkp1, int& n_iters,
                   const double alpha_guess=1.0, const double curvature_condition=0.9,
                   const double update_rate=0.5, const int maxiters=5)
{
    int j;
    double alpha = alpha_guess;
    double initial_slope = inner_prod(gk, pk);
    for (j = 0; j < maxiters; j++) {
        // evaluate the gradient at the trial point xk + alpha*pk
        fprime(xk + alpha * pk, args, gfkp1);
        if (fabs(inner_prod(gfkp1, pk)) < fabs(curvature_condition * initial_slope)) {
            // strong curvature condition satisfied: accept this alpha
            break;
        } else {
            alpha *= update_rate;
        }
    }
    // report how many times fprime() was actually called
    n_iters = std::min(j + 1, maxiters);
    return alpha;
}
/*
* Minimize a function, via only information about its gradient, using BFGS
*
* The difference between this and the "standard" BFGS algorithm is that the
* line search component uses a weaker criterion, because it can't check
* for sure that the function value actually decreased.
*
* Parameters
* ----------
* fprime : callable f(x, args, &grad)
* gradient of the objective function to be minimized. `fprime` should take
* the input `x` and a void* of parameters (args) as the first two arguments,
* and then write its result into the referenced grad vector (third argument).
* x0 : vector
* Initial guess
* args : void*
* Extra arguments to be passed to `fprime`
* gtol : float, optional
* gradient norm must be less than `gtol` before successful termination
* maxiter : int, optional
* Maximum number of iterations to perform.
*
* Returns
* -------
* xopt : vector
* Parameters which minimize `f`, and which are a root of the gradient,
* `fprime`
* gopt : vector
* value of the gradient at `xopt`, which should be near zero
* Hopt : matrix
* final estimate of the inverse Hessian matrix at `xopt`
* n_grad_calls : int
* number of gradient calls made
*/
boost::tuple< dvec, dvec, dmat, int> fmin_bfgs(void (*fprime)(const dvec &, const void*, dvec&),
                                               const dvec& x0, const void* args, const double gtol=1e-5, int maxiter=-1)
{
    int N = x0.size();          // degrees of freedom
    int k = 0;                  // iteration counter
    dvec gfk(N);                // container for the current gradient
    dvec gfkp1(N);              // container for the next gradient
    dmat Hk(N, N);              // container for the inverse Hessian approximation
    identity_matrix<double> I(N);
    Hk = I;
    if (maxiter < 0) {
        maxiter = 200*N;
    }
    fprime(x0, args, gfk);      // set the initial gradient
    int n_grad_calls = 1;       // we want to keep a record, for reporting performance
    double gnorm = norm_2(gfk); // initial norm
    dvec xk(x0);

    while ((gnorm > gtol) and (k < maxiter)) {
        // search direction
        dvec pk(-prod(Hk, gfk));
        int grad_calls;
        double alpha_k = line_search(fprime, xk, gfk, pk, args, gfkp1, grad_calls);
        n_grad_calls += grad_calls;

        // advance in the direction of the step
        dvec xkp1(xk + (alpha_k * pk));
        dvec sk(xkp1 - xk);
        xk = xkp1;
        dvec yk(gfkp1 - gfk);
        gfk = gfkp1;
        k++;

        gnorm = norm_2(gfk);
        if (gnorm < gtol) {
            break;
        }

        double rhok = 1.0 / inner_prod(yk, sk);
        if (std::isnan(rhok) || std::isinf(rhok)) {
            cout << "Divide-by-zero encountered: rhok assumed large" << endl;
            rhok = 1000.0;
        }
        // main BFGS update to the inverse Hessian approximation
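        // For reference, the update computed below (with rho = 1/(y.s)) is
        //     H_{k+1} = (I - rho*s*y^T) * H_k * (I - rho*y*s^T) + rho*s*s^T
        // i.e. the standard BFGS update of the inverse-Hessian approximation.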
        dmat A1 = I - outer_prod(sk, yk) * rhok;
        dmat A2 = I - outer_prod(yk, sk) * rhok;
        dmat Hk_A2 = prod(Hk, A2);
        dmat new_Hk = prod(A1, Hk_A2) + rhok * outer_prod(sk, sk);
        Hk = new_Hk;
    }

    if (k >= maxiter) {
        cout << "Warning: " << maxiter << " iterations exceeded" << endl;
        cout << "         Current gnorm: " << gnorm << endl;
        cout << "         grad calls: " << n_grad_calls << endl;
        cout << "         iterations: " << k << endl;
    } else if (gnorm < gtol) {
        cout << "Optimization terminated successfully." << endl;
        cout << "         Current gnorm: " << gnorm << endl;
        cout << "         grad calls: " << n_grad_calls << endl;
        cout << "         iterations: " << k << endl;
    }
    return boost::make_tuple(xk, gfk, Hk, n_grad_calls);
}
struct Params {
    int a;
};
// test function
int main(void) {
    std::vector<double> x0(2);
    Params p = {1};
    x0[0] = -50;
    x0[1] = 20.0;

    // copy the initial values into ublas::vector containers
    dvec ublas_x0(2);
    std::copy(x0.begin(), x0.end(), ublas_x0.begin());

    boost::tuple< dvec, dvec, dmat, int> result;
    result = fmin_bfgs(&rosen_der, ublas_x0, &p);

    // the Rosenbrock function has its minimum at (1, 1)
    dvec xopt = result.get<0>();
    assert(fabs(xopt[0] - 1) < 1e-5);
    assert(fabs(xopt[1] - 1) < 1e-5);
}
# Copyright (c) 2013, SciPy Developers, Robert McGibbon
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
import numpy as np
from collections import namedtuple
Result = namedtuple('Result', ['xopt', 'gopt', 'Hopt', 'n_grad_calls'])
def fmin_bfgs(fprime, x0, args=(), gtol=1e-5, callback=None, maxiter=None):
"""Minimize a function, via only information about its gradient, using BFGS
The difference between this and the "standard" BFGS algorithm is that the
line search component uses a weaker criterion, because it can't check
for sure that the function value actually decreased.
Parameters
----------
fprime : callable f(x, *args)
gradient of the objective function to be minimized
x0 : ndarray
Initial guess
args : tuple, optional
Extra arguments to be passed to `fprime`
gtol : float, optional
gradient norm must be less than `gtol` before succesful termination
callback : callable, optional
An optional user-supplied function to call after each iteration.
Called as `callback(xk)`, where `xk` is the current parameter vector.
maxiter : int, optional
Maximum number of iterations to perform.
Returns
-------
xopt : ndarray
Parameters which minimize `f`, and which are a root of the gradient,
`fprime`
gopt : ndrarray
value of the gradient at `xopt`, which should be near zero
Hopt : ndarray
final estimate of the hessian matrix at `xopt`
n_grad_calls : int
number of gradient calls made
"""
    x0 = np.asarray(x0).flatten()
    if maxiter is None:
        maxiter = len(x0)*200

    gfk = fprime(x0, *args)   # initial gradient
    n_grad_calls = 1          # number of calls to fprime()

    k = 0                     # iteration counter
    N = len(x0)               # degrees of freedom
    I = np.eye(N, dtype=int)
    Hk = I                    # initial guess of the (inverse) Hessian
    xk = x0
    sk = [2*gtol]             # not used below
    gnorm = np.linalg.norm(gfk)
    while (gnorm > gtol) and (k < maxiter):
        # search direction
        pk = -np.dot(Hk, gfk)

        alpha_k, gfkp1, ls_grad_calls = _line_search(fprime, xk, gfk, pk, args)
        n_grad_calls += ls_grad_calls

        # advance in the direction of the step
        xkp1 = xk + alpha_k * pk
        sk = xkp1 - xk
        xk = xkp1

        if gfkp1 is None:
            gfkp1 = fprime(xkp1, *args)
            n_grad_calls += 1

        yk = gfkp1 - gfk
        gfk = gfkp1

        if callback is not None:
            callback(xk)
        k += 1

        gnorm = np.linalg.norm(gfk)
        if gnorm < gtol:
            break

        try:  # this was handled in Numeric; keep it here for extra safety
            rhok = 1.0 / (np.dot(yk, sk))
        except ZeroDivisionError:
            rhok = 1000.0
            print "Divide-by-zero encountered: rhok assumed large"
        if np.isinf(rhok):  # this is a patch for numpy
            rhok = 1000.0
            print "Divide-by-zero encountered: rhok assumed large"

        # main bfgs update here. this is copied straight from
        # scipy.optimize._minimize_bfgs
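        # For reference, with rho = 1/(y.s) the lines below compute
        #   H_{k+1} = (I - rho s y^T) H_k (I - rho y s^T) + rho s s^T
        # i.e. the standard BFGS update of the inverse-Hessian approximation.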
        A1 = I - sk[:, np.newaxis] * yk[np.newaxis, :] * rhok
        A2 = I - yk[:, np.newaxis] * sk[np.newaxis, :] * rhok
        Hk = np.dot(A1, np.dot(Hk, A2)) + rhok * sk[:, np.newaxis] \
            * sk[np.newaxis, :]
    if k >= maxiter:
        print "Warning: %d iterations exceeded" % maxiter
        print "         Current gnorm: %f" % gnorm
        print "         grad calls: %d" % n_grad_calls
        print "         iterations: %d" % k
    elif gnorm < gtol:
        print "Optimization terminated successfully."
        print "         Current gnorm: %f" % gnorm
        print "         grad calls: %d" % n_grad_calls
        print "         iterations: %d" % k

    return Result(xopt=xk, gopt=gfk, Hopt=Hk, n_grad_calls=n_grad_calls)
def test_fmin_bfgs_1():
    import scipy.optimize

    #def callback(xk):
    #    pass

    print fmin_bfgs(scipy.optimize.rosen_der, [-50, 20]).xopt
    #scipy.optimize.fmin_bfgs(scipy.optimize.rosen, [10, 20], scipy.optimize.rosen_der)
def _line_search(fprime, xk, gk, pk, args=(), alpha_guess=1.0, curvature_condition=0.9,
                 update_rate=0.5, maxiters=5):
"""Inexact line search with only the function gradient
The step size is found only to satisfy the strong curvature condition, (i.e
the second of the strong Wolfe conditions)
Parameters
----------
fprime : callable f(x, *args)
gradient of the objective function to be minimized
xk : ndarray
current value of x
gk : ndarray
current value of the gradient
pk : ndarray
search direction
args : tuple, optional
Extra arguments to be passed to `fprime`
Returns
-------
alpha : float
The step length
n_evaluations : int
The number of times fprime() was evaluated
gk : ndarray
The gradient value at the alpha, `fprime(xk + alpha*pk)`
Other Parameters
-----------------
alpha_guess : float, default = 1.0
initial guess for the step size
curvature_condition : float, default = 0.9
strength of the curvature condition. this is the c_2 on the wikipedia
http://en.wikipedia.org/wiki/Wolfe_conditions, and is recommended to be
0.9 according to that page. It should be between 0 and 1.
update_rate : float, default=0.5
Basically, we keep decreasing the alpha by multiplying by this constant
every iteration. It should be between 0 and 1.
maxiters : int, default=5
Maximum number of iterations. The goal is that the line search step is
really quick, so we don't want to spend too much time fiddling with alpha
"""
    alpha = alpha_guess
    initial_slope = np.dot(gk, pk)

    for j in xrange(maxiters):
        gk = fprime(xk + alpha * pk, *args)
        if np.abs(np.dot(gk, pk)) < np.abs(curvature_condition * initial_slope):
            break
        else:
            alpha *= update_rate

    # j+1 is the final number of calls to fprime()
    return alpha, gk, j+1
if __name__ == '__main__':
    test_fmin_bfgs_1()