@gburtini
Last active July 26, 2025 23:57
Gradient descent, Newton's method and BFGS, all in one go.
// NOTE: return to revision 2 for a no-dependency, one-dimensional version of gradient and Newton descent.
const { Matrix, Vector } = require('sylvester');
const { isEqual, omit } = require('lodash');
const defaults = {
  gradientTolerance: 0.01,
  maxIterates: 10,
  dieOnBadStep: false,
  allowNumericalDifferentiation: true,
  allowBfgs: true,
  // Wolfe constants are read by the sanity check in descent() below, so they need a default;
  // these are assumed textbook values (c1 = 1e-4, c2 = 0.9), not used for line search yet.
  wolfe: { c1: 1e-4, c2: 0.9 },
};
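
// For reference, the BFGS branch maintains an approximation H to the *inverse* Hessian directly.
// The standard inverse-Hessian update, written to match the code below (s is the step just taken,
// y is the change in gradient), is:
//   H_new = H + (s'y + y'Hy) (s s') / (s'y)^2 - (H y s' + s y' H) / (s'y)
// where firstScalar = s'y + y'Hy and secondScalar = s'y.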
function computeBfgsUpdate(newGradient, lastGradient, step, appliedHessian) {
  const gradientChange = newGradient.transpose().subtract(lastGradient.transpose());

  const firstScalar = step
    .transpose()
    .multiply(gradientChange)
    .add(gradientChange
      .transpose()
      .multiply(appliedHessian)
      .multiply(gradientChange))
    .e(1, 1);
  const secondScalar = step
    .transpose()
    .multiply(gradientChange)
    .e(1, 1);

  if (secondScalar === 0) {
    throw new Error('We appear to have overflowed trying to compute the BFGS update. Reduce your precision or rewrite this code to be more stable.');
  }

  const firstTerm = step.multiply(step.transpose()).multiply(firstScalar / secondScalar ** 2);
  const secondTerm = appliedHessian
    .multiply(gradientChange)
    .multiply(step.transpose())
    .add(step.multiply(gradientChange.transpose()).multiply(appliedHessian))
    .multiply(1 / secondScalar);

  return appliedHessian.add(firstTerm).subtract(secondTerm);
}
function descent({ f, g, h } = {}, x0 = Vector.create([0]), alpha = 0.01, options = defaults) {
  options = Object.assign({}, defaults, options);

  // Here be some sanity checks.
  if (!g) {
    // TODO: if the gradient isn't provided, approximate it numerically.
    throw new Error('You must provide a gradient.');
  }
  if (!isEqual(g(x0).dimensions(), x0.dimensions())) {
    throw new Error('Gradient must provide a result the same dimensionality as x.');
  }
  if (options.wolfe.c1 <= 0 || options.wolfe.c1 >= options.wolfe.c2 || options.wolfe.c2 >= 1) {
    throw new Error('Wolfe conditions constants must satisfy 0 < c1 < c2 < 1.');
  }
  // if the Hessian isn't provided, just make it the identity so this collapses to gradient descent.
  let useBfgs = false;
  if (!h) {
    useBfgs = options.allowBfgs;
    h = () => Matrix.I(x0.dimensions().cols);
  } else {
    const hessianDimensions = h(x0).dimensions();
    if (
      hessianDimensions.cols !== x0.dimensions().cols ||
      hessianDimensions.rows !== hessianDimensions.cols
    ) {
      throw new Error('Hessian function should provide a square matrix the same size as x.');
    }
  }

  let x = x0;
  let i;
  // appliedHessian takes one of three forms:
  // 1. In the gradient descent case, it is the identity matrix (which is its own inverse).
  // 2. In the Newton method case (when h(x) is available), it is the inverse of the Hessian at x.
  // 3. In the BFGS case (when h(x) is not available), it is the BFGS approximation to the inverse Hessian.
  let appliedHessian = h(x0).inverse();
  let lastValue = Infinity;

  for (i = 0; i < options.maxIterates; i += 1) {
    const value = f(x);
    const gradient = g(x);

    if (Math.abs(gradient.modulus()) < options.gradientTolerance) {
      console.log('Finished because gradient is smaller than tolerance.');
      break;
    }

    // take a step of size alpha in the direction defined by the gradient.
    if (!useBfgs) {
      // TODO: it would be valuable to document why gradient divided by hessian is approximately
      // the right step direction.
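      // (Sketch of the reasoning: minimizing the local quadratic model
      // f(x + d) ~= f(x) + g(x)'d + 0.5 d'h(x)d over d gives d = -h(x)^-1 g(x), so multiplying
      // the gradient by the inverse Hessian yields the Newton step direction; the -alpha
      // factor is applied below.)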
      appliedHessian = h(x).inverse();
    }

    // TODO: alpha should actually be adaptive/line search, alpha = arg min f(x + step)
    const direction = appliedHessian.multiply(gradient);
    const step = direction.multiply(-1 * alpha);

    console.log('x = ', x, 'f(x) =', value, 'g(x) =', gradient, 'h^-1(x) ~=', appliedHessian);
    x = x.add(step);
    console.log('Moving x by ', step);

    const nextGradient = g(x);
    if (useBfgs) {
      // If we're using BFGS (no Hessian provided and allowBfgs is on), this updates the
      // appliedHessian to be a closer estimate according to the following formula.
      // https://wikimedia.org/api/rest_v1/media/math/render/svg/a69329f93f8e0c14b43d47cfff241160d4ab4c96
      appliedHessian = computeBfgsUpdate(nextGradient, gradient, step.transpose(), appliedHessian);
    }

    // TODO: Wolfe conditions/step size considerations seem to vary a lot.
    // -- in gradient descent, updating step size is essential.
    // -- in BFGS, doing this seems to cause problems.
    // -- in pure Newton, step size = 1.
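    // For reference, the Wolfe conditions that options.wolfe.c1/c2 are meant to enforce on a
    // step length a along descent direction d (sketched here; not yet wired into this loop):
    //   sufficient decrease: f(x + a d) <= f(x) + c1 * a * g(x)'d
    //   curvature:           g(x + a d)'d >= c2 * g(x)'d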
    lastValue = value;
  }

  return {
    x,
    value: f(x),
    gradient: g(x),
    // NOTE: in the gradient descent case this is just I; in the Newton case it is the exact
    // Hessian at the last iterate rather than an estimate.
    estimatedHessian: appliedHessian.inverse(),
    iterationCount: i,
  };
}
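
// A minimal usage sketch (illustrative only; the quadratic objective, starting point, and
// alpha below are arbitrary choices, and this assumes the sylvester calls above behave as
// the gist expects). With an analytic Hessian supplied, descent() takes pure Newton steps,
// so alpha = 1 lands on the minimizer of this quadratic in a single iteration.
const example = descent(
  {
    f: x => (x.e(1) - 3) ** 2 + 2 * (x.e(2) + 1) ** 2,
    g: x => Vector.create([2 * (x.e(1) - 3), 4 * (x.e(2) + 1)]),
    h: () => Matrix.create([[2, 0], [0, 4]]),
  },
  Vector.create([0, 0]),
  1
);
console.log('example minimizer:', example.x, 'value:', example.value);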