Gradient descent, Newton's method and BFGS, all in one go.
// NOTE: return to revision 2 for a no-dependency, one-dimensional version of gradient and Newton descent.
const { Matrix, Vector } = require('sylvester');
const { isEqual } = require('lodash');
const defaults = {
  gradientTolerance: 0.01,
  maxIterates: 10,
  dieOnBadStep: false,
  allowNumericalDifferentiation: true,
  allowBfgs: true,
  // Wolfe line-search constants; these are the conventional textbook values, and the
  // sanity check in descent() below requires 0 < c1 < c2 < 1.
  wolfe: { c1: 1e-4, c2: 0.9 },
};
// BFGS update of the inverse-Hessian approximation, where `step` is s (the step just
// taken) and `gradientChange` is y (the change in the gradient across that step).
function computeBfgsUpdate(newGradient, lastGradient, step, appliedHessian) {
  const gradientChange = newGradient.transpose().subtract(lastGradient.transpose());
  const firstScalar = step
    .transpose()
    .multiply(gradientChange)
    .add(gradientChange
      .transpose()
      .multiply(appliedHessian)
      .multiply(gradientChange))
    .e(1, 1);
  const secondScalar = step
    .transpose()
    .multiply(gradientChange)
    .e(1, 1);
  // secondScalar is s'y; if it is zero, the curvature condition has failed and the
  // divisions below would blow up.
  if (secondScalar === 0) {
    throw new Error('We appear to have overflowed trying to compute the BFGS update. Reduce your precision or rewrite this code to be more stable.');
  }
  const firstTerm = step.multiply(step.transpose()).multiply(firstScalar / secondScalar ** 2);
  const secondTerm = appliedHessian
    .multiply(gradientChange)
    .multiply(step.transpose())
    .add(step.multiply(gradientChange.transpose()).multiply(appliedHessian))
    .multiply(1 / secondScalar);
  return appliedHessian.add(firstTerm).subtract(secondTerm);
}
function descent({ f, g, h } = {}, x0 = Vector.create([0]), alpha = 0.01, options = defaults) {
  options = Object.assign({}, defaults, options);
  // Here be some sanity checks.
  if (!g) {
    // TODO: if the gradient isn't provided, approximate it numerically.
    throw new Error('You must provide a gradient.');
  }
  if (!isEqual(g(x0).dimensions(), x0.dimensions())) {
    throw new Error('Gradient must provide a result the same dimensionality as x.');
  }
  if (options.wolfe.c1 <= 0 || options.wolfe.c1 >= options.wolfe.c2 || options.wolfe.c2 >= 1) {
    throw new Error('Wolfe conditions constants must satisfy 0 < c1 < c2 < 1.');
  }
  // If the Hessian isn't provided, fall back to the identity: with BFGS disabled this
  // collapses to plain gradient descent, and with BFGS enabled it seeds the approximation.
  let useBfgs = false;
  if (!h) {
    useBfgs = options.allowBfgs;
    h = () => Matrix.I(x0.dimensions().cols);
  } else {
    const hessianDimensions = h(x0).dimensions();
    if (
      hessianDimensions.cols !== x0.dimensions().cols ||
      hessianDimensions.rows !== hessianDimensions.cols
    ) {
      throw new Error('Hessian function should provide a square matrix the same size as x.');
    }
  }
  let x = x0;
  let i;
  // appliedHessian holds an estimate of the inverse Hessian and takes one of three forms:
  // 1. In the gradient descent case, it is the identity matrix.
  // 2. In the Newton method case (when h(x) is available), it is the true inverse Hessian.
  // 3. In the BFGS case (when h(x) is not available), it is the BFGS approximation of the inverse Hessian.
  let appliedHessian = h(x0).inverse();
  // Tracks the previous objective value; presumably intended for the dieOnBadStep option,
  // which is not yet wired up.
  let lastValue = Infinity;
  for (i = 0; i < options.maxIterates; i += 1) {
    const value = f(x);
    const gradient = g(x);
    if (gradient.modulus() < options.gradientTolerance) {
      console.log('Finished because gradient is smaller than tolerance.');
      break;
    }
    // Take a step of size alpha in the direction defined by the gradient, preconditioned
    // by the inverse Hessian. Minimizing the local quadratic model
    //   f(x + d) ~= f(x) + g(x)'d + (1/2) d'h(x)d
    // over d gives d = -h(x)^-1 g(x), which is why "gradient divided by Hessian" is the
    // right step direction.
    if (!useBfgs) {
      appliedHessian = h(x).inverse();
    }
    // TODO: alpha should actually be adaptive/line search, alpha = arg min f(x + step)
    const direction = appliedHessian.multiply(gradient);
    const step = direction.multiply(-1 * alpha);
    console.log('x = ', x, 'f(x) =', value, 'g(x) =', gradient, 'h^-1(x) ~=', appliedHessian);
    x = x.add(step);
    console.log('Moving x by ', step);
    const nextGradient = g(x);
    if (useBfgs) {
      // If we're using BFGS (no Hessian provided and allowBfgs is on), this updates the
      // appliedHessian to be a closer estimate according to the following formula:
      // https://wikimedia.org/api/rest_v1/media/math/render/svg/a69329f93f8e0c14b43d47cfff241160d4ab4c96
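      // Written out, with s the step just taken and y the change in the gradient, the
      // inverse-Hessian update computed by computeBfgsUpdate above is:
      //   H' = H + (s'y + y'Hy)(ss')/(s'y)^2 - (Hys' + sy'H)/(s'y)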
      appliedHessian = computeBfgsUpdate(nextGradient, gradient, step.transpose(), appliedHessian);
    }
    // TODO: Wolfe conditions/step size considerations seem to vary a lot (reference
    // formulas below):
    // -- in gradient descent, updating step size is essential.
    // -- in BFGS, doing this seems to cause problems.
    // -- in pure Newton, step size = 1.
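    // For reference, the two Wolfe conditions on a step size a along a descent direction d are:
    //   sufficient decrease: f(x + a*d) <= f(x) + c1 * a * g(x)'d
    //   curvature:           g(x + a*d)'d >= c2 * g(x)'d
    // with 0 < c1 < c2 < 1 (the constants validated at the top of descent()).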
    lastValue = value;
  }
  return {
    x,
    value: f(x),
    gradient: g(x),
    // NOTE: in the gradient descent case this is just I; in the Newton case it is the
    // exact Hessian from the last iterate, not an estimate.
    estimatedHessian: appliedHessian.inverse(),
    iterationCount: i,
  };
}
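
A minimal usage sketch, assuming the same sylvester package required at the top; the quadratic objective and starting point here are made up purely for illustration and are not part of the original gist:

// Minimize f(x, y) = (x - 3)^2 + (y + 1)^2, whose unique minimum is at (3, -1).
const result = descent(
  {
    f: v => (v.e(1) - 3) ** 2 + (v.e(2) + 1) ** 2,
    g: v => Vector.create([2 * (v.e(1) - 3), 2 * (v.e(2) + 1)]),
    h: () => Matrix.I(2).multiply(2), // constant Hessian: pure Newton lands on the minimum in one step
  },
  Vector.create([0, 0]), // starting point
  1, // with an exact Hessian, the natural Newton step size is 1
  { maxIterates: 20 },
);
console.log(result.x.inspect(), result.value); // expect x ~ [3, -1] and value ~ 0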