Gradient descent, Newton's method and BFGS, all in one go.
// NOTE: return to revision 2 for a no-dependency, one-dimensional version of gradient and Newton descent.
const { Matrix, Vector } = require('sylvester');
const { isEqual } = require('lodash');
const defaults = {
  gradientTolerance: 0.01,
  maxIterates: 10,
  dieOnBadStep: false,
  allowNumericalDifferentiation: true,
  allowBfgs: true,
  // Wolfe line-search constants; these are the conventional textbook values, and the
  // sanity check in descent() below requires 0 < c1 < c2 < 1.
  wolfe: { c1: 1e-4, c2: 0.9 },
};
// BFGS update of the inverse-Hessian approximation, where `step` is s (the step just
// taken) and `gradientChange` is y (the change in the gradient across that step).
function computeBfgsUpdate(newGradient, lastGradient, step, appliedHessian) {
  const gradientChange = newGradient.transpose().subtract(lastGradient.transpose());
  const firstScalar = step
    .transpose()
    .multiply(gradientChange)
    .add(gradientChange
      .transpose()
      .multiply(appliedHessian)
      .multiply(gradientChange))
    .e(1, 1);
  const secondScalar = step
    .transpose()
    .multiply(gradientChange)
    .e(1, 1);
  // secondScalar is s'y; if it is zero, the curvature condition has failed and the
  // divisions below would blow up.
  if (secondScalar === 0) {
    throw new Error('We appear to have overflowed trying to compute the BFGS update. Reduce your precision or rewrite this code to be more stable.');
  }
  const firstTerm = step.multiply(step.transpose()).multiply(firstScalar / secondScalar ** 2);
  const secondTerm = appliedHessian
    .multiply(gradientChange)
    .multiply(step.transpose())
    .add(step.multiply(gradientChange.transpose()).multiply(appliedHessian))
    .multiply(1 / secondScalar);
  return appliedHessian.add(firstTerm).subtract(secondTerm);
}
function descent({ f, g, h } = {}, x0 = Vector.create([0]), alpha = 0.01, options = defaults) {
  options = Object.assign({}, defaults, options);
  // Here be some sanity checks.
  if (!g) {
    // TODO: if the gradient isn't provided, approximate it numerically.
    throw new Error('You must provide a gradient.');
  }
  if (!isEqual(g(x0).dimensions(), x0.dimensions())) {
    throw new Error('Gradient must provide a result the same dimensionality as x.');
  }
  if (options.wolfe.c1 <= 0 || options.wolfe.c1 >= options.wolfe.c2 || options.wolfe.c2 >= 1) {
    throw new Error('Wolfe conditions constants must satisfy 0 < c1 < c2 < 1.');
  }
  // If the Hessian isn't provided, fall back to the identity: with BFGS disabled this
  // collapses to plain gradient descent, and with BFGS enabled it seeds the approximation.
  let useBfgs = false;
  if (!h) {
    useBfgs = options.allowBfgs;
    h = () => Matrix.I(x0.dimensions().cols);
  } else {
    const hessianDimensions = h(x0).dimensions();
    if (
      hessianDimensions.cols !== x0.dimensions().cols ||
      hessianDimensions.rows !== hessianDimensions.cols
    ) {
      throw new Error('Hessian function should provide a square matrix the same size as x.');
    }
  }
  let x = x0;
  let i;
  // appliedHessian holds an estimate of the inverse Hessian and takes one of three forms:
  // 1. In the gradient descent case, it is the identity matrix.
  // 2. In the Newton method case (when h(x) is available), it is the true inverse Hessian.
  // 3. In the BFGS case (when h(x) is not available), it is the BFGS approximation of the inverse Hessian.
  let appliedHessian = h(x0).inverse();
  // Tracks the previous objective value; presumably intended for the dieOnBadStep option,
  // which is not yet wired up.
  let lastValue = Infinity;
  for (i = 0; i < options.maxIterates; i += 1) {
    const value = f(x);
    const gradient = g(x);
    if (gradient.modulus() < options.gradientTolerance) {
      console.log('Finished because gradient is smaller than tolerance.');
      break;
    }
    // Take a step of size alpha in the direction defined by the gradient, preconditioned
    // by the inverse Hessian. Minimizing the local quadratic model
    //   f(x + d) ~= f(x) + g(x)'d + (1/2) d'h(x)d
    // over d gives d = -h(x)^-1 g(x), which is why "gradient divided by Hessian" is the
    // right step direction.
    if (!useBfgs) {
      appliedHessian = h(x).inverse();
    }
    // TODO: alpha should actually be adaptive/line search, alpha = arg min f(x + step)
    const direction = appliedHessian.multiply(gradient);
    const step = direction.multiply(-1 * alpha);
    console.log('x = ', x, 'f(x) =', value, 'g(x) =', gradient, 'h^-1(x) ~=', appliedHessian);
    x = x.add(step);
    console.log('Moving x by ', step);
    const nextGradient = g(x);
    if (useBfgs) {
      // If we're using BFGS (no Hessian provided and allowBfgs is on), this updates the
      // appliedHessian to be a closer estimate according to the following formula:
      // https://wikimedia.org/api/rest_v1/media/math/render/svg/a69329f93f8e0c14b43d47cfff241160d4ab4c96
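      // Written out, with s the step just taken and y the change in the gradient, the
      // inverse-Hessian update computed by computeBfgsUpdate above is:
      //   H' = H + (s'y + y'Hy)(ss')/(s'y)^2 - (Hys' + sy'H)/(s'y)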
      appliedHessian = computeBfgsUpdate(nextGradient, gradient, step.transpose(), appliedHessian);
    }
    // TODO: Wolfe conditions/step size considerations seem to vary a lot (reference
    // formulas below):
    // -- in gradient descent, updating step size is essential.
    // -- in BFGS, doing this seems to cause problems.
    // -- in pure Newton, step size = 1.
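    // For reference, the two Wolfe conditions on a step size a along a descent direction d are:
    //   sufficient decrease: f(x + a*d) <= f(x) + c1 * a * g(x)'d
    //   curvature:           g(x + a*d)'d >= c2 * g(x)'d
    // with 0 < c1 < c2 < 1 (the constants validated at the top of descent()).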
    lastValue = value;
  }
  return {
    x,
    value: f(x),
    gradient: g(x),
    // NOTE: in the gradient descent case this is just I; in the Newton case it is the
    // exact Hessian from the last iterate, not an estimate.
    estimatedHessian: appliedHessian.inverse(),
    iterationCount: i,
  };
}
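
A minimal usage sketch, assuming the same sylvester package required at the top; the quadratic objective and starting point here are made up purely for illustration and are not part of the original gist:

// Minimize f(x, y) = (x - 3)^2 + (y + 1)^2, whose unique minimum is at (3, -1).
const result = descent(
  {
    f: v => (v.e(1) - 3) ** 2 + (v.e(2) + 1) ** 2,
    g: v => Vector.create([2 * (v.e(1) - 3), 2 * (v.e(2) + 1)]),
    h: () => Matrix.I(2).multiply(2), // constant Hessian: pure Newton lands on the minimum in one step
  },
  Vector.create([0, 0]), // starting point
  1, // with an exact Hessian, the natural Newton step size is 1
  { maxIterates: 20 },
);
console.log(result.x.inspect(), result.value); // expect x ~ [3, -1] and value ~ 0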