Spectrum of the extended feature Gram matrix of a single hidden layer ReLU MLP
"""Empirical evaluation of the extended feature Gram matrix of a ReLU MLP | |
Here we try to estimate the spectrum of the H^\infty matrix as defined in: | |
Gradient Descent Provably Optimizes Over-parameterized Neural Networks (2018) | |
Simon S. Du, Xiyu Zhai, Barnabas Poczos, Aarti Singh | |
https://arxiv.org/abs/1810.02054 | |
Theorem 4.1 relies on the assumption that H^\infty has a strictly positive | |
minimum eigenvalue. The following computes an estimate of this eigenvalue | |
for a toy digits dataset with 1797 samples of 64 dimensions. In this case | |
we find that this assumption holds with \lambda_0 > 1.3e-2. | |
""" | |
from time import time

import numpy as np
import numba
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.preprocessing import normalize
# Workaround: https://github.com/numba/numba/issues/3341
numba.config.THREADING_LAYER = 'workqueue'
@numba.jit(parallel=True)
def compute_h_inf(X, n_iter=int(1e4), seed=0):
    n_samples, n_features = X.shape
    H_inf = np.zeros(shape=(n_samples, n_samples), dtype=X.dtype)
    W = np.random.RandomState(seed).randn(n_iter, n_features)
    W_X = W @ X.T > 0
    Gram = X @ X.T
    # Could be implemented with einsum as follows:
    # np.einsum('ij,ki,kj->ij', Gram, W_X, W_X) / n_iter
    # but using explicit numba loops makes it possible to use multi-threading.
    scale = 1. / n_iter
    for k in range(n_iter):
        for i in numba.prange(n_samples):
            for j in range(n_samples):
                H_inf[i, j] += scale * Gram[i, j] * W_X[k, i] * W_X[k, j]
    return H_inf
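# Optional cross-check (a sketch, assuming the rows of X have unit norm, which
# holds after the call to normalize(X) below): the expectation estimated above
# is also known in closed form,
#
#     H^inf_{ij} = x_i^T x_j * (pi - arccos(x_i^T x_j)) / (2 * pi),
#
# so this hypothetical helper can be compared against the Monte Carlo estimate,
# e.g. np.abs(compute_h_inf(X) - compute_h_inf_closed_form(X)).max() should
# shrink as n_iter grows.
def compute_h_inf_closed_form(X):
    Gram = X @ X.T
    # Clip to guard arccos against round-off slightly outside [-1, 1].
    cos_angles = np.clip(Gram, -1.0, 1.0)
    return Gram * (np.pi - np.arccos(cos_angles)) / (2 * np.pi)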
digits = load_digits()
X, y = digits.data, digits.target
n_samples, n_features = X.shape
print(f"Loaded digits data (n_samples={n_samples}, n_features={n_features})")

print("Normalizing X...")
X = normalize(X)

print("Computing the spectrum of the data Gram matrix", end="", flush=True)
t0 = time()
eigvals_gram = np.linalg.eigvalsh(X @ X.T)
print(f" done in {time() - t0:0.3f}s")
print(f"lambda_min(XX^T): {eigvals_gram.min():0.3e}")
# We only have 64 features, so the rank of this Gram matrix is bounded by 64.

fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, constrained_layout=True,
                               figsize=(12, 8))
ax0.semilogy(eigvals_gram[::-1])
ax0.set_title('Spectrum of the data Gram matrix $XX^T$')
ax0.set_ylabel('Eigenvalue (logscale)')
for n_iter in [1_000, 10_000, 100_000]:
    print(f"Computing extended feature Gram H_inf with n_iter={n_iter}...",
          end="", flush=True)
    t0 = time()
    H_inf = compute_h_inf(X, n_iter=n_iter)
    print(f" done in {time() - t0:0.3f}s")
    print(f"H_inf.shape={H_inf.shape}")

    print("Checking that H_inf is symmetric...", end="", flush=True)
    np.testing.assert_allclose(H_inf, H_inf.T)
    print(" ok")

    print("Computing the spectrum of H_inf...", end="", flush=True)
    t0 = time()
    eigvals = np.linalg.eigvalsh(H_inf)
    print(f" done in {time() - t0:0.3f}s")
    print(f"lambda_min(H_inf): {eigvals.min():0.3e}")

    ax1.semilogy(eigvals[::-1])

ax1.set_title('Spectrum of the extended feature Gram matrix: $H^\infty$')
ax1.set_ylabel('Eigenvalue (logscale)')
ax1.set_xlabel('Eigenvalue rank')
plt.show()
Loaded digits data (n_samples=1797, n_features=64)
Normalizing X...
Computing the spectrum of the data Gram matrix done in 0.763s
lambda_min(XX^T): -2.796e-13
Computing extended feature Gram H_inf with n_iter=1000... done in 2.793s
H_inf.shape=(1797, 1797)
Checking that H_inf is symmetric... ok
Computing the spectrum of H_inf... done in 0.592s
lambda_min(H_inf): 3.083e-03
Computing extended feature Gram H_inf with n_iter=10000... done in 18.585s
H_inf.shape=(1797, 1797)
Checking that H_inf is symmetric... ok
Computing the spectrum of H_inf... done in 0.578s
lambda_min(H_inf): 1.112e-02
Computing extended feature Gram H_inf with n_iter=100000... done in 209.125s
H_inf.shape=(1797, 1797)
Checking that H_inf is symmetric... ok
Computing the spectrum of H_inf... done in 0.607s
lambda_min(H_inf): 1.354e-02
Here are the spectra (log scale):