@norabelrose
Created October 17, 2024 05:41
Analytically computing the least-squares linear approximation to a ReLU network
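A quick sketch of the identity the code below relies on, stated informally: for x ~ N(0, I), each hidden pre-activation y_j = w_j · x + b_j is Gaussian with standard deviation sigma_j = ||w_j||, and Stein's lemma gives E[x ReLU(y_j)] = w_j Phi(b_j / sigma_j), where Phi is the standard normal CDF. Stacking over hidden units, E[x f(x)^T] = W1^T diag(Phi(b1 / sigma)) W2^T. Since Cov(x) = I, the transpose of this matrix is also the slope of the least-squares linear approximation to f.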
import numpy as np
from scipy.stats import norm


def compute_E_xf(W1, W2, b1):
    """
    Computes the analytical expectation E[x f(x)^T] for a single hidden layer ReLU network.

    Parameters:
    - W1: numpy.ndarray, shape (k, n)
        Weight matrix of the first layer (W^{(1)}).
    - W2: numpy.ndarray, shape (m, k)
        Weight matrix of the second layer (W^{(2)}).
    - b1: numpy.ndarray, shape (k,)
        Bias vector of the first layer (b^{(1)}).

    Returns:
    - E_xf: numpy.ndarray, shape (n, m)
        The computed expectation E[x f(x)^T].
    """
    # Compute sigma_y: standard deviations of y_j
    sigma_y = np.linalg.norm(W1, axis=1)  # Shape: (k,)
    # Avoid division by zero in case sigma_y has zeros
    sigma_y_safe = np.where(sigma_y == 0, 1e-8, sigma_y)
    # Compute alpha: standardized biases
    alpha = b1 / sigma_y_safe  # Shape: (k,)
    # Compute Phi(alpha)
    Phi_alpha = norm.cdf(alpha)  # CDF of standard normal at alpha_j
    # Construct diagonal matrix D
    D = np.diag(Phi_alpha)  # Shape: (k, k)
    # Compute E[x f(x)^T] analytically
    E_xf = W1.T @ D @ W2.T  # Shape: (n, m)
    return E_xf
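# Sketch of how E[x f(x)^T] yields the least-squares linear fit f(x) ≈ A x + c
# under x ~ N(0, I): the slope is A = E[f(x) x^T] because Cov(x) = I, and the
# intercept is E[f(x)], using E[ReLU(y_j)] = sigma_j * phi(alpha_j) + b_j * Phi(alpha_j)
# for the mean of a rectified Gaussian. The function name and the b2 argument
# are illustrative, not part of the functions above.
def linear_approximation(W1, W2, b1, b2):
    sigma_y = np.linalg.norm(W1, axis=1)
    sigma_y_safe = np.where(sigma_y == 0, 1e-8, sigma_y)
    alpha = b1 / sigma_y_safe
    # Slope: transpose of E[x f(x)^T], since Cov(x) is the identity
    A = compute_E_xf(W1, W2, b1).T
    # Intercept: mean of the rectified pre-activations, pushed through the second layer
    E_relu = sigma_y * norm.pdf(alpha) + b1 * norm.cdf(alpha)
    c = W2 @ E_relu + b2
    return A, c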
# Monte Carlo simulation to estimate E[x f(x)^T]
def monte_carlo_E_xf(W1, W2, b1, N_samples=100_000):
    """
    Estimates E[x f(x)^T] using Monte Carlo simulation.
    """
    n = W1.shape[1]  # Input dimension
    m = W2.shape[0]  # Output dimension
    # Initialize accumulator for E[x f(x)^T]
    E_xf_mc = np.zeros((n, m))
    # Generate N_samples of x ~ N(0, I)
    x_samples = np.random.randn(N_samples, n)  # Shape: (N_samples, n)
    # Compute f(x) for each sample
    # First compute y = W1 x + b1
    y_samples = x_samples @ W1.T + b1  # Shape: (N_samples, k)
    # Apply ReLU activation
    relu_y = np.maximum(0, y_samples)  # Shape: (N_samples, k)
    # Compute f(x) = W2 ReLU(y) + b2 (we can ignore b2 since E[x] = 0)
    f_samples = relu_y @ W2.T  # Shape: (N_samples, m)
    # Compute x f(x)^T for each sample and accumulate
    for i in range(N_samples):
        E_xf_mc += np.outer(x_samples[i], f_samples[i])
    # Divide by number of samples to get the expectation
    E_xf_mc /= N_samples
    return E_xf_mc
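# A vectorized alternative sketch of the same estimator: the sum of outer
# products x_i f(x_i)^T equals the matrix product X^T F, so the Python-level
# loop above can be replaced by a single matrix multiply. The function name
# here is illustrative.
def monte_carlo_E_xf_vectorized(W1, W2, b1, N_samples=100_000):
    n = W1.shape[1]
    x_samples = np.random.randn(N_samples, n)
    relu_y = np.maximum(0, x_samples @ W1.T + b1)
    f_samples = relu_y @ W2.T
    # (1/N) * X^T F averages the per-sample outer products in one step
    return x_samples.T @ f_samples / N_samples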
# Main code to compute and compare analytical and Monte Carlo results
def main():
    # Example dimensions
    n = 5   # Input dimension
    k = 10  # Number of neurons in the hidden layer
    m = 3   # Output dimension
    # Random initialization of weights and biases
    np.random.seed(0)  # For reproducibility
    W1 = np.random.randn(k, n)
    W2 = np.random.randn(m, k)
    b1 = np.random.randn(k)
    # Compute analytical E[x f(x)^T]
    E_xf_analytical = compute_E_xf(W1, W2, b1)
    # Estimate E[x f(x)^T] using Monte Carlo simulation
    N_samples = 1_000_000  # Number of Monte Carlo samples
    E_xf_monte_carlo = monte_carlo_E_xf(W1, W2, b1, N_samples)
    # Compute the difference between analytical and Monte Carlo results
    difference = E_xf_analytical - E_xf_monte_carlo
    error_norm = np.linalg.norm(difference)
    # Print the results
    print("Analytical E[x f(x)^T]:\n", E_xf_analytical)
    print("\nMonte Carlo E[x f(x)^T]:\n", E_xf_monte_carlo)
    print("\nDifference (Analytical - Monte Carlo):\n", difference)
    print("\nFrobenius norm of the difference:", error_norm)


if __name__ == "__main__":
    main()