Analytically computing the least-squares linear approximation to a ReLU network
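In brief: for a single-hidden-layer network f(x) = W^{(2)} ReLU(W^{(1)} x + b^{(1)}) with standard Gaussian input x ~ N(0, I) (the same distribution used in the Monte Carlo check below), the code evaluates the closed form

\[
\mathbb{E}\big[x\, f(x)^\top\big] \;=\; (W^{(1)})^\top \,\mathrm{diag}\!\big(\Phi(\alpha)\big)\, (W^{(2)})^\top,
\qquad
\alpha_j \;=\; \frac{b^{(1)}_j}{\lVert W^{(1)}_{j,:} \rVert_2},
\]

where \(\Phi\) is the standard normal CDF. The key step is Stein's lemma applied to each hidden unit: \(\mathbb{E}[x_i \,\mathrm{ReLU}(y_j)] = W^{(1)}_{ji}\,\Phi(\alpha_j)\), since \(\Phi(\alpha_j) = P(y_j > 0)\). A Monte Carlo estimate is included as a sanity check.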
import numpy as np
from scipy.stats import norm


def compute_E_xf(W1, W2, b1):
    """
    Computes the analytical expectation E[x f(x)^T] for a single hidden layer
    ReLU network f(x) = W2 ReLU(W1 x + b1), assuming x ~ N(0, I).

    Parameters:
    - W1: numpy.ndarray, shape (k, n)
        Weight matrix of the first layer (W^{(1)}).
    - W2: numpy.ndarray, shape (m, k)
        Weight matrix of the second layer (W^{(2)}).
    - b1: numpy.ndarray, shape (k,)
        Bias vector of the first layer (b^{(1)}).

    Returns:
    - E_xf: numpy.ndarray, shape (n, m)
        The computed expectation E[x f(x)^T].
    """
    # Compute sigma_y: standard deviations of the pre-activations y_j
    sigma_y = np.linalg.norm(W1, axis=1)  # Shape: (k,)

    # Avoid division by zero in case sigma_y has zeros
    sigma_y_safe = np.where(sigma_y == 0, 1e-8, sigma_y)

    # Compute alpha: standardized biases
    alpha = b1 / sigma_y_safe  # Shape: (k,)

    # Compute Phi(alpha): CDF of the standard normal at alpha_j, i.e. P(y_j > 0)
    Phi_alpha = norm.cdf(alpha)  # Shape: (k,)

    # Construct diagonal matrix D
    D = np.diag(Phi_alpha)  # Shape: (k, k)

    # Compute E[x f(x)^T] analytically
    E_xf = W1.T @ D @ W2.T  # Shape: (n, m)

    return E_xf

# Monte Carlo simulation to estimate E[x f(x)^T]
def monte_carlo_E_xf(W1, W2, b1, N_samples=100_000):
    """
    Estimates E[x f(x)^T] using Monte Carlo simulation.
    """
    n = W1.shape[1]  # Input dimension

    # Generate N_samples of x ~ N(0, I)
    x_samples = np.random.randn(N_samples, n)  # Shape: (N_samples, n)

    # Compute f(x) for each sample.
    # First compute y = W1 x + b1
    y_samples = x_samples @ W1.T + b1  # Shape: (N_samples, k)

    # Apply ReLU activation
    relu_y = np.maximum(0, y_samples)  # Shape: (N_samples, k)

    # Compute f(x) = W2 ReLU(y) + b2 (we can ignore b2 since E[x] = 0)
    f_samples = relu_y @ W2.T  # Shape: (N_samples, m)

    # Average x f(x)^T over all samples: the sum of outer products x f(x)^T
    # is exactly x_samples^T @ f_samples, divided by the number of samples
    E_xf_mc = x_samples.T @ f_samples / N_samples  # Shape: (n, m)

    return E_xf_mc

# Main code to compute and compare analytical and Monte Carlo results
def main():
    # Example dimensions
    n = 5   # Input dimension
    k = 10  # Number of neurons in the hidden layer
    m = 3   # Output dimension

    # Random initialization of weights and biases
    np.random.seed(0)  # For reproducibility
    W1 = np.random.randn(k, n)
    W2 = np.random.randn(m, k)
    b1 = np.random.randn(k)

    # Compute analytical E[x f(x)^T]
    E_xf_analytical = compute_E_xf(W1, W2, b1)

    # Estimate E[x f(x)^T] using Monte Carlo simulation
    N_samples = 1_000_000  # Number of Monte Carlo samples
    E_xf_monte_carlo = monte_carlo_E_xf(W1, W2, b1, N_samples)

    # Compute the difference between analytical and Monte Carlo results
    difference = E_xf_analytical - E_xf_monte_carlo
    error_norm = np.linalg.norm(difference)

    # Print the results
    print("Analytical E[x f(x)^T]:\n", E_xf_analytical)
    print("\nMonte Carlo E[x f(x)^T]:\n", E_xf_monte_carlo)
    print("\nDifference (Analytical - Monte Carlo):\n", difference)
    print("\nFrobenius norm of the difference:", error_norm)


if __name__ == "__main__":
    main()
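
Regarding the title: when the input is standard Gaussian (E[x] = 0, Cov(x) = I), the least-squares linear map A minimizing E[||f(x) - A x||^2] is A = E[f(x) x^T] Cov(x)^{-1} = E[x f(x)^T]^T, i.e. simply the transpose of the quantity computed above. A minimal sketch of that connection, checked against an ordinary least-squares fit on samples (the helper name check_linear_approximation is illustrative and not part of the original gist):

def check_linear_approximation(W1, W2, b1, N_samples=200_000, seed=1):
    """Sketch: compare the analytical linear map A = E[x f(x)^T]^T with an OLS fit.

    Assumes x ~ N(0, I), as in the rest of this gist.
    """
    rng = np.random.default_rng(seed)
    n = W1.shape[1]

    # Analytical least-squares linear map (valid for x ~ N(0, I))
    A_analytical = compute_E_xf(W1, W2, b1).T  # Shape: (m, n)

    # Empirical least-squares fit: solve min_B ||X B - F||_F over samples
    X = rng.standard_normal((N_samples, n))                # Shape: (N_samples, n)
    F = np.maximum(0, X @ W1.T + b1) @ W2.T                # Shape: (N_samples, m)
    B_ols, *_ = np.linalg.lstsq(X, F, rcond=None)          # Shape: (n, m)

    # The OLS coefficients should approach the analytical map as N grows
    print("||A_analytical - B_ols^T||_F:",
          np.linalg.norm(A_analytical - B_ols.T))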