Self-contained benchmarking script for vanilla regressions with Pyfixest and Statsmodels
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy>=1.21.0",
#     "pandas>=1.3.0",
#     "statsmodels>=0.13.0",
#     "pyfixest>=0.11.0",
#     "matplotlib>=3.4.0",
# ]
# ///
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.regression.linear_model import OLS
from statsmodels.tools import add_constant
from pyfixest import feols


def generate_data(n_samples, n_features):
    """Generate synthetic data for regression testing."""
    np.random.seed(42)
    X = np.random.randn(n_samples, n_features)
    coefficients = np.random.randn(n_features)
    y = X @ coefficients + np.random.randn(n_samples) * 0.1
    return X, y


def run_benchmark(func, X, y, iterations=100):
    """Run a timing benchmark on a regression function."""
    times = []
    for _ in range(iterations):
        start = time.perf_counter()
        func(X, y)
        times.append(time.perf_counter() - start)
    return np.mean(times), np.std(times)


def statsmodels_regression(X, y):
    """Run OLS regression using statsmodels."""
    X_with_constant = add_constant(X)
    return OLS(y, X_with_constant).fit()


def pyfixest_regression(X, y):
    """Run OLS regression using pyfixest."""
    df = pd.DataFrame(X, columns=[f'x{i}' for i in range(X.shape[1])])
    df['y'] = y
    formula = 'y ~ ' + ' + '.join(df.columns[:-1])
    return feols(formula, data=df)


def create_comparison_plot(title, xlabel):
    """Create a plot with common styling."""
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Time (seconds)')
    ax.set_title(title)
    ax.grid(True)
    ax.set_xscale('log')
    ax.set_yscale('log')
    return fig, ax


def add_result_to_plot(ax, x_values, times, label, marker):
    """Add benchmark results to the plot."""
    means, stds = zip(*times)
    ax.errorbar(x_values, means, yerr=stds,
                label=label, marker=marker,
                linestyle='-', capsize=3)


def run_size_comparison():
    """Compare performance across different sample sizes."""
    sizes = [100, 500, 1000, 5000, 10000, 100000]
    n_features = 10
    results = {'statsmodels': [], 'pyfixest': []}
    for size in sizes:
        print(f"\nTesting with {size} samples...")
        X, y = generate_data(size, n_features)
        results['statsmodels'].append(run_benchmark(statsmodels_regression, X, y))
        results['pyfixest'].append(run_benchmark(pyfixest_regression, X, y))
    return sizes, results


def run_feature_comparison():
    """Compare performance across different numbers of features."""
    n_samples = 10000
    features = [2, 10, 100, 200]
    results = {'statsmodels': [], 'pyfixest': []}
    for n_features in features:
        print(f"\nTesting with {n_features} features...")
        X, y = generate_data(n_samples, n_features)
        results['statsmodels'].append(run_benchmark(statsmodels_regression, X, y))
        results['pyfixest'].append(run_benchmark(pyfixest_regression, X, y))
    return features, results


# Run both comparisons
print("Running sample size comparison...")
sizes, size_results = run_size_comparison()
print("\nRunning feature count comparison...")
features, feature_results = run_feature_comparison()

# Create sample size comparison plot
fig1, ax1 = create_comparison_plot(
    'OLS Performance vs Sample Size', 'Number of Samples'
)
add_result_to_plot(ax1, sizes, size_results['statsmodels'], 'statsmodels', 'o')
add_result_to_plot(ax1, sizes, size_results['pyfixest'], 'pyfixest', 's')
ax1.legend()
ax1.set_xlim(10, max(sizes)*5)
plt.tight_layout()

# Create feature count comparison plot
fig2, ax2 = create_comparison_plot(
    'OLS Performance vs Number of Features', 'Number of Features'
)
add_result_to_plot(ax2, features, feature_results['statsmodels'], 'statsmodels', 'o')
add_result_to_plot(ax2, features, feature_results['pyfixest'], 'pyfixest', 's')
ax2.legend()
ax2.set_xlim(1, max(features)*5)
plt.tight_layout()
plt.show()
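A note on running it: because the script declares its dependencies in a PEP 723 inline metadata block (the `# /// script` section at the top), a metadata-aware runner such as uv can execute it directly, for example `uv run benchmark_ols.py` (the filename here is only an assumption), which creates an environment with the listed dependencies before running the script.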
Hi, thanks @aeturrell! I updated your script to add a comparison against the statsmodels formula API: link. Things start to look slightly more favorable for pyfixest, but statsmodels is still faster (though not by a significant margin).
Possible reasons I can think of for the remaining performance differences between the two formula APIs (a sketch of a formula-API benchmark function follows this list):
- different default checks
- different OLS solvers (pyfixest supports a range of options; I'm not 100% sure what statsmodels uses)
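For reference, here is a minimal sketch of what such a formula-API comparison could look like inside the same harness. This is an assumption about the linked script rather than its actual contents; it uses statsmodels.formula.api.ols so that both libraries pay the formula-parsing cost, and the hypothetical function name statsmodels_formula_regression is chosen here for illustration.

import pandas as pd
import statsmodels.formula.api as smf

def statsmodels_formula_regression(X, y):
    """Run OLS via the statsmodels formula API (includes formula parsing, like pyfixest)."""
    # Hypothetical addition, not the commenter's exact code.
    df = pd.DataFrame(X, columns=[f'x{i}' for i in range(X.shape[1])])
    df['y'] = y
    formula = 'y ~ ' + ' + '.join(df.columns[:-1])
    return smf.ols(formula, data=df).fit()

# It can be timed with the existing harness, e.g.:
# mean_t, std_t = run_benchmark(statsmodels_formula_regression, X, y)

Building the DataFrame inside the timed function mirrors pyfixest_regression, so both formula paths include the same data-preparation overhead.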
The charts below show the results of running the code.
[Charts: 'OLS Performance vs Sample Size' and 'OLS Performance vs Number of Features', both plotting mean time in seconds (with error bars) on log-log axes.]