@aeturrell
Last active January 11, 2025 15:23
Self-contained benchmarking script for vanilla regressions with Pyfixest and Statsmodels
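Note: the comment block at the top of the script is inline script metadata (PEP 723), so a PEP 723-aware runner can resolve the dependencies for you; for example, saving the file as benchmark.py (an arbitrary name) and calling uv run benchmark.py should set up an environment and run it.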
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy>=1.21.0",
#     "pandas>=1.3.0",
#     "statsmodels>=0.13.0",
#     "pyfixest>=0.11.0",
#     "matplotlib>=3.4.0",
# ]
# ///
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.regression.linear_model import OLS
from statsmodels.tools import add_constant
from pyfixest import feols
def generate_data(n_samples, n_features):
    """Generate synthetic data for regression testing."""
    np.random.seed(42)
    X = np.random.randn(n_samples, n_features)
    coefficients = np.random.randn(n_features)
    y = X @ coefficients + np.random.randn(n_samples) * 0.1
    return X, y


def run_benchmark(func, X, y, iterations=100):
    """Run a timing benchmark on a regression function."""
    times = []
    for _ in range(iterations):
        start = time.perf_counter()
        func(X, y)
        times.append(time.perf_counter() - start)
    return np.mean(times), np.std(times)


def statsmodels_regression(X, y):
    """Run OLS regression using statsmodels."""
    X_with_constant = add_constant(X)
    return OLS(y, X_with_constant).fit()


def pyfixest_regression(X, y):
    """Run OLS regression using pyfixest."""
    df = pd.DataFrame(X, columns=[f'x{i}' for i in range(X.shape[1])])
    df['y'] = y
    formula = 'y ~ ' + ' + '.join(df.columns[:-1])
    return feols(formula, data=df)


def create_comparison_plot(title, xlabel):
    """Create a plot with common styling."""
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Time (seconds)')
    ax.set_title(title)
    ax.grid(True)
    ax.set_xscale('log')
    ax.set_yscale('log')
    return fig, ax


def add_result_to_plot(ax, x_values, times, label, marker):
    """Add benchmark results to the plot."""
    means, stds = zip(*times)
    ax.errorbar(x_values, means, yerr=stds,
                label=label, marker=marker,
                linestyle='-', capsize=3)


def run_size_comparison():
    """Compare performance across different sample sizes."""
    sizes = [100, 500, 1000, 5000, 10000, 100000]
    n_features = 10
    results = {'statsmodels': [], 'pyfixest': []}
    for size in sizes:
        print(f"\nTesting with {size} samples...")
        X, y = generate_data(size, n_features)
        results['statsmodels'].append(run_benchmark(statsmodels_regression, X, y))
        results['pyfixest'].append(run_benchmark(pyfixest_regression, X, y))
    return sizes, results


def run_feature_comparison():
    """Compare performance across different numbers of features."""
    n_samples = 10000
    features = [2, 10, 100, 200]
    results = {'statsmodels': [], 'pyfixest': []}
    for n_features in features:
        print(f"\nTesting with {n_features} features...")
        X, y = generate_data(n_samples, n_features)
        results['statsmodels'].append(run_benchmark(statsmodels_regression, X, y))
        results['pyfixest'].append(run_benchmark(pyfixest_regression, X, y))
    return features, results


# Run both comparisons
print("Running sample size comparison...")
sizes, size_results = run_size_comparison()
print("\nRunning feature count comparison...")
features, feature_results = run_feature_comparison()

# Create sample size comparison plot
fig1, ax1 = create_comparison_plot(
    'OLS Performance vs Sample Size', 'Number of Samples'
)
add_result_to_plot(ax1, sizes, size_results['statsmodels'], 'statsmodels', 'o')
add_result_to_plot(ax1, sizes, size_results['pyfixest'], 'pyfixest', 's')
ax1.legend()
ax1.set_xlim(10, max(sizes)*5)
plt.tight_layout()

# Create feature count comparison plot
fig2, ax2 = create_comparison_plot(
    'OLS Performance vs Number of Features', 'Number of Features'
)
add_result_to_plot(ax2, features, feature_results['statsmodels'], 'statsmodels', 'o')
add_result_to_plot(ax2, features, feature_results['pyfixest'], 'pyfixest', 's')
ax2.legend()
ax2.set_xlim(1, max(features)*5)
plt.tight_layout()
plt.show()
@aeturrell

The charts below show the results of running the code.

[Figure: OLS performance vs sample size]

[Figure: OLS performance vs number of features]

@s3alfisc

Hi, thanks @aeturrell! I updated your script to add a comparison against the statsmodels formula API: link. Things start to look slightly more favourable, but statsmodels is still faster (though not by a significant margin):

[Figures: benchmark charts from the updated script, including the statsmodels formula API]
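For reference, a formula-API benchmark function along these lines (a sketch only; the function name and structure here are mine, not necessarily what the linked script does) could be dropped into the benchmark above next to statsmodels_regression and pyfixest_regression:

import pandas as pd
import statsmodels.formula.api as smf


def statsmodels_formula_regression(X, y):
    """Run OLS via the statsmodels formula API, mirroring pyfixest_regression."""
    # Build the DataFrame and the formula string inside the timed call, just as
    # pyfixest_regression does, so both formula interfaces pay the same setup cost.
    df = pd.DataFrame(X, columns=[f'x{i}' for i in range(X.shape[1])])
    df['y'] = y
    formula = 'y ~ ' + ' + '.join(df.columns[:-1])
    return smf.ols(formula, data=df).fit()

It can then be timed with the existing harness, e.g. run_benchmark(statsmodels_formula_regression, X, y), giving a third line on each chart.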

Possible reasons I can think of for the remaining performance differences between the two formula APIs:

  • different default checks
  • different OLS solvers (pyfixest supports a range of options, I'm not 100% sure what sm does); a plain-NumPy baseline that isolates the solve itself is sketched below
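
To separate interface overhead (input validation, formula handling, building the results object) from the cost of the solve itself, one option is to time a bare NumPy least-squares baseline with the same harness. A minimal sketch (numpy_lstsq_regression is my own name, not part of either library):

import numpy as np


def numpy_lstsq_regression(X, y):
    """Bare least-squares solve: no formula parsing, checks, or results object."""
    X_with_constant = np.column_stack([np.ones(len(X)), X])
    # np.linalg.lstsq solves min ||Xb - y|| via LAPACK; only the coefficients are kept.
    beta, *_ = np.linalg.lstsq(X_with_constant, y, rcond=None)
    return beta


# Timed with the existing harness, e.g.:
# run_benchmark(numpy_lstsq_regression, X, y)

Whatever each formula API costs on top of this baseline is per-call overhead rather than solver speed.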
