Skip to content

Instantly share code, notes, and snippets.

@jkclem
Created August 8, 2020 21:44
Show Gist options
  • Save jkclem/06b3972fd5f91b142941872c54f4ee08 to your computer and use it in GitHub Desktop.
Save jkclem/06b3972fd5f91b142941872c54f4ee08 to your computer and use it in GitHub Desktop.
# suppresses warnings from sklearn
def warn(*args, **kwargs):
pass
import warnings
warnings.warn = warn
# import LassoCV
from sklearn.linear_model import LassoCV
# import RidgeCV
from sklearn.linear_model import RidgeCV
# set the number of simulations to run
num_sims = 500
# set n and k and make a n by k+1 matrix of standard normal random numbers
n = 200
k = 100
# set the number of predictors, including the intercept that are significant
k_significant = 51
# create a column vector of zeros, with k + 1 betas, where the first value is the true intercept
beta = np.zeros(shape=(k + 1, 1))
# set the first k_significant betas to be non-zero values, using random N(10, 2) random variables
beta[:k_significant] = np.random.normal(loc=5.0, scale=1.0, size=(k_significant, 1))
# create an array of zeros to hold our UNBIASED estimates of the variance of our error term (from OLS)
unbiased_sigma_estimates = np.zeros(shape=num_sims)
# create an array of zeros to hold our LASSO estimates of the variance of our error term (from OLS)
lasso_sigma_estimates = np.zeros(shape=num_sims)
# create an array of zeros to hold our LASSO estimates of the variance of our error term (from OLS)
ridge_sigma_estimates = np.zeros(shape=num_sims)
# perform num_sims simulations
for i in range(num_sims):
###
# This block generates the random sample
###
# create the temporary design n by k + 1 design matrix
temp_X = np.random.normal(loc=0.0, scale=1.0, size=(n, k + 1))
# set the first column of the design matrix to 1.0 to include an intercept
temp_X[:,0] = 1.0
# create temporary y values using the design matrix and beta vector, and add standard normal random noise
temp_y = np.matmul(temp_X, beta) + np.random.normal(loc=0.0, scale=1.0, size=(n, 1))
###
# This block estimates the variance using the true model
###
# fit an OLS linear model to the temporary data
temp_true_mod = sm.OLS(temp_y, temp_X[:,0:k_significant]).fit()
# estimate the variance of the error term (note: ddof=1 is to use the sample variance formula)
temp_unbiased_sigma_estimate = np.sqrt(np.sum(temp_true_mod.resid **2) /
(n - temp_true_mod.params.shape[0]))
# add the unbiased estimate to the array at the ith slot
unbiased_sigma_estimates[i] = temp_unbiased_sigma_estimate
###
# This block estimates the variance using the LASSO model with the best alpha chosen by cross-validation
###
# create an instance of a LASSO model
lasso_mod = LassoCV(alphas=(0.01, 0.1, 1.0, 10.0), fit_intercept=False, cv=3)
# reshape temp_y to be compatible with sklearn standards
reshaped_temp_y = temp_y.reshape(temp_y.shape[0])
# fit the LASSO model with the best alpha
fit_lasso_mod = lasso_mod.fit(temp_X, reshaped_temp_y)
# calculate the residuals from the temporary LASSO model
temp_lasso_resids = reshaped_temp_y - fit_lasso_mod.predict(temp_X)
# count the number of non-zero parameters in the LASSO model
non_zero_param_count = (fit_lasso_mod.coef_ != np.repeat(0, fit_lasso_mod.coef_.shape[0])).sum()
# estimate the standard deviation of the error term
temp_lasso_sigma_estimate = np.sqrt(np.sum(temp_lasso_resids ** 2) / (n - non_zero_param_count))
# add the unbiased estimate to the array at the ith slot
lasso_sigma_estimates[i] = temp_lasso_sigma_estimate
###
# This block estimates the variance using the Ridge model with the best alpha chosen by cross-validation
###
# create an instance of a ridge model
ridge_mod = RidgeCV(alphas=(0.01, 0.1, 1.0, 10.0), fit_intercept=False, cv=3)
# reshape temp_y to be compatible with sklearn standards
reshaped_temp_y = temp_y.reshape(temp_y.shape[0])
# fit the ridge model with the best alpha
fit_ridge_mod = ridge_mod.fit(temp_X, reshaped_temp_y)
# calculate the residuals from the temporary ridge model
temp_ridge_resids = reshaped_temp_y - fit_ridge_mod.predict(temp_X)
# estimate the variance of the error term (note: ddof=1 is to use the sample variance formula)
temp_ridge_sigma_estimate = np.sqrt(np.sum(temp_ridge_resids ** 2) / (n - k - 1))
# add the unbiased estimate to the array at the ith slot
ridge_sigma_estimates[i] = temp_ridge_sigma_estimate
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment