jkclem · August 8, 2020 21:44
diff --git a/simulation_study2.py b/simulation_study2.py
 # suppresses warnings from sklearn
 def warn(*args, **kwargs):
    pass
 import warnings
 warnings.warn = warn

 # import LassoCV
 from sklearn.linear_model import LassoCV
 # import RidgeCV
 from sklearn.linear_model import RidgeCV

 # set the number of simulations to run
 num_sims = 500

 # set n and k and make a n by k+1 matrix of standard normal random numbers
 n = 200
 k = 100

 # set the number of predictors, including the intercept that are significant
 k_significant = 51

 # create a column vector of zeros, with k + 1 betas, where the first value is the true intercept
 beta = np.zeros(shape=(k + 1, 1))
 # set the first k_significant betas to be non-zero values, using random N(10, 2) random variables
 beta[:k_significant] = np.random.normal(loc=5.0, scale=1.0, size=(k_significant, 1))

 # create an array of zeros to hold our UNBIASED estimates of the variance of our error term (from OLS)
 unbiased_sigma_estimates = np.zeros(shape=num_sims)
 # create an array of zeros to hold our LASSO estimates of the variance of our error term (from OLS)
 lasso_sigma_estimates = np.zeros(shape=num_sims)
 # create an array of zeros to hold our LASSO estimates of the variance of our error term (from OLS)
 ridge_sigma_estimates = np.zeros(shape=num_sims)

 # perform num_sims simulations
 for i in range(num_sims):
    
    ###
    # This block generates the random sample
    ###
    
    # create the temporary design n by k + 1 design matrix
    temp_X = np.random.normal(loc=0.0, scale=1.0, size=(n, k + 1))
    
    # set the first column of the design matrix to 1.0 to include an intercept
    temp_X[:,0] = 1.0
    
    # create temporary y values using the design matrix and beta vector, and add standard normal random noise
    temp_y = np.matmul(temp_X, beta) + np.random.normal(loc=0.0, scale=1.0, size=(n, 1))
    
    
    ###
    # This block estimates the variance using the true model
    ###
    
    # fit an OLS linear model to the temporary data
    temp_true_mod = sm.OLS(temp_y, temp_X[:,0:k_significant]).fit()
    
    # estimate the variance of the error term (note: ddof=1 is to use the sample variance formula)
    temp_unbiased_sigma_estimate = np.sqrt(np.sum(temp_true_mod.resid **2) / 
                                         (n - temp_true_mod.params.shape[0]))
    
    # add the unbiased estimate to the array at the ith slot
    unbiased_sigma_estimates[i] = temp_unbiased_sigma_estimate 
    
    
    ###
    # This block estimates the variance using the LASSO model with the best alpha chosen by cross-validation
    ###
    
    # create an instance of a LASSO model
    lasso_mod = LassoCV(alphas=(0.01, 0.1, 1.0, 10.0), fit_intercept=False, cv=3)
    
    # reshape temp_y to be compatible with sklearn standards
    reshaped_temp_y = temp_y.reshape(temp_y.shape[0])
    
    # fit the LASSO model with the best alpha
    fit_lasso_mod = lasso_mod.fit(temp_X, reshaped_temp_y)
    
    # calculate the residuals from the temporary LASSO model
    temp_lasso_resids = reshaped_temp_y - fit_lasso_mod.predict(temp_X)
    
    # count the number of non-zero parameters in the LASSO model
    non_zero_param_count = (fit_lasso_mod.coef_ != np.repeat(0, fit_lasso_mod.coef_.shape[0])).sum()
    
    # estimate the standard deviation of the error term
    temp_lasso_sigma_estimate = np.sqrt(np.sum(temp_lasso_resids ** 2) / (n - non_zero_param_count))
                            
    # add the unbiased estimate to the array at the ith slot
    lasso_sigma_estimates[i] = temp_lasso_sigma_estimate 
    
    
    ###
    # This block estimates the variance using the Ridge model with the best alpha chosen by cross-validation
    ###
    
    # create an instance of a ridge model
    ridge_mod = RidgeCV(alphas=(0.01, 0.1, 1.0, 10.0), fit_intercept=False, cv=3)
    
    # reshape temp_y to be compatible with sklearn standards
    reshaped_temp_y = temp_y.reshape(temp_y.shape[0])
    
    # fit the ridge model with the best alpha
    fit_ridge_mod = ridge_mod.fit(temp_X, reshaped_temp_y)

    # calculate the residuals from the temporary ridge model
    temp_ridge_resids = reshaped_temp_y - fit_ridge_mod.predict(temp_X)
    
    # estimate the variance of the error term (note: ddof=1 is to use the sample variance formula)
    temp_ridge_sigma_estimate = np.sqrt(np.sum(temp_ridge_resids ** 2) / (n - k - 1))
    
    # add the unbiased estimate to the array at the ith slot
    ridge_sigma_estimates[i] = temp_ridge_sigma_estimate
	# suppresses warnings from sklearn
	def warn(args, *kwargs):
	pass
	import warnings
	warnings.warn = warn

	# import LassoCV
	from sklearn.linear_model import LassoCV
	# import RidgeCV
	from sklearn.linear_model import RidgeCV

	# set the number of simulations to run
	num_sims = 500

	# set n and k and make a n by k+1 matrix of standard normal random numbers
	n = 200
	k = 100

	# set the number of predictors, including the intercept that are significant
	k_significant = 51

	# create a column vector of zeros, with k + 1 betas, where the first value is the true intercept
	beta = np.zeros(shape=(k + 1, 1))
	# set the first k_significant betas to be non-zero values, using random N(10, 2) random variables
	beta[:k_significant] = np.random.normal(loc=5.0, scale=1.0, size=(k_significant, 1))

	# create an array of zeros to hold our UNBIASED estimates of the variance of our error term (from OLS)
	unbiased_sigma_estimates = np.zeros(shape=num_sims)
	# create an array of zeros to hold our LASSO estimates of the variance of our error term (from OLS)
	lasso_sigma_estimates = np.zeros(shape=num_sims)
	# create an array of zeros to hold our LASSO estimates of the variance of our error term (from OLS)
	ridge_sigma_estimates = np.zeros(shape=num_sims)

	# perform num_sims simulations
	for i in range(num_sims):

	###
	# This block generates the random sample
	###

	# create the temporary design n by k + 1 design matrix
	temp_X = np.random.normal(loc=0.0, scale=1.0, size=(n, k + 1))

	# set the first column of the design matrix to 1.0 to include an intercept
	temp_X[:,0] = 1.0

	# create temporary y values using the design matrix and beta vector, and add standard normal random noise
	temp_y = np.matmul(temp_X, beta) + np.random.normal(loc=0.0, scale=1.0, size=(n, 1))


	###
	# This block estimates the variance using the true model
	###

	# fit an OLS linear model to the temporary data
	temp_true_mod = sm.OLS(temp_y, temp_X[:,0:k_significant]).fit()

	# estimate the variance of the error term (note: ddof=1 is to use the sample variance formula)
	temp_unbiased_sigma_estimate = np.sqrt(np.sum(temp_true_mod.resid **2) /
	(n - temp_true_mod.params.shape[0]))

	# add the unbiased estimate to the array at the ith slot
	unbiased_sigma_estimates[i] = temp_unbiased_sigma_estimate


	###
	# This block estimates the variance using the LASSO model with the best alpha chosen by cross-validation
	###

	# create an instance of a LASSO model
	lasso_mod = LassoCV(alphas=(0.01, 0.1, 1.0, 10.0), fit_intercept=False, cv=3)

	# reshape temp_y to be compatible with sklearn standards
	reshaped_temp_y = temp_y.reshape(temp_y.shape[0])

	# fit the LASSO model with the best alpha
	fit_lasso_mod = lasso_mod.fit(temp_X, reshaped_temp_y)

	# calculate the residuals from the temporary LASSO model
	temp_lasso_resids = reshaped_temp_y - fit_lasso_mod.predict(temp_X)

	# count the number of non-zero parameters in the LASSO model
	non_zero_param_count = (fit_lasso_mod.coef_ != np.repeat(0, fit_lasso_mod.coef_.shape[0])).sum()

	# estimate the standard deviation of the error term
	temp_lasso_sigma_estimate = np.sqrt(np.sum(temp_lasso_resids ** 2) / (n - non_zero_param_count))

	# add the unbiased estimate to the array at the ith slot
	lasso_sigma_estimates[i] = temp_lasso_sigma_estimate


	###
	# This block estimates the variance using the Ridge model with the best alpha chosen by cross-validation
	###

	# create an instance of a ridge model
	ridge_mod = RidgeCV(alphas=(0.01, 0.1, 1.0, 10.0), fit_intercept=False, cv=3)

	# reshape temp_y to be compatible with sklearn standards
	reshaped_temp_y = temp_y.reshape(temp_y.shape[0])

	# fit the ridge model with the best alpha
	fit_ridge_mod = ridge_mod.fit(temp_X, reshaped_temp_y)

	# calculate the residuals from the temporary ridge model
	temp_ridge_resids = reshaped_temp_y - fit_ridge_mod.predict(temp_X)

	# estimate the variance of the error term (note: ddof=1 is to use the sample variance formula)
	temp_ridge_sigma_estimate = np.sqrt(np.sum(temp_ridge_resids ** 2) / (n - k - 1))

	# add the unbiased estimate to the array at the ith slot
	ridge_sigma_estimates[i] = temp_ridge_sigma_estimate