Created
August 8, 2020 21:38
-
-
Save jkclem/bee8f17bd4a0389fb53c7b0a8c1159bf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def best_information_criterion_selection(y, X, criterion='AIC'): | |
''' | |
This function takes in a column numpy array (y) and design matrix (X) (with the first column as all 1s for | |
the intercept) which is also a numpy array, and returns the OLS model with the lowest Information | |
Criterion. The default criterion is AIC; and the other option is BIC. | |
''' | |
# check inputs are valid | |
assert y.shape[0] == X.shape[0], 'The number of rows in y and X do not match!' | |
assert (criterion == 'AIC') or (criterion == 'BIC'), 'Valid criterions are AIC and BIC!' | |
# for linear regression | |
import statsmodels.api as sm | |
# for generating combinations of explanatory variables for model selection based on AIC | |
from itertools import combinations | |
# calculate the number of non-intercept predictors in the design matrix | |
number_of_predictors = X.shape[1] - 1 | |
# create an array of the column numbers of the predictors | |
var_cols = np.arange(1, number_of_predictors + 1, step=1) | |
# create an empty list to hold all possible combinations of the predictor columns from 1 to all | |
subsets_of_vars = [] | |
### | |
# This nested for loop appends all possible combinations of predictor columns from just 1 variable to | |
# all number_of_predictors | |
### | |
for num_predictors in range(1, number_of_predictors + 1): | |
for subset in combinations(var_cols, num_predictors): | |
subsets_of_vars.append(subset) | |
# create an array of zeros to hold the Information Criterion of each model | |
Info_Criterions = np.zeros(shape=len(subsets_of_vars)) | |
# create an empty list to hold the models | |
models = [] | |
# create a counter to be used for indexing in Info_Criterions and temp_biased_var_estimates | |
counter = 0 | |
# loop through all subsets of predictors | |
for subset in subsets_of_vars: | |
# select the temporary subset of our design matrix | |
temp_X = X[:, (0,) + subset] | |
# fit an OLS linear model to the temporary data | |
temp_OLS_mod = sm.OLS(temp_y, temp_X).fit() | |
if criterion == 'AIC': | |
# add the AIC to the Info_Criterions array at the counter th slot | |
Info_Criterions[counter] = temp_OLS_mod.aic | |
else: | |
# add the AIC to the Info_Criterions array at the counter th slot | |
Info_Criterions[counter] = temp_OLS_mod.bic | |
# add the biased estimate to the temp_biased_var_estimates array at the counter th slot | |
models.append(temp_OLS_mod) | |
# increment the counter | |
counter += 1 | |
# find the index of the model with the lowest Information Criterion | |
lowest_IC_index = np.argmin(Info_Criterions) | |
# return the model with the lowest Information Criterion | |
return models[lowest_IC_index] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment