Skip to content

Instantly share code, notes, and snippets.

@jkclem
Created August 8, 2020 21:38
Show Gist options
  • Save jkclem/bee8f17bd4a0389fb53c7b0a8c1159bf to your computer and use it in GitHub Desktop.
Save jkclem/bee8f17bd4a0389fb53c7b0a8c1159bf to your computer and use it in GitHub Desktop.
def best_information_criterion_selection(y, X, criterion='AIC'):
'''
This function takes in a column numpy array (y) and design matrix (X) (with the first column as all 1s for
the intercept) which is also a numpy array, and returns the OLS model with the lowest Information
Criterion. The default criterion is AIC; and the other option is BIC.
'''
# check inputs are valid
assert y.shape[0] == X.shape[0], 'The number of rows in y and X do not match!'
assert (criterion == 'AIC') or (criterion == 'BIC'), 'Valid criterions are AIC and BIC!'
# for linear regression
import statsmodels.api as sm
# for generating combinations of explanatory variables for model selection based on AIC
from itertools import combinations
# calculate the number of non-intercept predictors in the design matrix
number_of_predictors = X.shape[1] - 1
# create an array of the column numbers of the predictors
var_cols = np.arange(1, number_of_predictors + 1, step=1)
# create an empty list to hold all possible combinations of the predictor columns from 1 to all
subsets_of_vars = []
###
# This nested for loop appends all possible combinations of predictor columns from just 1 variable to
# all number_of_predictors
###
for num_predictors in range(1, number_of_predictors + 1):
for subset in combinations(var_cols, num_predictors):
subsets_of_vars.append(subset)
# create an array of zeros to hold the Information Criterion of each model
Info_Criterions = np.zeros(shape=len(subsets_of_vars))
# create an empty list to hold the models
models = []
# create a counter to be used for indexing in Info_Criterions and temp_biased_var_estimates
counter = 0
# loop through all subsets of predictors
for subset in subsets_of_vars:
# select the temporary subset of our design matrix
temp_X = X[:, (0,) + subset]
# fit an OLS linear model to the temporary data
temp_OLS_mod = sm.OLS(temp_y, temp_X).fit()
if criterion == 'AIC':
# add the AIC to the Info_Criterions array at the counter th slot
Info_Criterions[counter] = temp_OLS_mod.aic
else:
# add the AIC to the Info_Criterions array at the counter th slot
Info_Criterions[counter] = temp_OLS_mod.bic
# add the biased estimate to the temp_biased_var_estimates array at the counter th slot
models.append(temp_OLS_mod)
# increment the counter
counter += 1
# find the index of the model with the lowest Information Criterion
lowest_IC_index = np.argmin(Info_Criterions)
# return the model with the lowest Information Criterion
return models[lowest_IC_index]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment