jkclem · August 8, 2020 21:38
diff --git a/best_information_criterion_selection.py b/best_information_criterion_selection.py
 def best_information_criterion_selection(y, X, criterion='AIC'):
    '''
    This function takes in a column numpy array (y) and design matrix (X) (with the first column as all 1s for
    the intercept) which is also a numpy array, and returns the OLS model with the lowest Information 
    Criterion. The default criterion is AIC; and the other option is BIC.
    '''
    
    # check inputs are valid
    assert y.shape[0] == X.shape[0], 'The number of rows in y and X do not match!'
    assert (criterion == 'AIC') or (criterion == 'BIC'), 'Valid criterions are AIC and BIC!'
    
    # for linear regression
    import statsmodels.api as sm
    # for generating combinations of explanatory variables for model selection based on AIC
    from itertools import combinations
    
    # calculate the number of non-intercept predictors in the design matrix
    number_of_predictors = X.shape[1] - 1
    
    # create an array of the column numbers of the predictors
    var_cols = np.arange(1, number_of_predictors + 1, step=1)
    
    # create an empty list to hold all possible combinations of the predictor columns from 1 to all
    subsets_of_vars = []
    ###
    # This nested for loop appends all possible combinations of predictor columns from just 1 variable to 
    # all number_of_predictors
    ###
    for num_predictors in range(1, number_of_predictors + 1):
        for subset in combinations(var_cols, num_predictors):
            subsets_of_vars.append(subset)
    
    # create an array of zeros to hold the Information Criterion of each model
    Info_Criterions = np.zeros(shape=len(subsets_of_vars))
    
    # create an empty list to hold the models
    models = []
    
    # create a counter to be used for indexing in Info_Criterions and temp_biased_var_estimates
    counter = 0
    
    # loop through all subsets of predictors
    for subset in subsets_of_vars:
        
        # select the temporary subset of our design matrix
        temp_X = X[:, (0,) + subset]
        
        # fit an OLS linear model to the temporary data
        temp_OLS_mod = sm.OLS(temp_y, temp_X).fit()
        
        if criterion == 'AIC':
            # add the AIC to the Info_Criterions array at the counter th slot
            Info_Criterions[counter] = temp_OLS_mod.aic
            
        else:
            # add the AIC to the Info_Criterions array at the counter th slot
            Info_Criterions[counter] = temp_OLS_mod.bic
        
        # add the biased estimate to the temp_biased_var_estimates array at the counter th slot
        models.append(temp_OLS_mod)
        
        # increment the counter
        counter += 1
    
    # find the index of the model with the lowest Information Criterion
    lowest_IC_index = np.argmin(Info_Criterions)
    
    # return the model with the lowest Information Criterion
    return models[lowest_IC_index]
	def best_information_criterion_selection(y, X, criterion='AIC'):
	'''
	This function takes in a column numpy array (y) and design matrix (X) (with the first column as all 1s for
	the intercept) which is also a numpy array, and returns the OLS model with the lowest Information
	Criterion. The default criterion is AIC; and the other option is BIC.
	'''

	# check inputs are valid
	assert y.shape[0] == X.shape[0], 'The number of rows in y and X do not match!'
	assert (criterion == 'AIC') or (criterion == 'BIC'), 'Valid criterions are AIC and BIC!'

	# for linear regression
	import statsmodels.api as sm
	# for generating combinations of explanatory variables for model selection based on AIC
	from itertools import combinations

	# calculate the number of non-intercept predictors in the design matrix
	number_of_predictors = X.shape[1] - 1

	# create an array of the column numbers of the predictors
	var_cols = np.arange(1, number_of_predictors + 1, step=1)

	# create an empty list to hold all possible combinations of the predictor columns from 1 to all
	subsets_of_vars = []
	###
	# This nested for loop appends all possible combinations of predictor columns from just 1 variable to
	# all number_of_predictors
	###
	for num_predictors in range(1, number_of_predictors + 1):
	for subset in combinations(var_cols, num_predictors):
	subsets_of_vars.append(subset)

	# create an array of zeros to hold the Information Criterion of each model
	Info_Criterions = np.zeros(shape=len(subsets_of_vars))

	# create an empty list to hold the models
	models = []

	# create a counter to be used for indexing in Info_Criterions and temp_biased_var_estimates
	counter = 0

	# loop through all subsets of predictors
	for subset in subsets_of_vars:

	# select the temporary subset of our design matrix
	temp_X = X[:, (0,) + subset]

	# fit an OLS linear model to the temporary data
	temp_OLS_mod = sm.OLS(temp_y, temp_X).fit()

	if criterion == 'AIC':
	# add the AIC to the Info_Criterions array at the counter th slot
	Info_Criterions[counter] = temp_OLS_mod.aic

	else:
	# add the AIC to the Info_Criterions array at the counter th slot
	Info_Criterions[counter] = temp_OLS_mod.bic

	# add the biased estimate to the temp_biased_var_estimates array at the counter th slot
	models.append(temp_OLS_mod)

	# increment the counter
	counter += 1

	# find the index of the model with the lowest Information Criterion
	lowest_IC_index = np.argmin(Info_Criterions)

	# return the model with the lowest Information Criterion
	return models[lowest_IC_index]