@tansey
Created July 6, 2021 01:10
Post-selection inference example for AICc-based model screening
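The script below simulates a simple post-selection inference failure mode: it repeatedly draws data with correlated covariates (the first S columns carry signal), fits an OLS model to every non-empty subset of the P covariates, picks the subset with the lowest AICc, and then pools the ordinary OLS p-values of any selected null covariates across trials. If those p-values were valid, their empirical CDF would track the Uniform(0, 1) CDF; the plot written to aicc-select.pdf compares the two and shows how far the naively reported post-selection p-values deviate from the diagonal.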
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from statsmodels.tools import eval_measures

def generalized_liang_sim_xy(N=500, P=500, S=100):
    '''Generates data from a simple linear model with correlated covariates.'''
    # Each covariate is the average of a shared and an independent standard normal,
    # so every pair of columns has correlation 0.5.
    X = (np.random.normal(size=(N,1)) + np.random.normal(size=(N,P))) / 2.
    w0 = np.random.normal(1, size=S//4)
    w1 = np.random.normal(2, size=S//4)
    w2 = np.random.normal(2, size=S//4)
    w3 = np.random.normal(2, size=S//4)
    y = X[:,0:S:4].dot(w0) + X[:,1:S:4].dot(w1) + X[:,2:S:4].dot(w2) + X[:,3:S:4].dot(w3) + np.random.normal(0, 0.5, size=N)
    # Return X, y, and the binary true discovery labels
    return X, y, np.concatenate([np.ones(S), np.zeros(P-S)])

def powerset(iterable):
    '''powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)'''
    from itertools import chain, combinations
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))

def enumerate_ols(X, y):
    '''Enumerates the powerset of covariate subsets and fits OLS to each non-empty one.'''
    return [(np.array(variables), sm.OLS(y, X[:,variables]).fit())
            for variables in powerset(np.arange(X.shape[1]))
            if len(variables) > 0]

if __name__ == '__main__':
    n_trials = 100
    delta_threshold = 2
    N = 50
    P = 8
    S = 4

    null_p_values = []
    for trial in range(n_trials):
        print(trial)

        # Generate 50 samples with 8 covariates where the first 4 are true positives.
        # Every pair of covariates has correlation 0.5.
        X, y, _ = generalized_liang_sim_xy(N=N, P=P, S=S)

        # Try all the different models and evaluate each via AICc
        fits = enumerate_ols(X, y)
        scores = np.array([eval_measures.aicc(r.llf, r.nobs, r.df_model+1) for v, r in fits])

        # (Alternative, unused here) Select all models within an AICc delta of 2 of the best
        # deltas = scores - scores.min()
        # selected = deltas <= delta_threshold

        # Get the best model as selected by AICc
        best = np.argmin(scores)
        best_variables, best_fit = fits[best]

        # Add all p-values for null (noise) variables to the list
        print(best_fit.pvalues, best_variables, best_fit.pvalues[best_variables >= S])
        null_p_values.extend(best_fit.pvalues[best_variables >= S])

    null_p_values = np.array(null_p_values)

    # Sort the pooled null p-values from smallest to largest
    p_sorted = null_p_values[np.argsort(null_p_values)]

    # Get the theoretical (uniform) and empirical CDFs
    theoretical = np.linspace(0, 1, len(p_sorted))
    empirical = (p_sorted[:,None] < theoretical[None]).mean(axis=0)

    plt.plot(theoretical, empirical, color='orange', label='Actual p-values')
    plt.plot([0,1], [0,1], color='black', label='Valid p-values')
    plt.ylim([0,1])
    plt.xlim([0,1])
    plt.xlabel('Theoretical null p-value CDF')
    plt.ylabel('Actual null p-value CDF')
    plt.legend(loc='lower right')
    plt.savefig('aicc-select.pdf', bbox_inches='tight')
    plt.close()
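For a numerical summary to go with the plot, a one-sample Kolmogorov-Smirnov test against Uniform(0, 1) can quantify the miscalibration. This is a minimal optional sketch, not part of the original gist; it assumes SciPy is installed and that the script above has been run so that null_p_values holds the pooled null p-values.

# Optional check (not in the original script): measure how far the pooled
# null p-values are from Uniform(0, 1) with a one-sample KS test.
from scipy import stats

ks_stat, ks_pvalue = stats.kstest(null_p_values, 'uniform')
print('KS statistic vs Uniform(0, 1): %.3f (p = %.3g)' % (ks_stat, ks_pvalue))
# A very small KS p-value indicates the naive post-selection p-values are
# miscalibrated rather than uniformly distributed.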