Created July 6, 2021 01:10

Save tansey/615b36f34669134977a7ff31ae08bbe6 to your computer and use it in GitHub Desktop.
Post-selection inference example for AICc-based model screening.
This file contains bidirectional Unicode text that may be interpreted or compiled differently from what appears below. To review it, open the file in an editor that reveals hidden Unicode characters.
import matplotlib.pyplot as plt | |
import numpy as np | |
import statsmodels.api as sm | |
def generalized_liang_sim_xy(N=500, P=500, S=100):
    '''Generates data from a simple linear model.

    N: number of samples; P: number of covariates; S: number of signal
    (true-positive) covariates. Assumes S is divisible by 4 and S <= P.
    Returns (X, y, labels) where labels[j] == 1 marks a true discovery.
    '''
    # Averaging a shared column with independent noise gives every pair of
    # covariates correlation 0.5.
    shared = np.random.normal(size=(N, 1))
    X = (shared + np.random.normal(size=(N, P))) / 2.
    # Four groups of weights; the first group is centered at 1, the rest at 2.
    w0 = np.random.normal(1, size=S // 4)
    w1 = np.random.normal(2, size=S // 4)
    w2 = np.random.normal(2, size=S // 4)
    w3 = np.random.normal(2, size=S // 4)
    # Signal columns are interleaved: column j (j < S) belongs to group j % 4.
    signal = sum(X[:, g:S:4].dot(w) for g, w in enumerate((w0, w1, w2, w3)))
    y = signal + np.random.normal(0, 0.5, size=N)
    # Return X, y, and the binary true discovery labels
    return X, y, np.concatenate([np.ones(S), np.zeros(P - S)])
def powerset(iterable):
    '''powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)'''
    from itertools import chain, combinations
    items = list(iterable)
    # Enumerate subsets grouped by size, from the empty set up to the full set,
    # then flatten into a single lazy stream of tuples.
    subsets_by_size = (combinations(items, size) for size in range(len(items) + 1))
    return chain.from_iterable(subsets_by_size)
def enumerate_ols(X, y):
    '''Enumerates out the powerset of possible covariate subsets and fits each one.

    Returns a list of (variables, fit) pairs, where variables is an integer
    index array into X's columns and fit is the fitted statsmodels OLS result.
    '''
    fits = []
    for variables in powerset(np.arange(X.shape[1])):
        # Skip the empty subset: OLS needs at least one regressor.
        if len(variables) == 0:
            continue
        subset = np.array(variables)
        fits.append((subset, sm.OLS(y, X[:, subset]).fit()))
    return fits
if __name__ == '__main__':
    n_trials = 100
    delta_threshold = 2
    N = 50
    P = 8
    S = 4
    null_p_values = []
    for trial in range(n_trials):
        print(trial)
        # Generate 50 samples with 8 covariates where the first 4 are true positives.
        # All covariates have correlation 0.5.
        X, y, _ = generalized_liang_sim_xy(N=N, P=P, S=S)
        # Try all the different models and evaluate via AICc
        fits = enumerate_ols(X, y)
        # df_model excludes the implicit scale parameter, hence the +1.
        scores = np.array([sm.tools.eval_measures.aicc(fit.llf, fit.nobs, fit.df_model + 1)
                           for _, fit in fits])
        # Select all models with at most AICc delta of 2
        # deltas = scores - scores.min()
        # selected = deltas <= delta_threshold
        # Get the best as selected by AICc
        best_variables, best_fit = fits[np.argmin(scores)]
        # Add all p-values for null variables to the list
        null_mask = best_variables >= S
        print(best_fit.pvalues, best_variables, best_fit.pvalues[null_mask])
        null_p_values.extend(best_fit.pvalues[null_mask])
    null_p_values = np.array(null_p_values)
    # Get the sorted values from smallest to largest
    p_sorted = np.sort(null_p_values)
    # Get the theoretical and empirical CDFs
    theoretical = np.linspace(0, 1, len(p_sorted))
    empirical = (p_sorted[:, None] < theoretical[None]).mean(axis=0)
    # Valid p-values would track the diagonal; post-selection inflation
    # shows up as the empirical CDF rising above it.
    plt.plot(theoretical, empirical, color='orange', label='Actual p-values')
    plt.plot([0, 1], [0, 1], color='black', label='Valid p-values')
    plt.ylim([0, 1])
    plt.xlim([0, 1])
    plt.xlabel('Theoretical null p-value CDF')
    plt.ylabel('Actual null p-value CDF')
    plt.legend(loc='lower right')
    plt.savefig('aicc-select.pdf', bbox_inches='tight')
    plt.close()
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.