import numpy as np
from statsmodels.nonparametric.smoothers_lowess import lowess
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, RepeatedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, brier_score_loss
from sklearn.utils import resample
import matplotlib.pyplot as plt
# Load in the data to be used.
data = load_breast_cancer()
X, y = data['data'], data['target']
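# X has shape (569, 30); y is binary, with 0 = malignant and 1 = benign
# (sklearn's encoding for this dataset).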
# The model will be an L2 penalized logistic regression.
# We have to select the appropriate penalty strength, and that requires cross validation.
# There are some convergence issues, so pick a stupidly large max iteration count.
pipeline_components = [
    ('scaling', StandardScaler()),
    ('logistic_regression', LogisticRegression(penalty='l2', max_iter=1_000_000))
]
a_model = Pipeline(pipeline_components)
param_grid = {'logistic_regression__C': 1/np.logspace(-2, 2, base=np.exp(1))}
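# np.logspace with base=e gives 50 values from exp(-2) to exp(2); taking the
# reciprocal yields C values running from exp(2) down to exp(-2), evenly
# spaced on the log scale.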
# Choose a proper scoring rule for model selection.
# Note: brier_score_loss is a loss, so greater_is_better=False is needed;
# otherwise GridSearchCV would maximize the Brier score and pick the worst C.
brier_score = make_scorer(brier_score_loss, greater_is_better=False, needs_proba=True, pos_label=1)
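# For binary outcomes the Brier score is mean((p_hat - y)^2). Being a proper
# scoring rule, it is minimized in expectation by the true probabilities,
# so it rewards calibration as well as discrimination.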
# According to Frank Harrell, 100 repeats of 10-fold CV is about as good as the optimism corrected bootstrap.
outer_cv = RepeatedKFold(n_splits=10, n_repeats=100)
# We will choose our regularization strength through 10-fold CV.
inner_cv = KFold(n_splits=10)
gscv = GridSearchCV(a_model, param_grid=param_grid, cv=inner_cv, scoring=brier_score, verbose=0)
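# Nesting gscv inside cross_val_score below gives nested CV: the inner loop
# selects C, while the outer loop scores the entire selection procedure, so
# the hyperparameter search cannot leak into the performance estimate.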
# Now let's estimate how well a model selected via 10-fold CV would do, by using repeated CV.
# This is going to take a long time; flip the guard to True to run it.
if False:
    cv_results = cross_val_score(gscv, X, y, cv=outer_cv, scoring=brier_score)
    # cross_val_score reports the negated loss, so negate again to print the Brier score.
    print(-cv_results.mean())
# Construct the apparent calibration.
prange = np.linspace(0, 1, 25)
# Fit our model on all the data.
best_model = gscv.fit(X, y).best_estimator_
# Estimate the risks from the best model.
predicted_p = best_model.predict_proba(X)[:, 1]
# Compute the apparent calibration.
apparent_cal = lowess(y, predicted_p, it=0, xvals=prange)
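# lowess here regresses the binary outcome on the predicted risk; it=0 skips
# the robustness iterations (which can behave poorly with 0/1 outcomes), and
# xvals=prange evaluates the smooth on a fixed grid so that curves from
# different fits can be compared (and averaged) pointwise.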
# Do the Optimism Corrected Bootstrap for the Calibration Curve.
nsim = 500
optimism = np.zeros((nsim, prange.size))
for i in range(nsim):
    # Bootstrap the original dataset.
    Xb, yb = resample(X, y)
    # Fit the model, including the hyperparameter selection, on the bootstrapped data.
    fit = gscv.fit(Xb, yb).best_estimator_
    # Get the risk estimates from the model fit on the bootstrapped data.
    predicted_risk_bs = fit.predict_proba(Xb)[:, 1]
    # Fit a calibration curve to the predicted risk on the bootstrapped data.
    smooth_p_bs = lowess(yb, predicted_risk_bs, it=0, xvals=prange)
    # Apply the bootstrap model to the original data.
    predicted_risk_orig = fit.predict_proba(X)[:, 1]
    # Fit a calibration curve on the original data using predictions from the bootstrap model.
    smooth_p_bs_orig = lowess(y, predicted_risk_orig, it=0, xvals=prange)
    optimism[i] = smooth_p_bs - smooth_p_bs_orig
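# Each row of optimism estimates, at every grid point, how much a curve fit
# and evaluated on the same (bootstrap) data flatters the model, relative to
# evaluating that same model on the original data.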
# Here is the bias corrected calibration curve.
bias_corrected_cal = apparent_cal - optimism.mean(0)
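# Note the corrected curve is not constrained to [0, 1]; if desired, clip it
# before plotting (an optional step, not part of the original analysis):
# bias_corrected_cal = np.clip(bias_corrected_cal, 0, 1)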
# And finally, the plot.
fig, ax = plt.subplots(dpi=240)
ax.set_aspect('equal')
plt.scatter(predicted_p, y, s=1, alpha=0.4, c='k')
plt.plot(prange, apparent_cal, 'red', label='Non-Parametric Estimate')
plt.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')
plt.plot(prange, bias_corrected_cal, 'C0', label='Optimism Corrected Calibration')
plt.legend(loc='upper left', prop={'size': 6})
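# Optional finishing touches (labels here are assumed, not from the source):
ax.set_xlabel('Predicted risk')
ax.set_ylabel('Observed outcome')
plt.show()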