mice_n_imputations_experiment.py
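"""How should a fixed budget of 100 MICE rounds be split between burn-in and
averaged imputations?

Adds missing values (encoded as 0) to 75% of the Boston housing rows, then
compares the cross-validated MSE of a RandomForestRegressor on the complete
data against a MICEImputer + RandomForestRegressor pipeline, sweeping
n_imputations over 1, 11, ..., 91 with n_burn_in = 100 - n_imputations.
"""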
import numpy as np
from sklearn.impute import MICEImputer  # only in the scikit-learn development branch; see note at the end
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(context='poster')

rng = np.random.RandomState(0)

X_full, y_full = load_boston(return_X_y=True)
n_samples = X_full.shape[0]
n_features = X_full.shape[1]

# Baseline: cross-validated score on the complete data
rfr = RandomForestRegressor(random_state=0, n_estimators=100)
full_scores = cross_val_score(rfr, X_full, y_full,
                              scoring='neg_mean_squared_error')

# Add missing values in 75% of the rows
missing_rate = 0.75
n_missing_samples = int(np.floor(n_samples * missing_rate))
missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                      dtype=bool),
                             np.ones(n_missing_samples,
                                     dtype=bool)))
rng.shuffle(missing_samples)
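# For each affected row, pick one feature uniformly at random and overwrite it
# with 0, the missing-value marker the imputer is configured with below.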
missing_features = rng.randint(0, n_features, n_missing_samples)
X_missing = X_full.copy()
X_missing[np.where(missing_samples)[0], missing_features] = 0
y_missing = y_full.copy()

# Estimate the score after imputation (MICE strategy) of the missing values
def get_mice_impute_scores(n_burn_in, n_imputations, random_state):
    # Impute with MICE, then cross-validate a random forest on the imputed data.
    estimator = Pipeline([("imputer", MICEImputer(n_burn_in=n_burn_in,
                                                  n_imputations=n_imputations,
                                                  missing_values=0,
                                                  random_state=random_state)),
                          ("forest", RandomForestRegressor(random_state=random_state,
                                                           n_estimators=100))])
    mice_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')
    return mice_impute_scores.mean()
# Sweep the split between burn-in and averaged imputations,
# keeping n_burn_in + n_imputations = 100, over many random seeds
n_imputations_sweep = np.arange(1, 100, 10)
runs = 100
mice_impute_scores = np.zeros((runs, len(n_imputations_sweep)))
for i in range(runs):
    for j, n_imputations in enumerate(n_imputations_sweep):
        n_burn_in = 100 - n_imputations
        score = get_mice_impute_scores(n_burn_in, n_imputations, i)
        mice_impute_scores[i, j] = score
# Plot results: baseline on complete data vs. MICE-imputed scores
plt.figure(figsize=(20, 10))
plt.plot(np.arange(1, 100, 1), -full_scores.mean() * np.ones(99))
plt.errorbar(n_imputations_sweep, -1 * mice_impute_scores.mean(0),
             mice_impute_scores.std(0))
plt.xlabel('last n_imputations averaged')
plt.ylabel('average left-out mean_squared_error')
plt.title('Effect of averaging last n_imputations in MICE when n_burn_in + n_imputations = 100')
plt.legend(('results without missing data', 'results with missing data'))
plt.show()
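Note: MICEImputer only ever lived in the scikit-learn development branch; in released versions (0.21+) the class is IterativeImputer, gated behind an experimental import, and the n_burn_in / n_imputations arguments no longer exist. The sketch below is one possible adaptation rather than a drop-in replacement: it approximates multiple imputation by pooling cross-validated MSE over several posterior-sampled imputers with different seeds, instead of averaging the last n_imputations rounds inside a single imputer. The function name pooled_mice_like_score and its defaults are mine, not part of any library.

import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa: F401, activates IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

def pooled_mice_like_score(X, y, n_imputations=5, max_iter=10, random_state=0):
    # Pool cross-validated MSE over several posterior-sampled imputations,
    # one freshly seeded IterativeImputer per draw.
    scores = []
    for k in range(n_imputations):
        estimator = Pipeline([
            ("imputer", IterativeImputer(missing_values=0,
                                         sample_posterior=True,
                                         max_iter=max_iter,
                                         random_state=random_state + k)),
            ("forest", RandomForestRegressor(n_estimators=100,
                                             random_state=random_state)),
        ])
        scores.append(-cross_val_score(estimator, X, y,
                                       scoring='neg_mean_squared_error').mean())
    return float(np.mean(scores))

Calling pooled_mice_like_score(X_missing, y_missing) gives a single pooled MSE roughly comparable to -get_mice_impute_scores(...) above, though the two pooling schemes are not identical.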