mice_n_imputations_experiment.py
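"""How should a fixed budget of 100 MICE rounds be split between burn-in and
averaged imputations?

Adds missing values (encoded as 0) to 75% of the Boston housing rows, then
compares the cross-validated MSE of a RandomForestRegressor on the complete
data against a MICEImputer + RandomForestRegressor pipeline, sweeping
n_imputations over 1, 11, ..., 91 with n_burn_in = 100 - n_imputations.
"""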
import numpy as np
from sklearn.impute import MICEImputer  # only in the scikit-learn development branch; see note at the end
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(context='poster')

rng = np.random.RandomState(0)

X_full, y_full = load_boston(return_X_y=True)
n_samples = X_full.shape[0]
n_features = X_full.shape[1]

# Baseline: cross-validated score on the complete data
rfr = RandomForestRegressor(random_state=0, n_estimators=100)
full_scores = cross_val_score(rfr, X_full, y_full,
                              scoring='neg_mean_squared_error')

# Add missing values in 75% of the rows
missing_rate = 0.75
n_missing_samples = int(np.floor(n_samples * missing_rate))
missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                      dtype=bool),
                             np.ones(n_missing_samples,
                                     dtype=bool)))
rng.shuffle(missing_samples)
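# For each affected row, pick one feature uniformly at random and overwrite it
# with 0, the missing-value marker the imputer is configured with below.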
missing_features = rng.randint(0, n_features, n_missing_samples)
X_missing = X_full.copy()
X_missing[np.where(missing_samples)[0], missing_features] = 0
y_missing = y_full.copy()

# Estimate the score after imputation (MICE strategy) of the missing values
def get_mice_impute_scores(n_burn_in, n_imputations, random_state):
    # Impute with MICE, then cross-validate a random forest on the imputed data.
    estimator = Pipeline([("imputer", MICEImputer(n_burn_in=n_burn_in,
                                                  n_imputations=n_imputations,
                                                  missing_values=0,
                                                  random_state=random_state)),
                          ("forest", RandomForestRegressor(random_state=random_state,
                                                           n_estimators=100))])
    mice_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')
    return mice_impute_scores.mean()
# Sweep the split between burn-in and averaged imputations,
# keeping n_burn_in + n_imputations = 100, over many random seeds
n_imputations_sweep = np.arange(1, 100, 10)
runs = 100
mice_impute_scores = np.zeros((runs, len(n_imputations_sweep)))
for i in range(runs):
    for j, n_imputations in enumerate(n_imputations_sweep):
        n_burn_in = 100 - n_imputations
        score = get_mice_impute_scores(n_burn_in, n_imputations, i)
        mice_impute_scores[i, j] = score
# Plot results: baseline on complete data vs. MICE-imputed scores
plt.figure(figsize=(20, 10))
plt.plot(np.arange(1, 100, 1), -full_scores.mean() * np.ones(99))
plt.errorbar(n_imputations_sweep, -1 * mice_impute_scores.mean(0),
             mice_impute_scores.std(0))
plt.xlabel('last n_imputations averaged')
plt.ylabel('average left-out mean_squared_error')
plt.title('Effect of averaging last n_imputations in MICE when n_burn_in + n_imputations = 100')
plt.legend(('results without missing data', 'results with missing data'))
plt.show()
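Note: MICEImputer only ever lived in the scikit-learn development branch; in released versions (0.21+) the class is IterativeImputer, gated behind an experimental import, and the n_burn_in / n_imputations arguments no longer exist. The sketch below is one possible adaptation rather than a drop-in replacement: it approximates multiple imputation by pooling cross-validated MSE over several posterior-sampled imputers with different seeds, instead of averaging the last n_imputations rounds inside a single imputer. The function name pooled_mice_like_score and its defaults are mine, not part of any library.

import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa: F401, activates IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

def pooled_mice_like_score(X, y, n_imputations=5, max_iter=10, random_state=0):
    # Pool cross-validated MSE over several posterior-sampled imputations,
    # one freshly seeded IterativeImputer per draw.
    scores = []
    for k in range(n_imputations):
        estimator = Pipeline([
            ("imputer", IterativeImputer(missing_values=0,
                                         sample_posterior=True,
                                         max_iter=max_iter,
                                         random_state=random_state + k)),
            ("forest", RandomForestRegressor(n_estimators=100,
                                             random_state=random_state)),
        ])
        scores.append(-cross_val_score(estimator, X, y,
                                       scoring='neg_mean_squared_error').mean())
    return float(np.mean(scores))

Calling pooled_mice_like_score(X_missing, y_missing) gives a single pooled MSE roughly comparable to -get_mice_impute_scores(...) above, though the two pooling schemes are not identical.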