Created
May 28, 2020 15:36
-
-
Save pierrelouisbescond/02479152212163d82de635b66299978c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Benchmark script: measures how each model's test score evolves as a growing
# share of the training features is replaced by NaN and then imputed.
# It relies on names defined earlier in the file: `models`, `model_names`,
# `X_train`, `y_train`, `X_test`, `y_test`, `pd`, `np` and the imputer classes.

# Incremental steps to perform (1..20); each step scales the NaN percentage
steps = list(range(1, 21))

# Imputation methods under test...
imputation_methods = [
    SimpleImputer(strategy='constant', fill_value=0),
    SimpleImputer(strategy='mean'),
    IterativeImputer(),
    KNNImputer(),
]
# ...and their corresponding short names (same order), used in column titles
imputation_methods_names = ["0", "mean", "iter", "KNN"]

# One results column per (model, imputation method) combination
variations_columns = [
    model_name + "_" + imputation_name
    for model_name in model_names
    for imputation_name in imputation_methods_names
]

# Benchmark results: one row per step, one column per combination, zero-filled
results = pd.DataFrame(
    np.zeros((len(steps), len(variations_columns))),
    columns=variations_columns,
    index=steps,
)

# At each iteration, the percentage of NaN is step_pct multiplied by the step
# Ex. step_pct = 1 -> 1%, 2%, 3%, ...
# Ex. step_pct = 2 -> 2%, 4%, 6%, ...
step_pct = 0.5

for i in steps:

    # Each cell of X_train is independently masked (replaced by NaN) with
    # probability 0.01 * step_pct * i. No random seed is set here, so the
    # mask differs between runs.
    # NOTE(review): `.mask` implies X_train is a pandas object - confirm.
    X_train_NaN = X_train.mask(np.random.random(X_train.shape) < 0.01 * step_pct * i)

    # For every imputation method, the corrupted training set is imputed once
    # and then shared by all the models.
    for imputation_method, imputation_name in zip(imputation_methods, imputation_methods_names):

        # We impute the NaN values
        X_train_NaN_corrected = imputation_method.fit_transform(X_train_NaN)

        for model in models:

            # Model is trained with the corrupted / imputed data
            model.fit(X_train_NaN_corrected, y_train)

            # Model performance is evaluated once against the original
            # (uncorrupted) test dataset...
            score = model.score(X_test, y_test)

            # ...and recorded into the results DataFrame.
            # NOTE(review): the column key uses the model's class name while
            # the columns were built from `model_names`; presumably both hold
            # the same strings - verify, otherwise `.loc` would add new
            # columns instead of filling the pre-allocated ones.
            results.loc[i, type(model).__name__ + "_" + imputation_name] = score

    print("[X] Benchmark done for", i*step_pct, "%.")

# Re-label the index from step numbers to the corresponding NaN percentages
results.index = results.index * step_pct
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment