Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save pierrelouisbescond/02479152212163d82de635b66299978c to your computer and use it in GitHub Desktop.
Save pierrelouisbescond/02479152212163d82de635b66299978c to your computer and use it in GitHub Desktop.
# Benchmark: how robust is each (model, imputation method) pair as the
# fraction of missing values in the training data grows?
#
# NOTE(review): relies on names defined earlier in the file: `models`,
# `model_names`, `X_train`, `y_train`, `X_test`, `y_test`, and the
# sklearn imputer classes (SimpleImputer, IterativeImputer, KNNImputer).

# Incremental steps to perform; step i corrupts (i * step_pct)% of X_train.
steps = list(range(1, 21))

# The imputation methods under comparison.
imputation_methods = [
    SimpleImputer(strategy='constant', fill_value=0),
    SimpleImputer(strategy='mean'),
    IterativeImputer(),
    KNNImputer(),
]
# Short labels, positionally matched to `imputation_methods`,
# used to build the result column names.
imputation_methods_names = ["0", "mean", "iter", "KNN"]

# Column titles: one per (model, imputation method) combination,
# used to register the performance of each combination.
variations_columns = [
    model + "_" + method_name
    for model in model_names
    for method_name in imputation_methods_names
]

# Benchmark results: one row per step, one column per combination.
results = pd.DataFrame(
    np.zeros((len(steps), len(variations_columns))),
    columns=variations_columns,
    index=steps,
)

# At each iteration, the percentage of NaN will be multiplied by step_pct
# Ex. step_pct = 1 -> 1%, 2%, 3%, ...
# Ex. step_pct = 2 -> 2%, 4%, 6%, ...
step_pct = 0.5

for i in steps:
    # Transform the initial X_train to include a %age of NaN (= step_pct * i %).
    X_train_NaN = X_train.mask(np.random.random(X_train.shape) < 0.01 * step_pct * i)

    # For every imputation method and every model trained on the corrupted and
    # imputed data, evaluate the performance on the original test dataset.
    # zip() replaces the original manual `k` counter, which was error-prone.
    for method_name, imputation_method in zip(imputation_methods_names,
                                              imputation_methods):
        # Impute the NaN values once per method (shared by every model).
        X_train_NaN_corrected = imputation_method.fit_transform(X_train_NaN)

        for model in models:
            # Model is trained with the corrupted / imputed data.
            model.fit(X_train_NaN_corrected, y_train)
            # Evaluate once against the original test dataset and record it
            # (the original scored twice, doing the evaluation work twice).
            score = model.score(X_test, y_test)
            results.loc[i, type(model).__name__ + "_" + method_name] = score

    print("[X] Benchmark done for", i * step_pct, "%.")

# Re-label the index from step number to the actual NaN percentage.
results.index = results.index * step_pct
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment