Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save pierrelouisbescond/02479152212163d82de635b66299978c to your computer and use it in GitHub Desktop.
Save pierrelouisbescond/02479152212163d82de635b66299978c to your computer and use it in GitHub Desktop.
# Benchmark: how robust is each (model, imputation method) pair as the
# fraction of missing values in the training data grows?
#
# NOTE(review): relies on names defined earlier in the file: `models`,
# `model_names`, `X_train`, `y_train`, `X_test`, `y_test`, and the
# sklearn imputer classes (SimpleImputer, IterativeImputer, KNNImputer).

# Incremental steps to perform; step i corrupts (i * step_pct)% of X_train.
steps = list(range(1, 21))

# The imputation methods under comparison.
imputation_methods = [
    SimpleImputer(strategy='constant', fill_value=0),
    SimpleImputer(strategy='mean'),
    IterativeImputer(),
    KNNImputer(),
]
# Short labels, positionally matched to `imputation_methods`,
# used to build the result column names.
imputation_methods_names = ["0", "mean", "iter", "KNN"]

# Column titles: one per (model, imputation method) combination,
# used to register the performance of each combination.
variations_columns = [
    model + "_" + method_name
    for model in model_names
    for method_name in imputation_methods_names
]

# Benchmark results: one row per step, one column per combination.
results = pd.DataFrame(
    np.zeros((len(steps), len(variations_columns))),
    columns=variations_columns,
    index=steps,
)

# At each iteration, the percentage of NaN will be multiplied by step_pct
# Ex. step_pct = 1 -> 1%, 2%, 3%, ...
# Ex. step_pct = 2 -> 2%, 4%, 6%, ...
step_pct = 0.5

for i in steps:
    # Transform the initial X_train to include a %age of NaN (= step_pct * i %).
    X_train_NaN = X_train.mask(np.random.random(X_train.shape) < 0.01 * step_pct * i)

    # For every imputation method and every model trained on the corrupted and
    # imputed data, evaluate the performance on the original test dataset.
    # zip() replaces the original manual `k` counter, which was error-prone.
    for method_name, imputation_method in zip(imputation_methods_names,
                                              imputation_methods):
        # Impute the NaN values once per method (shared by every model).
        X_train_NaN_corrected = imputation_method.fit_transform(X_train_NaN)

        for model in models:
            # Model is trained with the corrupted / imputed data.
            model.fit(X_train_NaN_corrected, y_train)
            # Evaluate once against the original test dataset and record it
            # (the original scored twice, doing the evaluation work twice).
            score = model.score(X_test, y_test)
            results.loc[i, type(model).__name__ + "_" + method_name] = score

    print("[X] Benchmark done for", i * step_pct, "%.")

# Re-label the index from step number to the actual NaN percentage.
results.index = results.index * step_pct
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment