Skip to content

Instantly share code, notes, and snippets.

@adhadse
Last active April 5, 2021 11:58
Show Gist options
  • Save adhadse/d89be5b222c17253155b9ef878d9aa3a to your computer and use it in GitHub Desktop.
Save adhadse/d89be5b222c17253155b9ef878d9aa3a to your computer and use it in GitHub Desktop.
def check_outliers_std(dataframe, col_name):
    """Return whether ``col_name`` has any value outside its outlier thresholds.

    The thresholds are obtained from ``determine_outlier_thresholds_std``
    (defined outside this block), which is expected to return a
    ``(lower, upper)`` pair for the column.

    Args:
        dataframe: The pandas DataFrame holding the column.
        col_name: Label of the column to inspect.

    Returns:
        bool: True if at least one value is strictly greater than the upper
        threshold or strictly less than the lower threshold, else False.
    """
    lower_boundary, upper_boundary = determine_outlier_thresholds_std(dataframe, col_name)
    values = dataframe[col_name]
    outlier_mask = (values > upper_boundary) | (values < lower_boundary)
    # Reduce the boolean mask itself. Filtering the frame and calling
    # ``.any(axis=None)`` on the result asks "is any cell in the outlier rows
    # truthy?", which wrongly answers False when those rows hold only
    # zeros/falsy values (e.g. a one-column frame whose outlier is 0).
    return bool(outlier_mask.any())
def replace_with_thresholds_std(dataframe, cols, replace=False):
    """Print an outlier report per column, optionally clamping outliers in place.

    For every column in ``cols`` except ``'Outcome'`` (skipped; presumably the
    label column -- confirm against the dataset), the function:

    1. asks ``check_outliers_std`` whether the column currently has outliers,
    2. fetches the ``(lower, upper)`` limits from
       ``determine_outlier_thresholds_std``,
    3. if outliers were found, counts them and -- when ``replace`` is true --
       overwrites values above the upper limit with the upper limit, and
       values below the lower limit with the lower limit (the latter only
       when the lower limit is not negative),
    4. re-checks the column and records one report row.

    The report is printed as an ``rst``-style table via ``tabulate``.

    Args:
        dataframe: pandas DataFrame to inspect; modified in place when
            ``replace`` is true.
        cols: Iterable of column labels to process.
        replace: When true, clamp outliers to the limits; when false, only
            report.

    Returns:
        None. Output goes to stdout.
    """
    from tabulate import tabulate

    report_headers = ['Outlier (Previously)', 'Outliers', 'Count', 'Column', 'Lower Limit', 'Upper Limit']
    report_rows = []

    for col_name in cols:
        if col_name == 'Outcome':
            continue

        had_outliers = check_outliers_std(dataframe, col_name)
        lower_limit, upper_limit = determine_outlier_thresholds_std(dataframe, col_name)

        # Stays None for columns where no outliers were detected.
        count = None
        if had_outliers:
            column = dataframe[col_name]
            outside_limits = (column > upper_limit) | (column < lower_limit)
            count = dataframe.loc[outside_limits, col_name].count()

            if replace:
                # A negative lower limit is never written into the data, so
                # low-side values are only clamped when the limit is >= 0.
                if not lower_limit < 0:
                    dataframe.loc[dataframe[col_name] < lower_limit, col_name] = lower_limit
                # The high side is clamped in either case.
                dataframe.loc[dataframe[col_name] > upper_limit, col_name] = upper_limit

        # Re-check so the report shows the state after any replacement.
        has_outliers_now = check_outliers_std(dataframe, col_name)
        report_rows.append([had_outliers, has_outliers_now, count, col_name, lower_limit, upper_limit])

    rendered = tabulate(report_rows, headers=report_headers, tablefmt='rst', numalign='right')
    print("Removing Outliers using 3 Standard Deviation")
    print(rendered)
# Report-only run over every column of ``df``: with replace=False the
# replacement branch is skipped and the frame is left unmodified.
replace_with_thresholds_std(dataframe=df, cols=df.columns, replace=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment