adhadse · April 5, 2021 11:58
diff --git a/determining_removing_outliers_std.py b/determining_removing_outliers_std.py
 def check_outliers_std(dataframe, col_name):
    """Report whether ``col_name`` holds any value outside its outlier thresholds.

    The bounds are obtained from ``determine_outlier_thresholds_std``. A value
    counts as an outlier when it is strictly greater than the upper bound or
    strictly less than the lower bound.

    Args:
        dataframe: Table to inspect; the column is read as ``dataframe[col_name]``.
        col_name: Label of the column to test.

    Returns:
        bool: ``True`` if at least one value lies outside the bounds,
        ``False`` otherwise.
    """
    lower_boundary, upper_boundary = determine_outlier_thresholds_std(dataframe, col_name)
    column = dataframe[col_name]
    outside = (column > upper_boundary) | (column < lower_boundary)
    # Reduce the mask itself instead of the truthiness of the selected rows:
    # a selected row whose cells are all zero/NaN would otherwise be reported
    # as "no outliers" even though it was flagged.
    return bool(outside.any())

 def replace_with_thresholds_std(dataframe, cols, replace=False):
    """Print a per-column outlier report and optionally cap outliers in place.

    Every column in ``cols`` except ``'Outcome'`` is checked with
    ``check_outliers_std`` and its limits are taken from
    ``determine_outlier_thresholds_std``. When ``replace`` is true, values
    above the upper limit are overwritten with the upper limit; values below
    the lower limit are overwritten with the lower limit only when that limit
    is not negative.

    Args:
        dataframe: Table to inspect; modified in place when ``replace`` is true.
        cols: Iterable of column labels to process.
        replace: If true, cap out-of-range values; otherwise only report.

    Returns:
        None. The report is printed with ``tabulate``.
    """
    from tabulate import tabulate

    report_rows = []
    for col_name in cols:
        if col_name == 'Outcome':
            continue

        had_outliers = check_outliers_std(dataframe, col_name)
        lower_limit, upper_limit = determine_outlier_thresholds_std(dataframe, col_name)

        outlier_count = None
        if had_outliers:
            out_of_range = (dataframe[col_name] > upper_limit) | (dataframe[col_name] < lower_limit)
            outlier_count = dataframe[out_of_range][col_name].count()
            if replace:
                # A negative lower limit is never written into the data, so in
                # that case only the upper side is capped.
                if not lower_limit < 0:
                    dataframe.loc[dataframe[col_name] < lower_limit, col_name] = lower_limit
                dataframe.loc[dataframe[col_name] > upper_limit, col_name] = upper_limit

        # Checked again so the report shows the state after any replacement.
        has_outliers_now = check_outliers_std(dataframe, col_name)
        report_rows.append(
            [had_outliers, has_outliers_now, outlier_count, col_name, lower_limit, upper_limit]
        )

    column_titles = ['Outlier (Previously)', 'Outliers', 'Count', 'Column', 'Lower Limit', 'Upper Limit']
    table = tabulate(report_rows, headers=column_titles, tablefmt='rst', numalign='right')
    print("Removing Outliers using 3 Standard Deviation")
    print(table)

 # Report-only run over every column: replace=False leaves the data untouched.
 replace_with_thresholds_std(dataframe=df, cols=df.columns, replace=False)
	def check_outliers_std(dataframe, col_name):
	    """Report whether ``col_name`` holds any value outside its outlier thresholds.

	    The bounds are obtained from ``determine_outlier_thresholds_std``. A value
	    counts as an outlier when it is strictly greater than the upper bound or
	    strictly less than the lower bound.

	    Args:
	        dataframe: Table to inspect; the column is read as ``dataframe[col_name]``.
	        col_name: Label of the column to test.

	    Returns:
	        bool: ``True`` if at least one value lies outside the bounds,
	        ``False`` otherwise.
	    """
	    lower_boundary, upper_boundary = determine_outlier_thresholds_std(dataframe, col_name)
	    column = dataframe[col_name]
	    # Plain ``|`` combines the two element-wise comparisons; the escaped
	    # ``\|`` form is not valid Python.
	    outside = (column > upper_boundary) | (column < lower_boundary)
	    # Reduce the mask itself instead of the truthiness of the selected rows:
	    # a selected row whose cells are all zero/NaN would otherwise be reported
	    # as "no outliers" even though it was flagged.
	    return bool(outside.any())

	def replace_with_thresholds_std(dataframe, cols, replace=False):
	    """Print a per-column outlier report and optionally cap outliers in place.

	    Every column in ``cols`` except ``'Outcome'`` is checked with
	    ``check_outliers_std`` and its limits are taken from
	    ``determine_outlier_thresholds_std``. When ``replace`` is true, values
	    above the upper limit are overwritten with the upper limit; values below
	    the lower limit are overwritten with the lower limit only when that limit
	    is not negative.

	    Args:
	        dataframe: Table to inspect; modified in place when ``replace`` is true.
	        cols: Iterable of column labels to process.
	        replace: If true, cap out-of-range values; otherwise only report.

	    Returns:
	        None. The report is printed with ``tabulate``.
	    """
	    from tabulate import tabulate

	    data = []
	    for col_name in cols:
	        if col_name == 'Outcome':
	            continue

	        outliers_ = check_outliers_std(dataframe, col_name)
	        count = None
	        lower_limit, upper_limit = determine_outlier_thresholds_std(dataframe, col_name)

	        if outliers_:
	            # Plain ``|`` is the element-wise OR of the two comparisons.
	            out_of_range = (dataframe[col_name] > upper_limit) | (dataframe[col_name] < lower_limit)
	            count = dataframe[out_of_range][col_name].count()
	            if replace:
	                # A negative lower limit is never written into the data, so
	                # in that case only the upper side is capped.
	                if not lower_limit < 0:
	                    dataframe.loc[dataframe[col_name] < lower_limit, col_name] = lower_limit
	                dataframe.loc[dataframe[col_name] > upper_limit, col_name] = upper_limit

	        # Checked again so the report shows the state after any replacement.
	        outliers_status = check_outliers_std(dataframe, col_name)
	        data.append([outliers_, outliers_status, count, col_name, lower_limit, upper_limit])

	    table = tabulate(
	        data,
	        headers=['Outlier (Previously)', 'Outliers', 'Count', 'Column', 'Lower Limit', 'Upper Limit'],
	        tablefmt='rst',
	        numalign='right',
	    )
	    print("Removing Outliers using 3 Standard Deviation")
	    print(table)

	# Report-only run over every column: replace=False leaves the data untouched.
	replace_with_thresholds_std(dataframe=df, cols=df.columns, replace=False)