Last active
March 31, 2022 02:52
-
-
Save wolframalpha/1681593cfeeeaff25f04f0d77e1dbd6e to your computer and use it in GitHub Desktop.
Function to remove or cap outliers in columns of a `pandas.DataFrame`
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def treatoutliers(self, df=None, columns=None, factor=1.5, method='IQR', treament='cap'):
    """Cap or remove outliers column by column and store the result on ``self.df``.

    For every selected column a permitted range ``[floor, ceil]`` is derived
    from the column's current values:

    * ``method='IQR'``: ``Q1 - factor * IQR`` .. ``Q3 + factor * IQR``
    * ``method='STD'``: ``mean - factor * std`` .. ``mean + factor * std``

    Columns are processed in order, so with ``treament='remove'`` the bounds
    of later columns are computed on the rows that survived earlier columns.

    :param df: DataFrame to treat; ``self.df`` is used when it is ``None``.
    :param columns: column label or iterable of labels to treat. When
        ``None`` or empty, ``self.mandatory_cols_ + self.optional_cols_ +
        [self.target_col]`` is used.
    :param factor: multiplier applied to the IQR / standard deviation.
    :param method: ``'IQR'`` or ``'STD'``.
    :param treament: ``'cap'`` clips values into the permitted range;
        ``'remove'`` drops rows outside it (rows holding NaN in the column
        fail both comparisons and are dropped too). The parameter name keeps
        its historical spelling so existing keyword callers continue to work.
    :return: the treated DataFrame, which is also assigned to ``self.df``.
    :raises ValueError: if ``method`` or ``treament`` is not a supported value.

    Note: in ``'cap'`` mode the columns are reassigned on the frame object
    that was supplied (or on ``self.df``), so that object is modified in place.
    """
    if method not in ('STD', 'IQR'):
        raise ValueError("method must be 'STD' or 'IQR', got %r" % (method,))
    if treament not in ('cap', 'remove'):
        raise ValueError("treament must be 'cap' or 'remove', got %r" % (treament,))

    # Normalise `columns` to a plain list first: truth-testing a pandas Index
    # or numpy array raises, and iterating a bare string yields characters.
    if isinstance(columns, str):
        columns = [columns] if columns else None
    elif columns is not None:
        columns = list(columns)
    if not columns:
        columns = self.mandatory_cols_ + self.optional_cols_ + [self.target_col]

    # `not df` is ambiguous for a DataFrame and raises ValueError, so compare
    # against None explicitly.
    if df is None:
        df = self.df

    for column in columns:
        series = df[column]
        if method == 'STD':
            spread = factor * series.std()
            centre = series.mean()
            floor, ceil = centre - spread, centre + spread
        else:  # 'IQR'
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            iqr = q3 - q1
            floor, ceil = q1 - factor * iqr, q3 + factor * iqr

        if treament == 'remove':
            df = df[(series >= floor) & (series <= ceil)]
        else:  # 'cap'
            df[column] = series.clip(floor, ceil)

    self.df = df
    return df
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def treatoutliers(df, columns=None, factor=1.5, method='IQR', treament='cap'):
    """Cap or remove outliers in columns of a ``pandas.DataFrame``.

    For every selected column a permitted range ``[floor, ceil]`` is derived
    from the column's current values:

    * ``method='IQR'``: ``Q1 - factor * IQR`` .. ``Q3 + factor * IQR``
    * ``method='STD'``: ``mean - factor * std`` .. ``mean + factor * std``

    Columns are processed in order, so with ``treament='remove'`` the bounds
    of later columns are computed on the rows that survived earlier columns.

    :param df: DataFrame to treat.
    :param columns: column label or iterable of labels to treat. When
        ``None`` or empty, every column of ``df`` is used, so pass an explicit
        list if the frame holds columns these statistics cannot handle.
    :param factor: multiplier applied to the IQR / standard deviation.
    :param method: ``'IQR'`` or ``'STD'``.
    :param treament: ``'cap'`` clips values into the permitted range;
        ``'remove'`` drops rows outside it (rows holding NaN in the column
        fail both comparisons and are dropped too). The parameter name keeps
        its historical spelling so existing keyword callers continue to work.
    :return: the treated DataFrame.
    :raises ValueError: if ``method`` or ``treament`` is not a supported value.

    Note: in ``'cap'`` mode the columns are reassigned on ``df`` itself, so
    the caller's frame is modified in place; ``'remove'`` returns a filtered
    frame and leaves the input untouched.
    """
    if method not in ('STD', 'IQR'):
        raise ValueError("method must be 'STD' or 'IQR', got %r" % (method,))
    if treament not in ('cap', 'remove'):
        raise ValueError("treament must be 'cap' or 'remove', got %r" % (treament,))

    # Normalise `columns` to a plain list first: truth-testing a pandas Index
    # or numpy array raises, and iterating a bare string yields characters.
    if isinstance(columns, str):
        columns = [columns] if columns else None
    elif columns is not None:
        columns = list(columns)
    if not columns:
        columns = list(df.columns)

    for column in columns:
        series = df[column]
        if method == 'STD':
            spread = factor * series.std()
            centre = series.mean()
            floor, ceil = centre - spread, centre + spread
        else:  # 'IQR'
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            iqr = q3 - q1
            floor, ceil = q1 - factor * iqr, q3 + factor * iqr

        if treament == 'remove':
            df = df[(series >= floor) & (series <= ceil)]
        else:  # 'cap'
            df[column] = series.clip(floor, ceil)

    return df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment