Skip to content

Instantly share code, notes, and snippets.

@naranjja
Created September 5, 2018 03:17
Show Gist options
  • Save naranjja/2bd29810e6de6de6f2070a7ebd022344 to your computer and use it in GitHub Desktop.
Save naranjja/2bd29810e6de6de6f2070a7ebd022344 to your computer and use it in GitHub Desktop.
Remove outliers using Pandas
import pandas as pd
import numpy as np
def drop_outliers(df, field_name):
    """Drop, in place, the rows of *df* whose *field_name* value is an IQR outlier.

    A value is treated as an outlier when it lies more than 1.5 times the
    interquartile range above the 75th percentile or below the 25th
    percentile of the column. Both percentiles are computed with
    ``np.nanpercentile``, so NaN entries are ignored when deriving the
    fences; NaN values compare false against both fences and their rows
    are therefore kept.

    Args:
        df: DataFrame to filter. It is modified in place.
        field_name: Label of the numeric column to examine.

    Returns:
        None. Matching rows are removed from *df* by index label.
    """
    # Compute both quartiles once, before any row is removed, so the upper
    # and lower fences are derived from the same data. Recomputing the 25th
    # percentile after dropping the high outliers would shift the lower fence.
    q1, q3 = np.nanpercentile(df[field_name], [25, 75])
    distance = 1.5 * (q3 - q1)

    values = df[field_name]
    is_outlier = (values > distance + q3) | (values < q1 - distance)
    df.drop(df[is_outlier].index, inplace=True)
if __name__ == "__main__":
    # NOTE: `df` is assumed to already exist and to contain numeric variables.
    print(df.shape)  # shape before any rows are dropped

    # Run the outlier filter once per numeric column; each call mutates `df`.
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    for name in numeric_columns:
        drop_outliers(df, name)

    print(df.shape)  # shape after filtering every numeric column
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment