Skip to content

Instantly share code, notes, and snippets.

@xoelop
Created February 11, 2020 15:01
Show Gist options
  • Save xoelop/5e82c5ed06234adef7332b0bad3e5c86 to your computer and use it in GitHub Desktop.
Save xoelop/5e82c5ed06234adef7332b0bad3e5c86 to your computer and use it in GitHub Desktop.
Code to visualize and clean outliers in Python
from typing import Union

import numpy as np
import pandas as pd
# dataviz imports
import plotly as py
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import cufflinks as cf
# Visualizing outliers: for every numeric column of `df`, plot the column's
# value at 1001 evenly spaced quantile levels between 0 and 1, using a
# log-scaled y axis.
(
    df.select_dtypes(include='number')
    .quantile(q=np.linspace(0, 1, num=1001))
    .iplot(
        logy=True,
        xTitle='percentile',
        yTitle='value',
        layout_update=dict(hovermode='x'),
    )
)
# replacing outliers by NaNs (DataFrame) / filtering them out (Series)
def drop_outliers(data: Union[pd.DataFrame, pd.Series],
                  min_quantile: float = 0, max_quantile: float = 1,
                  min_value: float = -np.inf, max_value: float = np.inf
                  ) -> Union[pd.DataFrame, pd.Series]:
    """Keep only the values of ``data`` that lie inside the requested range.

    A value is kept when it satisfies all four conditions:

    * it is greater than or equal to ``data.quantile(min_quantile)``,
    * it is less than or equal to ``data.quantile(max_quantile)``,
    * it is strictly greater than ``min_value``,
    * it is strictly less than ``max_value``.

    For a DataFrame the quantiles are computed per column.

    Args:
        data: Numeric DataFrame or Series to clean.
        min_quantile: Lower quantile level in [0, 1]; the bound is inclusive.
        max_quantile: Upper quantile level in [0, 1]; the bound is inclusive.
        min_value: Exclusive absolute lower bound.
        max_value: Exclusive absolute upper bound.

    Returns:
        For a DataFrame, a frame of the same shape in which every
        out-of-range cell is NaN. For a Series, a Series containing only the
        in-range entries (out-of-range entries are removed, not set to NaN).
        Values that are already NaN never satisfy the comparisons, so they
        stay NaN in a DataFrame and are removed from a Series.
    """
    # The quantile bounds are inclusive on purpose: quantile(0) is the minimum
    # and quantile(1) is the maximum, so strict comparisons would discard the
    # extreme values even when the caller left the quantile defaults untouched.
    lower_bound = data.quantile(min_quantile)
    upper_bound = data.quantile(max_quantile)

    in_range = (
        (data >= lower_bound)
        & (data <= upper_bound)
        & (data > min_value)
        & (data < max_value)
    )

    # Boolean indexing masks cells to NaN for a DataFrame mask and filters
    # rows for a Series mask.
    return data[in_range]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment