Created
February 11, 2020 15:01
-
-
Save xoelop/5e82c5ed06234adef7332b0bad3e5c86 to your computer and use it in GitHub Desktop.
Code to visualize and clean outliers in Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
# dataviz imports | |
import plotly as py | |
import plotly.express as px | |
import plotly.graph_objs as go | |
import plotly.figure_factory as ff | |
from plotly.subplots import make_subplots | |
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot | |
init_notebook_mode(connected=True) | |
import cufflinks as cf | |
# Visualizing outliers: for every numeric column of `df`, compute the value at
# 1001 evenly spaced quantile levels between 0 and 1 and plot one line per
# column (y axis requested as logarithmic via `logy=True`), so extreme values
# show up as sharp bends at either end of the curve.
(
    df.select_dtypes('number')
    .quantile(np.linspace(0, 1, 1001))
    .iplot(
        logy=True,
        layout_update={'hovermode': 'x'},
        xTitle='percentile',
        yTitle='value',
    )
)
# Replacing outliers with NaNs
def drop_outliers(data: "pd.DataFrame | pd.Series",
                  min_quantile=0, max_quantile=1,
                  min_value=-np.inf, max_value=np.inf):
    """Replace values outside the allowed range with NaN.

    A value is kept only if it lies inside BOTH ranges:

    * the quantile range ``[quantile(min_quantile), quantile(max_quantile)]``,
      computed per column for a DataFrame, and
    * the absolute range ``[min_value, max_value]``.

    All bounds are inclusive, so the default arguments leave the data
    unchanged (the minimum and maximum themselves are not outliers).

    Args:
        data: Numeric ``pd.Series`` or ``pd.DataFrame`` to clean.
        min_quantile: Lower quantile level in ``[0, 1]``.
        max_quantile: Upper quantile level in ``[0, 1]``.
        min_value: Smallest absolute value to keep.
        max_value: Largest absolute value to keep.

    Returns:
        An object of the same type, shape and index as ``data`` in which
        every out-of-range value has been replaced by NaN. Values that were
        already NaN stay NaN.
    """
    lower_bound = data.quantile(min_quantile)
    upper_bound = data.quantile(max_quantile)

    in_range = (
        (data >= lower_bound)
        & (data <= upper_bound)
        & (data >= min_value)
        & (data <= max_value)
    )

    # `where` keeps the original shape and masks rejected entries with NaN for
    # both Series and DataFrame; plain boolean indexing would silently drop
    # rows when `data` is a Series.
    return data.where(in_range)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment