-
-
Save sourangshupal/9ccaee9b51d197d3d946241ee13c351d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Demonstration of row filtering with pandas on a small random dataset.
# pandas is highly recommended for this kind of data-processing problem.
import pandas as pd
import numpy as np

# Number of rows generated for every feature column.
N_ROWS = 5000

# Three columns of uniform random values in [0, 1).
# NOTE: the generator is not seeded, so every run produces different data.
dataset = {
    'feature1': np.random.rand(N_ROWS),
    'feature2': np.random.rand(N_ROWS),
    'feature3': np.random.rand(N_ROWS),
}

# DataFrame objects are more convenient than plain Python dicts for
# data-preprocessing operations.
dataframe = pd.DataFrame(dataset)

# --- Simple threshold filtering ---------------------------------------------
# Keep a row if at least one of its values is greater than 0.99;
# otherwise drop the row.
print(dataframe[(dataframe > 0.99).any(axis=1)])

# --- Median-distance filtering ----------------------------------------------
# Flag rows whose <feature1> value lies too far (more than 0.3) from the
# median of the <feature1> column.
filtering_rule_1 = (dataframe.feature1.median() - dataframe.feature1).abs() > 0.3
# `~` is the element-wise NOT: keep only the rows that were NOT flagged.
print(dataframe[~filtering_rule_1])

# --- Quantile-based filtering -----------------------------------------------
lower_bound = .25
upper_bound = .75
# Auxiliary dataframe: one row per requested quantile (indexed by the quantile
# value), one column per feature.
quant_df = dataframe.quantile([lower_bound, upper_bound])

# Flag outliers, i.e. values lying outside the [lower_bound, upper_bound]
# quantile interval of their own column. The quantile rows are Series indexed
# by column name, so comparing along axis="columns" lines up each feature with
# its own bounds.
filtering_rule_2 = (
    dataframe.lt(quant_df.loc[lower_bound], axis="columns")
    | dataframe.gt(quant_df.loc[upper_bound], axis="columns")
)

# A row is kept only if none of its values is an outlier. The mask is computed
# once and reused for both printing and assignment.
rows_without_outliers = ~filtering_rule_2.any(axis=1)

# Print the filtered dataset ...
print(dataframe[rows_without_outliers])
# ... or assign it to a new dataframe.
filtered_dataframe = dataframe[rows_without_outliers]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment