sourangshupal · February 4, 2019 12:04
diff --git a/drop_outliers.py b/drop_outliers.py

 # It is highly recommended to use Pandas for such data processing problems
 import pandas as pd
 import numpy as np

 # Synthetic example data: three columns, each filled with 5000 random samples
 # drawn by np.random.rand (uniform over [0, 1)).
 dataset = {
     column: np.random.rand(5000)
     for column in ('feature1', 'feature2', 'feature3')
 }

 # Wrap the dict in a DataFrame: the filters below are far easier to express
 # on a DataFrame than on a plain Python dict.
 dataframe = pd.DataFrame(dataset)


 # --- Filter 1: simple threshold ---------------------------------------------
 # Keep a row when at least one of its values is greater than 0.99;
 # every other row is dropped.
 print(dataframe.loc[dataframe.gt(0.99).any(axis=1)])


 # --- Filter 2: distance from the median -------------------------------------
 # Flag the rows whose <feature1> value lies more than 0.3 away from the median
 # of the <feature1> column. True marks a row to be removed.
 filtering_rule_1 = (
     dataframe['feature1'].sub(dataframe['feature1'].median()).abs().gt(0.3)
 )

 # "~" negates the mask, so only the rows that were NOT flagged are shown.
 print(dataframe.loc[~filtering_rule_1])


 # --- Filter 3: quantiles ----------------------------------------------------
 lower_bound = .25
 upper_bound = .75
 # Auxiliary frame: one row per requested quantile, one column per feature.
 quant_df = dataframe.quantile([lower_bound, upper_bound])

 # Element-wise outlier mask: a value is an outlier when it is below its own
 # column's lower quantile or above its own column's upper quantile.
 filtering_rule_2 = (
     dataframe.lt(quant_df.loc[lower_bound], axis='columns')
     | dataframe.gt(quant_df.loc[upper_bound], axis='columns')
 )


 # A row is dropped as soon as it contains a single outlier value.
 # Store the surviving rows in a new DataFrame ...
 filtered_dataframe = dataframe.loc[~filtering_rule_2.any(axis=1)]

 # ... and print that filtered dataset.
 print(filtered_dataframe)

	# It is highly recommended to use Pandas for such data processing problems
	import pandas as pd
	import numpy as np

	# Example data: three columns of 5000 random samples each, generated with
	# np.random.rand (uniform over [0, 1)).
	dataset = {
	    'feature1': np.random.rand(5000),
	    'feature2': np.random.rand(5000),
	    'feature3': np.random.rand(5000),
	}

	# Pandas DataFrame objects are more convenient than Python dicts for data
	# preprocessing operations.
	dataframe = pd.DataFrame(dataset)


	# Simple filtering: a row is kept if at least one of its values is greater
	# than 0.99; otherwise the row is dropped.
	print(dataframe[(dataframe > 0.99).any(axis=1)])


	# Median-based filtering: mark every row whose <feature1> value is more than
	# 0.3 away from the median computed over the <feature1> column.
	filtering_rule_1 = (dataframe.feature1.median() - dataframe.feature1).abs() > 0.3

	# "~" is the element-wise NOT, so this prints the rows that were not marked.
	print(dataframe[~filtering_rule_1])


	# Another filtering approach: using quantiles.
	lower_bound = .25
	upper_bound = .75
	# Auxiliary DataFrame holding the two quantiles computed for each column.
	quant_df = dataframe.quantile([lower_bound, upper_bound])

	# Select outliers, i.e. values lying outside the interval spanned by their
	# column's lower and upper quantiles. The two per-column boolean masks are
	# combined with the bitwise OR operator "|".
	filtering_rule_2 = dataframe.apply(
	    lambda x: (x < quant_df.loc[lower_bound, x.name]) | (x > quant_df.loc[upper_bound, x.name]),
	    axis=0,
	)


	# Drop every row that contains at least one outlier value and keep the
	# result as a new DataFrame ...
	filtered_dataframe = dataframe[~filtering_rule_2.any(axis=1)]

	# ... then print the filtered dataset.
	print(filtered_dataframe)