joseph-allen · December 29, 2017 14:22 · zahs123 · Dec 8, 2020 · joseph-allen · Dec 8, 2020
diff --git a/outlier_detection b/outlier_detection
 import numpy as np
 from collections import Counter


 def detect_outliers(df, n, features):
     """Return index labels of rows that are Tukey outliers in more than n features.

     For each column named in ``features`` the quartiles Q1 and Q3 are
     computed, and a value counts as an outlier when it lies below
     ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` (``IQR = Q3 - Q1``).

     Args:
         df: DataFrame-like object; ``df[col]`` must yield a column that
             supports element-wise comparison, boolean-mask selection and
             ``.index``.
         n: Threshold. A row is reported only when it is an outlier in
             strictly more than ``n`` of the given columns.
         features: Iterable of column names to inspect.

     Returns:
         A list of index labels, ordered by first appearance while scanning
         ``features`` in the order given.
     """
     outlier_counts = Counter()

     for col in features:
         column = df[col]

         # nanpercentile skips missing values. Plain percentile returns NaN as
         # soon as one value is missing, which makes both fence comparisons
         # False and silently hides every outlier in the column.
         q1, q3 = np.nanpercentile(column, [25, 75])
         fence = 1.5 * (q3 - q1)

         # Comparisons against NaN are False, so missing values are never
         # reported as outliers themselves.
         is_outlier = (column < q1 - fence) | (column > q3 + fence)

         # Tally one hit per flagged row for this column.
         outlier_counts.update(column[is_outlier].index)

     # Keep only the rows flagged in strictly more than n columns.
     return [label for label, hits in outlier_counts.items() if hits > n]

 # Columns to screen for Tukey outliers.
 lof = ['example_feature_1', 'example_feature_2']
 # Collect the index labels of rows flagged in more than 2 of those columns.
 # `dataset` is expected to be defined before this point (not shown here).
 Outliers_to_drop = detect_outliers(df=dataset, n=2, features=lof)
	import numpy as np
	from collections import Counter


	def detect_outliers(df, n, features):
	    """Return index labels of rows that are Tukey outliers in more than n features.

	    For each column named in ``features`` the quartiles Q1 and Q3 are
	    computed, and a value counts as an outlier when it lies below
	    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` (``IQR = Q3 - Q1``).

	    Args:
	        df: DataFrame-like object; ``df[col]`` must yield a column that
	            supports element-wise comparison, boolean-mask selection and
	            ``.index``.
	        n: Threshold. A row is reported only when it is an outlier in
	            strictly more than ``n`` of the given columns.
	        features: Iterable of column names to inspect.

	    Returns:
	        A list of index labels, ordered by first appearance while scanning
	        ``features`` in the order given.
	    """
	    outlier_counts = Counter()

	    for col in features:
	        column = df[col]

	        # nanpercentile skips missing values. Plain percentile returns NaN as
	        # soon as one value is missing, which makes both fence comparisons
	        # False and silently hides every outlier in the column.
	        q1, q3 = np.nanpercentile(column, [25, 75])
	        fence = 1.5 * (q3 - q1)

	        # Comparisons against NaN are False, so missing values are never
	        # reported as outliers themselves.
	        is_outlier = (column < q1 - fence) | (column > q3 + fence)

	        # Tally one hit per flagged row for this column.
	        outlier_counts.update(column[is_outlier].index)

	    # Keep only the rows flagged in strictly more than n columns.
	    return [label for label, hits in outlier_counts.items() if hits > n]

	# Columns to screen for Tukey outliers.
	lof = ['example_feature_1', 'example_feature_2']
	# Collect the index labels of rows flagged in more than 2 of those columns.
	# `dataset` is expected to be defined before this point (not shown here).
	Outliers_to_drop = detect_outliers(df=dataset, n=2, features=lof)