Last active
May 12, 2024 07:35
-
-
Save peeush-agarwal/02bf16664f8ebbaa78a538061ae2d198 to your computer and use it in GitHub Desktop.
Handle outliers using IQR and eliminate rows
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np


def drop_iqr_outliers(frame, whisker=1.5):
    """Remove rows that hold an IQR outlier in any numeric column.

    Columns are processed in order and rows are dropped after each one, so
    the quartiles of a later column are computed only from rows that
    survived the earlier columns.

    Non-numeric columns are skipped. Quartiles are computed from the
    non-missing values of a column, and a missing value is never treated as
    an outlier, so rows with NaN in a column are kept. A column with no
    non-missing values is skipped and gets no bounds.

    Args:
        frame: pandas DataFrame to filter. It is copied, not modified.
        whisker: Multiple of the inter-quartile range allowed beyond each
            quartile before a value counts as an outlier.

    Returns:
        A ``(filtered, lower_bounds, upper_bounds)`` tuple: the filtered
        copy with a fresh 0..n-1 index, and two dicts mapping each processed
        column name to its lower / upper bound.
    """
    lower_bounds = {}
    upper_bounds = {}
    kept = frame.copy()

    for col in kept.select_dtypes(include="number").columns:
        # Work on a float array so missing values are plain NaN whatever the
        # column's dtype is.
        values = kept[col].to_numpy(dtype=float, na_value=np.nan)
        present = values[~np.isnan(values)]
        if present.size == 0:
            # Nothing to derive quartiles from (empty or all-missing column).
            continue

        # Take inter quartile boundary range
        q25, q75 = np.percentile(present, [25, 75])
        intr_qr = q75 - q25

        # Set upper/lower bounds
        upper = q75 + whisker * intr_qr
        lower = q25 - whisker * intr_qr

        # Store bounds
        upper_bounds[col] = upper
        lower_bounds[col] = lower

        # Eliminate any row that doesn't fit within the required bounds.
        # Comparisons against NaN are False, so missing values are retained.
        outside = (values < lower) | (values > upper)
        kept = kept[~outside].reset_index(drop=True)

    return kept, lower_bounds, upper_bounds


# data as pandas DataFrame
train = ...

# Variables to store out bounds by column value
upper_bounds = {}
lower_bounds = {}

# Only run once the placeholder above has been replaced with real data.
if train is not Ellipsis:
    # Best practice: the original frame is left untouched
    train_copy, lower_bounds, upper_bounds = drop_iqr_outliers(train)
    print(train_copy.shape)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment