Skip to content

Instantly share code, notes, and snippets.

@ksv-muralidhar
Last active May 5, 2023 11:01
Show Gist options
  • Save ksv-muralidhar/f5e3384c1c6f7a26dcb50b5965f6f367 to your computer and use it in GitHub Desktop.
Save ksv-muralidhar/f5e3384c1c6f7a26dcb50b5965f6f367 to your computer and use it in GitHub Desktop.
Custom Transformer
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
# Data import: wrap the iris feature matrix in a DataFrame with readable column names.
data = pd.DataFrame(
    load_iris()['data'],
    columns=[
        'SepalLengthCm',
        'SepalWidthCm',
        'PetalLengthCm',
        'PetalWidthCm',
    ],
)
# Preview the first rows (only rendered when run as a notebook cell).
data.head()
def outlier_removal(X, factor):
    """Replace IQR-based outliers with NaN, one column at a time.

    For every column the first and third quartiles (q1, q3) are computed and
    any value below ``q1 - factor * iqr`` or above ``q3 + factor * iqr``
    (with ``iqr = q3 - q1``) is replaced by NaN.

    Parameters
    ----------
    X : array-like or DataFrame
        Two-dimensional numeric data; it is wrapped in ``pd.DataFrame``.
    factor : float
        Multiplier applied to the inter-quartile range when building the
        lower and upper bounds.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same shape, index and columns as the wrapped
        input, where outliers are NaN. The input object is not modified.
    """
    frame = pd.DataFrame(X)

    # Collect the outlier flags positionally so duplicate column labels or a
    # non-unique index cannot cause any label-alignment surprises.
    outliers = np.zeros(frame.shape, dtype=bool)
    for position in range(frame.shape[1]):
        column = frame.iloc[:, position]
        q1 = column.quantile(0.25)
        q3 = column.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (factor * iqr)
        upper_bound = q3 + (factor * iqr)
        outliers[:, position] = (
            (column < lower_bound) | (column > upper_bound)
        ).to_numpy()

    # DataFrame.mask returns a new frame and upcasts integer columns that
    # receive NaN, instead of assigning NaN in place into an integer column
    # (which recent pandas versions deprecate).
    return frame.mask(outliers)
# Create the outlier_remover object using FunctionTransformer with factor=1.5.
outlier_remover = FunctionTransformer(outlier_removal, kw_args={'factor': 1.5})

# Small demonstration frame (displayed only when run as a notebook cell).
test = pd.DataFrame({'col1': [100, 200, 300, 999], 'col2': [0, 0, 1, 2], 'col3': [-10, 0, 1, 2]})
test

X = data.copy()
y = load_iris()['target'].copy()

# The pipeline below needs `ct`, which was previously referenced before being
# defined. The step name 'outlier_remover' must match the
# 'outlier_removal__outlier_remover__kw_args' key used in param_grid.
ct = ColumnTransformer(
    transformers=[['outlier_remover', outlier_remover, list(range(data.shape[1]))]],
    remainder='passthrough',
)

# Pipeline: outlier removal -> imputation of the resulting NaNs -> classifier.
# NOTE: the final step is named 'regressor' although LogisticRegression is a
# classifier; the name is kept because the param_grid keys depend on it.
pipeline = Pipeline(steps=[['outlier_removal', ct], ['imputer', SimpleImputer()], ['regressor', LogisticRegression(max_iter=1000)]])

# FunctionTransformer exposes the IQR factor only through its kw_args dict,
# so the grid enumerates whole kw_args dictionaries.
param_grid = {
    'outlier_removal__outlier_remover__kw_args': [{'factor': 0}, {'factor': 1}, {'factor': 2}, {'factor': 3}, {'factor': 4}],
    'imputer__strategy': ['mean', 'median', 'most_frequent'],
    'regressor__C': [0.01, 0.1, 1, 10, 100],
}
gs = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='accuracy', cv=3)
gs.fit(X, y)
gs.best_params_
class OutlierRemover(BaseEstimator, TransformerMixin):
    """Transformer that replaces IQR-based outliers with NaN, per column.

    ``fit`` learns, for every column, the bounds ``q1 - factor * iqr`` and
    ``q3 + factor * iqr`` (``iqr = q3 - q1``); ``transform`` replaces every
    value outside the learned bounds of its column with NaN.

    Parameters
    ----------
    factor : float, default=1.5
        Multiplier applied to the inter-quartile range when building bounds.

    Attributes
    ----------
    lower_bound : list
        Learned lower bound of each column, in column order (set by ``fit``).
    upper_bound : list
        Learned upper bound of each column, in column order (set by ``fit``).
    """

    def __init__(self, factor=1.5):
        self.factor = factor

    def outlier_detector(self, X, y=None):
        """Append the lower/upper bound of one column to the bound lists.

        ``X`` is a single column (anything ``pd.Series`` accepts); ``y`` is
        ignored. Requires ``self.lower_bound`` and ``self.upper_bound`` to
        exist already, which ``fit`` takes care of.
        """
        X = pd.Series(X).copy()
        q1 = X.quantile(0.25)
        q3 = X.quantile(0.75)
        iqr = q3 - q1
        self.lower_bound.append(q1 - (self.factor * iqr))
        self.upper_bound.append(q3 + (self.factor * iqr))

    def fit(self, X, y=None):
        """Learn per-column outlier bounds from ``X`` and return ``self``.

        ``X`` may be a DataFrame or any 2-D array-like; ``y`` is ignored.
        """
        # Wrap the input so ndarray input works, matching transform().
        X = pd.DataFrame(X)
        self.lower_bound = []
        self.upper_bound = []
        # Explicit positional loop rather than DataFrame.apply: the callback
        # mutates state, and older pandas versions could evaluate apply's
        # function on the first column more than once, misaligning the lists.
        for position in range(X.shape[1]):
            self.outlier_detector(X.iloc[:, position])
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` as a DataFrame with outliers set to NaN.

        Raises
        ------
        ValueError
            If ``X`` does not have the number of columns seen during ``fit``.
        """
        X = pd.DataFrame(X)
        if X.shape[1] != len(self.lower_bound):
            raise ValueError(
                "X has %d columns, but OutlierRemover was fitted on %d columns"
                % (X.shape[1], len(self.lower_bound))
            )

        outliers = np.zeros(X.shape, dtype=bool)
        for position in range(X.shape[1]):
            column = X.iloc[:, position]
            outliers[:, position] = (
                (column < self.lower_bound[position])
                | (column > self.upper_bound[position])
            ).to_numpy()

        # mask() builds a new frame and upcasts integer columns as needed,
        # avoiding in-place NaN assignment into integer data.
        return X.mask(outliers)
# Remover with the default factor.
outlier_remover = OutlierRemover()

# Small demonstration frame (displayed only when run as a notebook cell).
test = pd.DataFrame(
    {
        'col1': [100, 200, 300, 999],
        'col2': [0, 0, 1, 2],
        'col3': [-10, 0, 1, 2],
    }
)
test

# Same frame run through a remover with a much larger factor (wider bounds).
outlier_remover_100 = OutlierRemover(factor=100)
outlier_remover_100.fit_transform(test)

# Box plots of the raw iris features.
data.plot(kind="box", subplots=True, figsize=(15, 5), title="Data with Outliers");

outlier_remover = OutlierRemover()

# ColumnTransformer that applies the outlier remover to every column.
ct = ColumnTransformer(
    transformers=[['outlier_remover', OutlierRemover(), list(range(len(data.columns)))]],
    remainder='passthrough',
)

# Iris data after outlier removal (outliers are now NaN).
data_without_outliers = pd.DataFrame(ct.fit_transform(data), columns=data.columns)

# Box plots of the iris features after outlier removal.
data_without_outliers.plot(kind="box", subplots=True, figsize=(15, 5), title="Data without Outliers");

# 4 outliers are removed from SepalWidthCm; the other columns stay the same as they have no outliers.
data_without_outliers.isna().sum()

# Original sepal-width values on the rows where an outlier was removed.
list(data.loc[data_without_outliers.isna().any(axis=1), 'SepalWidthCm'])

X = data.copy()
y = load_iris()['target'].copy()

# Pipeline: outlier remover -> imputer for the resulting NaNs -> logistic regression.
pipeline = Pipeline(
    steps=[
        ['outlier_removal', ct],
        ['imputer', SimpleImputer()],
        ['regressor', LogisticRegression(max_iter=1000)],
    ]
)

# The custom transformer exposes `factor` directly, so it can be tuned by name.
param_grid = {
    'outlier_removal__outlier_remover__factor': [0, 1, 2, 3, 4],
    'imputer__strategy': ['mean', 'median', 'most_frequent'],
    'regressor__C': [0.01, 0.1, 1, 10, 100],
}
gs = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='accuracy', cv=3)
gs.fit(X, y)
gs.best_params_
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment