Last active
May 5, 2023 11:01
-
-
Save ksv-muralidhar/f5e3384c1c6f7a26dcb50b5965f6f367 to your computer and use it in GitHub Desktop.
Custom Transformer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
from sklearn.base import BaseEstimator,TransformerMixin | |
from sklearn.compose import ColumnTransformer | |
from sklearn.preprocessing import FunctionTransformer | |
from sklearn.datasets import load_iris | |
from sklearn.model_selection import GridSearchCV | |
from sklearn.pipeline import Pipeline | |
from sklearn.impute import SimpleImputer | |
from sklearn.linear_model import LogisticRegression | |
# Data import: wrap the iris feature matrix in a DataFrame with explicit
# column names, then preview the first rows.
data = pd.DataFrame(
    load_iris()['data'],
    columns=[
        'SepalLengthCm',
        'SepalWidthCm',
        'PetalLengthCm',
        'PetalWidthCm',
    ],
)
data.head()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def outlier_removal(X, factor):
    """Replace IQR outliers in every column of ``X`` with NaN.

    For each column (addressed by position) the first and third quartiles
    are computed from the data passed in, and every value below
    ``q1 - factor * iqr`` or above ``q3 + factor * iqr`` is replaced by NaN.

    Parameters
    ----------
    X : array-like
        Two-dimensional numeric data; anything ``pd.DataFrame`` accepts.
    factor : float
        Multiplier applied to the inter-quartile range to place the fences.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and column labels as
        ``pd.DataFrame(X)``. The input is never modified.
    """
    frame = pd.DataFrame(X)
    cleaned = []
    for pos in range(frame.shape[1]):
        column = frame.iloc[:, pos]
        q1 = column.quantile(0.25)
        q3 = column.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (factor * iqr)
        upper_bound = q3 + (factor * iqr)
        is_outlier = (column < lower_bound) | (column > upper_bound)
        # Series.mask builds a new Series and upcasts an integer column only
        # when a NaN is actually needed; writing NaN in place into an integer
        # column relies on implicit upcasting during assignment.
        cleaned.append(column.mask(is_outlier))
    if not cleaned:
        # No columns to process: hand back an independent copy.
        return frame.copy()
    result = pd.concat(cleaned, axis=1)
    # Restore the original labels exactly (covers duplicate or None names).
    result.columns = frame.columns
    return result
# Wrap outlier_removal in a FunctionTransformer, fixing factor at 1.5.
outlier_remover = FunctionTransformer(
    outlier_removal,
    kw_args={'factor': 1.5},
)

# Small demo frame used to try the transformer out.
test = pd.DataFrame(
    {
        'col1': [100, 200, 300, 999],
        'col2': [0, 0, 1, 2],
        'col3': [-10, 0, 1, 2],
    }
)
test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Features and target for the grid search.
X = data.copy()
y = load_iris()['target'].copy()

# The grid below tunes `outlier_removal__outlier_remover__kw_args`, so the
# pipeline's first step must be a ColumnTransformer whose 'outlier_remover'
# entry is a FunctionTransformer. It is built here so the cell does not depend
# on a `ct` defined elsewhere.
ct = ColumnTransformer(
    transformers=[
        [
            'outlier_remover',
            FunctionTransformer(outlier_removal, kw_args={'factor': 1.5}),
            list(range(data.shape[1])),
        ],
    ],
    remainder='passthrough',
)

# Pipeline: outlier removal -> imputation of the resulting NaNs -> classifier.
pipeline = Pipeline(
    steps=[
        ['outlier_removal', ct],
        ['imputer', SimpleImputer()],
        ['regressor', LogisticRegression(max_iter=1000)],
    ]
)

# A FunctionTransformer exposes the function's keyword arguments as a single
# `kw_args` dict, so each candidate is a complete dict rather than a bare value.
param_grid = {
    'outlier_removal__outlier_remover__kw_args': [
        {'factor': 0},
        {'factor': 1},
        {'factor': 2},
        {'factor': 3},
        {'factor': 4},
    ],
    'imputer__strategy': ['mean', 'median', 'most_frequent'],
    'regressor__C': [0.01, 0.1, 1, 10, 100],
}
gs = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='accuracy', cv=3)
gs.fit(X, y)
gs.best_params_
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class OutlierRemover(BaseEstimator, TransformerMixin):
    """Replace per-column IQR outliers with NaN.

    ``fit`` records, for every column, the fences
    ``q1 - factor * iqr`` and ``q3 + factor * iqr``. ``transform`` replaces
    values outside those fences with NaN, column by column and by position.

    Parameters
    ----------
    factor : float, default=1.5
        Multiplier applied to the inter-quartile range to place the fences.

    Attributes
    ----------
    lower_bound : list
        Lower fence of each column, in column order; set by ``fit``.
    upper_bound : list
        Upper fence of each column, in column order; set by ``fit``.
    """

    def __init__(self, factor=1.5):
        self.factor = factor

    def outlier_detector(self, X, y=None):
        """Compute the fences of one column and append them to the bounds.

        ``X`` is a single column (anything ``pd.Series`` accepts). ``y`` is
        ignored. ``self.lower_bound`` and ``self.upper_bound`` must already
        exist; ``fit`` creates them before calling this method.
        """
        column = pd.Series(X)
        q1 = column.quantile(0.25)
        q3 = column.quantile(0.75)
        iqr = q3 - q1
        self.lower_bound.append(q1 - (self.factor * iqr))
        self.upper_bound.append(q3 + (self.factor * iqr))

    def fit(self, X, y=None):
        """Learn the lower and upper fences of every column of ``X``.

        ``X`` may be a DataFrame or any two-dimensional array-like. ``y`` is
        ignored. Returns ``self``.
        """
        # Accept array-likes as well as DataFrames, as transform() does.
        frame = pd.DataFrame(X)
        self.lower_bound = []
        self.upper_bound = []
        # outlier_detector appends to the lists above, so it must run exactly
        # once per column and in column order; an explicit loop guarantees it.
        for pos in range(frame.shape[1]):
            self.outlier_detector(frame.iloc[:, pos])
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with values outside the fitted fences as NaN.

        Columns are matched to the fitted bounds by position. ``y`` is
        ignored. The input is never modified.
        """
        frame = pd.DataFrame(X)
        cleaned = []
        for pos in range(frame.shape[1]):
            column = frame.iloc[:, pos]
            is_outlier = (column < self.lower_bound[pos]) | (column > self.upper_bound[pos])
            # Series.mask builds a new Series and upcasts an integer column
            # only when a NaN is needed, instead of writing NaN in place.
            cleaned.append(column.mask(is_outlier))
        if not cleaned:
            # No columns to process: hand back an independent copy.
            return frame.copy()
        result = pd.concat(cleaned, axis=1)
        # Restore the original labels exactly (covers duplicate or None names).
        result.columns = frame.columns
        return result
# Instantiate the custom transformer with its default IQR multiplier.
outlier_remover = OutlierRemover(factor=1.5)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Small demo frame used to try the transformer out.
test = pd.DataFrame(
    {
        'col1': [100, 200, 300, 999],
        'col2': [0, 0, 1, 2],
        'col3': [-10, 0, 1, 2],
    }
)
test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Same transformer with a much wider fence (factor=100), fitted and applied
# to the demo frame in one go.
outlier_remover_100 = OutlierRemover(factor=100)
outlier_remover_100.fit_transform(X=test)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# One box plot per column of the raw data.
data.plot(
    kind="box",
    subplots=True,
    figsize=(15, 5),
    title="Data with Outliers",
);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
outlier_remover = OutlierRemover()

# ColumnTransformer applying the outlier remover to every column, selected by
# position; any column not listed would be passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        ['outlier_remover', OutlierRemover(), list(range(data.shape[1]))],
    ],
    remainder='passthrough',
)

# Iris data after outlier removal, with the original column names restored.
data_without_outliers = pd.DataFrame(
    ct.fit_transform(data),
    columns=data.columns,
)

# Box plot of the iris data after outlier removal.
data_without_outliers.plot(
    kind="box",
    subplots=True,
    figsize=(15, 5),
    title="Data without Outliers",
);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Number of NaNs per column after outlier removal. Author's note: 4 outliers
# are removed from SepalWidthCm; the other columns stay the same because they
# have no outliers.
data_without_outliers.isna().sum()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Original SepalWidthCm values on the rows where outlier removal left at
# least one NaN, i.e. the outliers removed from sepal width (cm).
data.loc[
    data_without_outliers.isnull().any(axis=1),
    'SepalWidthCm',
].tolist()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Features and target for the grid search.
X = data.copy()
y = load_iris()['target'].copy()

# Pipeline with outlier remover, imputer and regressor.
pipeline = Pipeline(
    steps=[
        ['outlier_removal', ct],
        ['imputer', SimpleImputer()],
        ['regressor', LogisticRegression(max_iter=1000)],
    ]
)

# The custom transformer exposes `factor` as a regular parameter, so it can
# be tuned directly alongside the imputer strategy and the regularisation C.
param_grid = {
    'outlier_removal__outlier_remover__factor': [0, 1, 2, 3, 4],
    'imputer__strategy': ['mean', 'median', 'most_frequent'],
    'regressor__C': [0.01, 0.1, 1, 10, 100],
}
gs = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
)
gs.fit(X, y)
gs.best_params_
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment