import pandas as pd
from sklearn.datasets import fetch_openml
# Download OpenML dataset 1461 (requires network access or a local cache) and
# keep the combined frame; the 'Class' column is popped from it further below.
x = fetch_openml(data_id=1461, as_frame=True, parser='pandas')
dataset = x['frame']
print(f'dataset Shape {dataset.shape}')
# Notebook-style preview: the returned frame is only displayed in an
# interactive session and is discarded when run as a plain script.
dataset.head()
from sklearn.model_selection import train_test_split
# pop() removes the 'Class' column from `dataset` in place, so from here on
# `dataset` holds only the feature columns.
target = dataset.pop('Class')
# Hold out 33% of the rows for testing; stratifying on the target keeps the
# class proportions the same in both splits, and the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(dataset, target, test_size=0.33, random_state=42,stratify=target )
print(f'X_train shape {X_train.shape} y_train shape {y_train.shape}')
print(f'X_test shape {X_test.shape} y_test shape {y_test.shape}')
# Output:
# X_train shape (30291, 16) y_train shape (30291,)
# X_test shape (14920, 16) y_test shape (14920,)
# Building a Custom Imputer and Transformer
import numpy as np
import warnings
from sklearn.preprocessing import FunctionTransformer
from sklearn.utils import check_array, check_X_y, estimator_checks
from sklearn.utils.validation import check_is_fitted
from sklearn.base import TransformerMixin, BaseEstimator
def from_bool_to_number(x: np.ndarray) -> np.ndarray:
    """Map truthy entries of *x* to ``1.0`` and falsy entries to ``0.0``.

    The result is a new float array with the same shape as *x*.
    """
    value_if_true, value_if_false = 1.0, 0.0
    return np.where(x, value_if_true, value_if_false)
def from_number_to_bool(x: np.ndarray) -> np.ndarray:
    """Threshold *x* at 0.5: entries strictly greater become ``True``.

    Entries equal to or below 0.5 become ``False``. The result is a new
    boolean array with the same shape as *x*.
    """
    above_threshold = x > 0.5
    return np.where(above_threshold, True, False)
# Stateless transformer mapping booleans to floats, with the thresholding
# function as its inverse. check_inverse=False skips the fit-time check that
# the two functions are strict inverses of each other.
BooleanTransformer = FunctionTransformer(
    func=from_bool_to_number,
    inverse_func=from_number_to_bool,
    check_inverse=False,
)
class ColumnsGuard(TransformerMixin, BaseEstimator):
    """
    Verify column names at predict time match the ones used when fitting.

    The guard only inspects column *names*. It deliberately does not run
    numeric validation on the values: it is meant to sit in front of
    imputers and encoders, so its input may legitimately contain strings,
    categoricals and missing values.

    Attributes
    ----------
    expected_ : list
        Column names seen during ``fit``, in their original order.
    """

    def fit(self, X, y=None):
        """Record the column names of ``X``.

        Parameters
        ----------
        X : object exposing ``.columns`` (e.g. a pandas DataFrame)
            Training data.
        y : ignored
            Accepted for pipeline compatibility.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` has no ``columns`` attribute.
        """
        self.expected_ = self._column_names(X)
        return self

    def transform(self, X):
        """Return ``X`` restricted to, and ordered like, the fitted columns.

        Raises
        ------
        ValueError
            If ``X`` has no ``columns`` attribute or lacks a fitted column.

        Warns
        -----
        UserWarning
            If ``X`` carries columns that were not present during ``fit``;
            those columns are dropped from the result.
        """
        check_is_fitted(self, 'expected_')
        columns_got = self._column_names(X)
        missing = set(self.expected_) - set(columns_got)
        extra = set(columns_got) - set(self.expected_)
        if missing:
            raise ValueError(f'Missing columns: {missing}')
        if extra:
            warnings.warn(f'Got extra columns: {extra}, ignoring')
            return X[self.expected_]
        if columns_got != self.expected_:
            # Same names in a different order: restore the fitted order so
            # downstream steps see the layout they were trained on.
            return X[self.expected_]
        return X

    @staticmethod
    def _column_names(X):
        """Return the column names of ``X`` as a list, or raise ValueError."""
        columns = getattr(X, 'columns', None)
        if columns is None:
            raise ValueError(
                'ColumnsGuard needs input with named columns '
                f'(e.g. a pandas DataFrame), got {type(X).__name__}'
            )
        return list(columns)
class BooleanImputer(TransformerMixin, BaseEstimator):
    """Fill missing entries with the most frequent value of each column.

    Attributes
    ----------
    fill_val_ : ndarray of shape (n_features,)
        Most frequent non-missing value of each column seen during ``fit``.
        On a tie the smallest value wins. A column with no observed value
        gets NaN, which leaves that column unchanged in ``transform``.
    """

    def fit(self, X, y=None):
        """Learn the per-column most frequent non-missing value.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Boolean or numeric data; missing entries are NaN.
        y : ignored
            Accepted for pipeline compatibility.

        Returns
        -------
        self
        """
        # validate and convert if possible:
        # NOTE(review): newer scikit-learn releases rename ``force_all_finite``
        # to ``ensure_all_finite``; confirm against the installed version.
        X = check_array(X, force_all_finite=False)
        fill_values = []
        for column in X.T:
            # Count observed values only, so NaN can never become the mode.
            observed = column[~np.isnan(column)]
            if observed.size == 0:
                fill_values.append(np.nan)
                continue
            values, counts = np.unique(observed, return_counts=True)
            # argmax indexes the unique values, not the rows of X.
            fill_values.append(values[np.argmax(counts)])
        self.fill_val_ = np.asarray(fill_values)
        return self

    def transform(self, X):
        """Replace NaN entries of ``X`` with the fitted per-column value.

        Returns
        -------
        ndarray of shape (n_samples, n_features)

        Raises
        ------
        ValueError
            If ``X`` has a different number of columns than the fitted data.
        """
        check_is_fitted(self, 'fill_val_')
        X = check_array(X, force_all_finite=False)
        if X.shape[1] != self.fill_val_.shape[0]:
            raise ValueError(
                f'X has {X.shape[1]} features, but BooleanImputer was fitted '
                f'with {self.fill_val_.shape[0]}'
            )
        # NaN never compares equal to itself, so ``X == np.nan`` is always
        # False; np.isnan is the reliable missing-value test.
        return np.where(np.isnan(X), self.fill_val_, X)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import make_column_selector as column_selector
from sklearn.neighbors import KNeighborsClassifier
# One preprocessing branch per column dtype; each branch only sees the
# columns picked by its selector.

# Numeric columns: mean imputation only (no scaling step is applied).
numerical_branch = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

# Object / categorical columns: fill with the mode, then one-hot encode.
# Categories unseen during fit are encoded as all zeros instead of raising.
categorical_branch = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Boolean columns: custom imputation, then conversion to 0.0 / 1.0.
boolean_branch = Pipeline(steps=[
    ('imputer', BooleanImputer()),
    (
        'onehot',
        FunctionTransformer(
            from_bool_to_number,
            from_number_to_bool,
            check_inverse=False,
        ),
    ),
])

preprocessor = ColumnTransformer(transformers=[
    ('numerical', numerical_branch, column_selector(dtype_include=[np.number])),
    ('categorical', categorical_branch, column_selector(dtype_include=[object, "category"])),
    ('boolean', boolean_branch, column_selector(dtype_include=bool)),
])

# Full model: column-name guard -> preprocessing -> k-nearest-neighbours.
pipeline = Pipeline(steps=[
    ("columnsguard", ColumnsGuard()),
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier()),
])
# Bare expression: shows the estimator in an interactive notebook and has no
# effect when run as a plain script.
pipeline
# Fit every step on the training split, then predict labels for the
# held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
import joblib
# Persist the fitted pipeline to 'pipeline.pkl' (relative to the current
# working directory).
# NOTE(review): the pipeline contains classes and functions defined in this
# file, so loading the pickle presumably requires them to be importable under
# the same module path — confirm before deploying.
joblib.dump(pipeline, 'pipeline.pkl')