Last active
February 18, 2022 09:59
-
-
Save alberto-santini/d2c7572b499dea124a0160406fcf2e40 to your computer and use it in GitHub Desktop.
Error when using ColumnTransformer twice
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.datasets import fetch_openml | |
from sklearn.pipeline import Pipeline, make_pipeline | |
from sklearn.compose import ColumnTransformer, make_column_selector | |
from sklearn.impute import SimpleImputer | |
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer | |
from sklearn.linear_model import LinearRegression | |
from sklearn.tree import DecisionTreeRegressor | |
from sklearn.model_selection import train_test_split | |
from mlxtend.regressor import StackingRegressor | |
import numpy as np | |
import pandas as pd | |
# Load the Ames house prices dataset from OpenML as a pandas DataFrame.
d = fetch_openml('house_prices', as_frame=True).frame

# Light preprocessing: every object-typed column, plus 'MSSubClass', is
# re-typed as a pandas Categorical; the 'Id' column is then discarded.
categorical_columns = [
    name for name in d.columns
    if name == 'MSSubClass' or d[name].dtype == object
]
for name in categorical_columns:
    d[name] = pd.Categorical(d[name])

d = d.drop(columns='Id')
# Inner linear regressor:
#   1. One-hot encode the categorical features, then scale the encoded columns.
#   2. Standardise the numerical features.
#   3. Fit a linear regression on the combined feature matrix.
# Columns are routed by dtype, so the input must be a pandas DataFrame whose
# categorical columns have the 'category' dtype.
sr_linear = Pipeline(steps=[
    ('preprocessing', ColumnTransformer(transformers=[
        ('categorical',
         # OneHotEncoder produces a sparse matrix by default and StandardScaler
         # cannot centre sparse input (it raises), so only scale to unit
         # variance here instead of also subtracting the mean.
         make_pipeline(OneHotEncoder(), StandardScaler(with_mean=False)),
         make_column_selector(dtype_include='category')),
        ('numerical',
         StandardScaler(),
         make_column_selector(dtype_include=np.number)),
    ])),
    ('model', LinearRegression()),
])
# Just a decision regression tree, used without any preprocessing of its own:
# there is no need to standardise numerical features for a tree, so no scaler
# is attached.
# NOTE(review): the original comment also skipped 1-hot encoding on the
# assumption that the tree handles categorical features natively. The
# scikit-learn documentation states that its tree implementation does not
# support categorical variables, so string-valued columns presumably need an
# encoder before reaching this estimator -- confirm before relying on it.
sr_tree = DecisionTreeRegressor()
# Imputation stage, split by column dtype:
#   - 'category' columns: missing entries are replaced by the constant 'None'.
#   - numeric columns: missing entries are replaced by the column median.
# The categorical group is listed first, so its columns come first in the output.
ct_imputation = ColumnTransformer([
    (
        'categorical',
        SimpleImputer(strategy='constant', fill_value='None'),
        make_column_selector(dtype_include='category'),
    ),
    (
        'numerical',
        SimpleImputer(strategy='median'),
        make_column_selector(dtype_include=np.number),
    ),
])
def _imputed_to_frame(values):
    """Wrap the imputed values in a DataFrame labelled with ct_imputation's output names."""
    return pd.DataFrame(values, columns=ct_imputation.get_feature_names_out())


# Pipeline for a stacked regressor:
#   1. Perform imputation via the ColumnTransformer.
#   2. Because ColumnTransformer returns a numpy array, but the preprocessing
#      steps of the inner regressors need a pandas DataFrame, convert the numpy
#      array back to a DataFrame. get_feature_names_out() is usable here because
#      the nightly version of scikit-learn is installed: the stable version
#      1.0.2 does not support it. The nightly build can be installed with a
#      single pip command:
#      https://scikit-learn.org/stable/developers/advanced_installation.html
#   3. Apply the StackingRegressor.
# NOTE(review): the traceback recorded further down in this file shows the inner
# ColumnTransformer raising "make_column_selector can only be applied to pandas
# dataframes" from inside mlxtend's StackingRegressor.fit, so the DataFrame
# built in step 2 apparently does not reach the inner regressors as a
# DataFrame -- investigate before relying on this pipeline.
stacked_regressor = Pipeline([
    ('imputation', ct_imputation),
    ('back_to_pandas', FunctionTransformer(func=_imputed_to_frame)),
    ('model', StackingRegressor(
        regressors=[sr_linear, sr_tree],
        meta_regressor=DecisionTreeRegressor(),
        use_features_in_secondary=True,
    )),
])
# Prepare the data for training: 'SalePrice' is the target, every other
# column is a feature.
label = 'SalePrice'
features = [col for col in d.columns if col != label]
X, y = d[features], d[label]

# Hold out 20% of the rows for testing. The split must run before fit(),
# because it is what defines X_train and y_train.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

# Train the stacked regressor on the training portion only.
stacked_regressor.fit(X_train, y_train)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--------------------------------------------------------------------------- | |
ValueError Traceback (most recent call last) | |
Input In [37], in <module> | |
----> 1 stacked_regressor.fit(X_train, y_train) | |
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/sklearn/pipeline.py:394, in Pipeline.fit(self, X, y, **fit_params) | |
392 if self._final_estimator != "passthrough": | |
393 fit_params_last_step = fit_params_steps[self.steps[-1][0]] | |
--> 394 self._final_estimator.fit(Xt, y, **fit_params_last_step) | |
396 return self | |
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/mlxtend/regressor/stacking_regression.py:168, in StackingRegressor.fit(self, X, y, sample_weight) | |
165 print(_name_estimators((regr,))[0][1]) | |
167 if sample_weight is None: | |
--> 168 regr.fit(X, y) | |
169 else: | |
170 regr.fit(X, y, sample_weight=sample_weight) | |
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/sklearn/pipeline.py:390, in Pipeline.fit(self, X, y, **fit_params) | |
364 """Fit the model. | |
365 | |
366 Fit all the transformers one after the other and transform the | |
(...) | |
387 Pipeline with fitted steps. | |
388 """ | |
389 fit_params_steps = self._check_fit_params(**fit_params) | |
--> 390 Xt = self._fit(X, y, **fit_params_steps) | |
391 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): | |
392 if self._final_estimator != "passthrough": | |
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/sklearn/pipeline.py:348, in Pipeline._fit(self, X, y, **fit_params_steps) | |
346 cloned_transformer = clone(transformer) | |
347 # Fit or load from cache the current transformer | |
--> 348 X, fitted_transformer = fit_transform_one_cached( | |
349 cloned_transformer, | |
350 X, | |
351 y, | |
352 None, | |
353 message_clsname="Pipeline", | |
354 message=self._log_message(step_idx), | |
355 **fit_params_steps[name], | |
356 ) | |
357 # Replace the transformer of the step with the fitted | |
358 # transformer. This is necessary when loading the transformer | |
359 # from the cache. | |
360 self.steps[step_idx] = (name, fitted_transformer) | |
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/joblib/memory.py:349, in NotMemorizedFunc.__call__(self, *args, **kwargs) | |
348 def __call__(self, *args, **kwargs): | |
--> 349 return self.func(*args, **kwargs) | |
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/sklearn/pipeline.py:893, in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params) | |
891 with _print_elapsed_time(message_clsname, message): | |
892 if hasattr(transformer, "fit_transform"): | |
--> 893 res = transformer.fit_transform(X, y, **fit_params) | |
894 else: | |
895 res = transformer.fit(X, y, **fit_params).transform(X) | |
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py:672, in ColumnTransformer.fit_transform(self, X, y) | |
670 self._check_n_features(X, reset=True) | |
671 self._validate_transformers() | |
--> 672 self._validate_column_callables(X) | |
673 self._validate_remainder(X) | |
675 result = self._fit_transform(X, y, _fit_transform_one) | |
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py:350, in ColumnTransformer._validate_column_callables(self, X) | |
348 for name, _, columns in self.transformers: | |
349 if callable(columns): | |
--> 350 columns = columns(X) | |
351 all_columns.append(columns) | |
352 transformer_to_input_indices[name] = _get_column_indices(X, columns) | |
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py:1036, in make_column_selector.__call__(self, df) | |
1027 """Callable for column selection to be used by a | |
1028 :class:`ColumnTransformer`. | |
1029 | |
(...) | |
1033 DataFrame to select columns from. | |
1034 """ | |
1035 if not hasattr(df, "iloc"): | |
-> 1036 raise ValueError( | |
1037 "make_column_selector can only be applied to pandas dataframes" | |
1038 ) | |
1039 df_row = df.iloc[:1] | |
1040 if self.dtype_include is not None or self.dtype_exclude is not None: | |
ValueError: make_column_selector can only be applied to pandas dataframes |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment