Skip to content

Instantly share code, notes, and snippets.

@hermidalc
Last active September 3, 2024 11:48
Show Gist options
  • Save hermidalc/e51ee5dc5321f57bba1312bdb7af5916 to your computer and use it in GitHub Desktop.
Save hermidalc/e51ee5dc5321f57bba1312bdb7af5916 to your computer and use it in GitHub Desktop.
scikit-learn compatible ColumnSelector class
# To select columns by feature name, X must be a pandas DataFrame.
# When used in a Pipeline, this selector must therefore be the first step,
# or at least have no feature selection step before it. Selecting by
# column index remains possible in either case.
import warnings
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.utils import check_X_y
from sklearn.feature_selection import SelectorMixin
from sklearn.utils.validation import check_is_fitted
class ColumnSelectorWarning(UserWarning):
    """Warning issued when a requested column name does not exist."""
class ColumnSelector(SelectorMixin, BaseEstimator):
    """Feature selector that keeps an explicitly listed set of columns.

    Parameters
    ----------
    cols : array-like (default = None)
        Columns to keep, given either as integer positions (for example
        ``[1, 4, 5]`` selects the 2nd, 5th and 6th feature columns) or as
        column names (for example ``['A', 'C', 'D']``). Names and positions
        cannot be mixed. Selecting by name requires ``X`` to expose a
        ``columns`` attribute at fit time (e.g. a pandas DataFrame).
        If None, all columns are kept.
    """

    def __init__(self, cols=None):
        self.cols = cols

    def fit(self, X, y):
        """Validate the inputs and build the boolean support mask.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training input data matrix.
        y : array-like, shape = (n_samples)
            Target values (class labels in classification, real numbers in
            regression).

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If ``cols`` mixes names and indices, holds names while ``X`` has
            no column labels, or holds an index outside
            ``0 <= col < n_features``.
        """
        # check_X_y is used for validation only: it returns a converted
        # array, so rebinding X to its result would drop the column labels
        # that name-based selection depends on.
        _, y = check_X_y(X, y, dtype=None)
        self._check_params(X, y)
        n_features = np.shape(X)[1]
        cols = [] if self.cols is None else list(self.cols)
        if self.cols is None:
            mask = np.ones(n_features, dtype=bool)
        elif cols and isinstance(cols[0], str):
            # Index.isin already returns a boolean NumPy array.
            mask = np.asarray(X.columns.isin(cols), dtype=bool)
        else:
            mask = np.zeros(n_features, dtype=bool)
            if cols:  # an empty selection leaves the mask all False
                mask[cols] = True
        self._mask = mask
        return self

    def transform(self, X):
        """Reduce X to the selected columns.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Input data matrix.

        Returns
        -------
        Xr : array of shape (n_samples, n_selected_features)
            Input data matrix with only the selected features.
        """
        check_is_fitted(self, '_mask')
        return super().transform(X)

    def inverse_transform(self, X):
        """Reverse the column selection.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_selected_features)
            Input transformed data matrix.

        Returns
        -------
        Xr : array of shape (n_samples, n_original_features)
            `X` with columns of zeros inserted where features would have
            been removed by :meth:`transform`.
        """
        check_is_fitted(self, '_mask')
        return super().inverse_transform(X)

    def _check_params(self, X, y):
        """Validate ``self.cols`` against the training data ``X``.

        Emits a ColumnSelectorWarning for each requested name missing from
        ``X.columns`` and raises ValueError for mixed name/index input,
        names without column labels, or out-of-range indices. ``y`` is
        accepted for signature compatibility and is not used.
        """
        if self.cols is None:
            return
        cols = list(self.cols)
        if not cols:
            return
        is_name = [isinstance(col, str) for col in cols]
        if all(is_name):
            if not hasattr(X, 'columns'):
                raise ValueError('X needs to be pandas dataframe if '
                                 'cols are feature names.')
            for col in cols:
                if col not in X.columns:
                    warnings.warn('{} does not exist.'.format(col),
                                  ColumnSelectorWarning)
            return
        # isinstance rather than exact type comparison, so that Python ints
        # and NumPy integers may be combined in one index list.
        if any(is_name) or not all(isinstance(col, (int, np.integer))
                                   for col in cols):
            raise ValueError('cols should be all names or indices.')
        n_features = np.shape(X)[1]
        for col in cols:
            # The upper bound is exclusive: col == n_features is not a
            # valid position in the mask.
            if not 0 <= col < n_features:
                raise ValueError(
                    'cols should be 0 <= col < n_features; got %r. '
                    'Use cols=None to return all features.' % col)

    def _get_support_mask(self):
        """Return the boolean mask of selected features built by fit."""
        check_is_fitted(self, '_mask')
        return self._mask
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment