Last active
September 3, 2024 11:48
-
-
Save hermidalc/e51ee5dc5321f57bba1312bdb7af5916 to your computer and use it in GitHub Desktop.
scikit-learn compatible ColumnSelector class
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Selecting columns by feature name requires X to be a pandas DataFrame.
# When used in a Pipeline with feature names, this selector must therefore
# be the first step. Column indices can still be used later in a Pipeline,
# as long as no feature selection step comes before this one.
import warnings | |
import numpy as np | |
from sklearn.base import BaseEstimator | |
from sklearn.utils import check_X_y | |
from sklearn.feature_selection import SelectorMixin | |
from sklearn.utils.validation import check_is_fitted | |
class ColumnSelectorWarning(UserWarning):
    """Warning category issued when a requested column name is not found."""
class ColumnSelector(SelectorMixin, BaseEstimator):
    """Feature selector that keeps an explicitly listed set of columns.

    Parameters
    ----------
    cols : array-like (default = None)
        Columns to keep, given either as integer positions (for example
        ``[1, 4, 5]`` selects the 2nd, 5th and 6th feature columns) or as
        column names (for example ``['A', 'C', 'D']``). Positions and names
        must not be mixed. Selecting by name requires ``X`` to be a pandas
        DataFrame when :meth:`fit` is called. A single string is treated as
        one column name. If None, all columns are kept.
    """

    def __init__(self, cols=None):
        self.cols = cols

    def fit(self, X, y):
        """Compute the boolean support mask for the requested columns.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training input data matrix.

        y : array-like, shape = (n_samples)
            Target values (class labels in classification, real numbers in
            regression).

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If ``cols`` mixes element types, contains names while ``X`` is
            not a pandas DataFrame, or contains an out-of-range index.
        """
        # Column labels have to be captured before validation: check_X_y
        # hands back a plain array, which no longer carries them.
        feature_names = X.columns if hasattr(X, 'iloc') else None
        X, y = check_X_y(X, y, dtype=None)
        self._check_params(X, y, feature_names)
        n_features = X.shape[1]
        cols = self._selected_cols()
        if cols is None:
            mask = np.ones(n_features, dtype=bool)
        elif cols and isinstance(cols[0], str):
            wanted = set(cols)
            mask = np.array([name in wanted for name in feature_names],
                            dtype=bool)
        else:
            # Also covers an empty selection, which keeps no column.
            mask = np.zeros(n_features, dtype=bool)
            mask[cols] = True
        self._mask = mask
        return self

    def transform(self, X):
        """Reduce X to the selected columns.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Input data matrix.

        Returns
        -------
        Xr : array of shape (n_samples, n_selected_features)
            Input data matrix with only the selected features.
        """
        check_is_fitted(self, '_mask')
        return super().transform(X)

    def inverse_transform(self, X):
        """Reverse the column selection.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_selected_features)
            Input transformed data matrix.

        Returns
        -------
        Xr : array of shape (n_samples, n_original_features)
            `X` with columns of zeros inserted where features would have
            been removed by :meth:`transform`.
        """
        check_is_fitted(self, '_mask')
        return super().inverse_transform(X)

    def _selected_cols(self):
        # Normalise ``cols`` to a list, or None when every column is kept.
        if self.cols is None:
            return None
        if isinstance(self.cols, str):
            # list('ABC') would silently split one name into characters.
            return [self.cols]
        return list(self.cols)

    def _check_params(self, X, y, feature_names=None):
        """Validate ``cols`` against the training data.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training input data matrix.

        y : array-like, shape = (n_samples)
            Target values; not inspected here.

        feature_names : sequence of column labels or None (default = None)
            Column labels of the original input. When None they are read
            from ``X.columns`` if ``X`` is a pandas DataFrame.

        Raises
        ------
        ValueError
            If ``cols`` mixes element types, contains names while no
            column labels are available, or contains an index outside
            ``[0, n_features)``.
        """
        cols = self._selected_cols()
        if not cols:
            # None (keep everything) and an empty selection need no checks.
            return
        if len({type(col) for col in cols}) > 1:
            raise ValueError('cols should be all names or indices.')
        if isinstance(cols[0], str):
            if feature_names is None and hasattr(X, 'iloc'):
                feature_names = X.columns
            if feature_names is None:
                raise ValueError('X needs to be pandas dataframe if '
                                 'cols are feature names.')
            known = set(feature_names)
            for col in cols:
                if col not in known:
                    # Unknown names are reported but do not abort the fit.
                    warnings.warn('{} does not exist.'.format(col),
                                  ColumnSelectorWarning)
        else:
            n_features = X.shape[1]
            for col in cols:
                # Upper bound is exclusive: n_features is not a valid index.
                if not 0 <= col < n_features:
                    raise ValueError(
                        'cols should be 0 <= col < n_features; got %r. '
                        'Use cols=None to return all features.' % col)

    def _get_support_mask(self):
        # Boolean mask over the fitted columns, used by SelectorMixin.
        check_is_fitted(self, '_mask')
        return self._mask
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment