hermidalc · September 3, 2024 11:48
diff --git a/column_selector.py b/column_selector.py
 # Selecting columns by feature name requires X to be a pandas DataFrame.
 # In a Pipeline that means this selector must be the first step; if it is
 # not first but no feature selection step precedes it, column indices
 # can still be used.

 import warnings
 import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.utils import check_X_y
 from sklearn.feature_selection import SelectorMixin
 from sklearn.utils.validation import check_is_fitted


 class ColumnSelectorWarning(UserWarning):
    """Warning category emitted when a requested column name does not exist."""


 class ColumnSelector(SelectorMixin, BaseEstimator):
    """Column feature selector

    Keeps a fixed set of feature columns, given either as integer positions
    or as column names. Selection by name requires the ``X`` passed to
    :meth:`fit` to expose its labels through a ``columns`` attribute
    (e.g. a pandas DataFrame).

    Parameters
    ----------
    cols : array-like (default = None)
        A list specifying the feature indices to be selected. For example,
        [1, 4, 5] to select the 2nd, 5th, and 6th feature columns, and
        ['A','C','D'] to select the name of feature columns A, C and D.
        If None, returns all columns in the array. An empty sequence
        selects no columns.
    """

    def __init__(self, cols=None):
        self.cols = cols

    def fit(self, X, y):
        """Compute the boolean support mask for the requested columns.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training input data matrix.

        y : array-like, shape = (n_samples)
            Target values (class labels in classification, real numbers in
            regression).

        Returns
        ---------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If ``cols`` mixes names and indices, contains an index outside
            ``[0, n_features)``, or contains names while ``X`` has no
            column labels.
        """
        # check_X_y returns a plain ndarray, so the column labels have to be
        # captured before validation or name-based selection is impossible.
        columns = getattr(X, 'columns', None)
        X, y = check_X_y(X, y, dtype=None)
        self._check_params(X, y, columns=columns)
        n_features = X.shape[1]
        if self.cols is None:
            mask = np.ones(n_features, dtype=bool)
        elif len(self.cols) == 0:
            mask = np.zeros(n_features, dtype=bool)
        elif isinstance(self.cols[0], str):
            # Index.isin already yields a boolean ndarray; asarray keeps this
            # working for any array-like result.
            mask = np.asarray(columns.isin(list(self.cols)), dtype=bool)
        else:
            mask = np.zeros(n_features, dtype=bool)
            mask[list(self.cols)] = True
        self._mask = mask
        return self

    def transform(self, X):
        """Reduce X to the selected columns.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Input data matrix.

        Returns
        -------
        Xr : array of shape (n_samples, n_selected_features)
            Input data matrix with only the selected features.
        """
        check_is_fitted(self, '_mask')
        return super().transform(X)

    def inverse_transform(self, X):
        """Reverse the column selection.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Input transformed data matrix.

        Returns
        -------
        Xr : array of shape (n_samples, n_original_features)
            `X` with columns of zeros inserted where features would have
            been removed by :meth:`transform`.
        """
        check_is_fitted(self, '_mask')
        return super().inverse_transform(X)

    def _check_params(self, X, y, columns=None):
        """Validate ``self.cols`` against the training data.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Data whose second dimension bounds the valid indices.

        y : array-like
            Unused; kept for signature compatibility.

        columns : index-like or None (default = None)
            Column labels of the original input. When None, the labels are
            taken from ``X.columns`` if that attribute exists.

        Raises
        ------
        ValueError
            If ``cols`` mixes names and indices, holds non-integer indices,
            holds an index outside ``[0, n_features)``, or holds names while
            no column labels are available.
        """
        cols = self.cols
        # None selects everything and an empty selection has nothing to check.
        if cols is None or len(cols) == 0:
            return
        if columns is None:
            columns = getattr(X, 'columns', None)
        are_names = [isinstance(col, str) for col in cols]
        if all(are_names):
            if columns is None:
                raise ValueError('X needs to be pandas dataframe if '
                                 'cols are feature names.')
            for col in cols:
                if col not in columns:
                    # Unknown names are only reported; they select nothing.
                    warnings.warn('{} does not exist.'.format(col),
                                  ColumnSelectorWarning)
            return
        if any(are_names) or not all(
                isinstance(col, (int, np.integer)) for col in cols):
            raise ValueError('cols should be all names or indices.')
        n_features = X.shape[1]
        for col in cols:
            # Valid positions are 0 .. n_features - 1.
            if not 0 <= col < n_features:
                raise ValueError(
                    'cols should be 0 <= col < n_features; got %r. '
                    'Use cols=None to return all features.' % col)

    def _get_support_mask(self):
        """Return the boolean mask of selected features computed by fit."""
        check_is_fitted(self, '_mask')
        return self._mask
	# Selecting columns by feature name requires X to be a pandas DataFrame.
	# In a Pipeline that means this selector must be the first step; if it is
	# not first but no feature selection step precedes it, column indices
	# can still be used.

	import warnings
	import numpy as np
	from sklearn.base import BaseEstimator
	from sklearn.utils import check_X_y
	from sklearn.feature_selection import SelectorMixin
	from sklearn.utils.validation import check_is_fitted


	class ColumnSelectorWarning(UserWarning):
	    """Warning used to notify when column name does not exist."""


	class ColumnSelector(SelectorMixin, BaseEstimator):
	    """Column feature selector

	    Keeps a fixed set of feature columns, given either as integer
	    positions or as column names. Selection by name requires the ``X``
	    passed to :meth:`fit` to expose its labels through a ``columns``
	    attribute (e.g. a pandas DataFrame).

	    Parameters
	    ----------
	    cols : array-like (default = None)
	        A list specifying the feature indices to be selected. For example,
	        [1, 4, 5] to select the 2nd, 5th, and 6th feature columns, and
	        ['A','C','D'] to select the name of feature columns A, C and D.
	        If None, returns all columns in the array. An empty sequence
	        selects no columns.
	    """

	    def __init__(self, cols=None):
	        self.cols = cols

	    def fit(self, X, y):
	        """Compute the boolean support mask for the requested columns.

	        Parameters
	        ----------
	        X : array-like, shape = (n_samples, n_features)
	            Training input data matrix.

	        y : array-like, shape = (n_samples)
	            Target values (class labels in classification, real numbers in
	            regression).

	        Returns
	        ---------
	        self : object
	            Returns self.

	        Raises
	        ------
	        ValueError
	            If ``cols`` mixes names and indices, contains an index outside
	            ``[0, n_features)``, or contains names while ``X`` has no
	            column labels.
	        """
	        # check_X_y returns a plain ndarray, so the column labels have to
	        # be captured before validation or name-based selection is
	        # impossible.
	        columns = getattr(X, 'columns', None)
	        X, y = check_X_y(X, y, dtype=None)
	        self._check_params(X, y, columns=columns)
	        n_features = X.shape[1]
	        if self.cols is None:
	            mask = np.ones(n_features, dtype=bool)
	        elif len(self.cols) == 0:
	            mask = np.zeros(n_features, dtype=bool)
	        elif isinstance(self.cols[0], str):
	            # Index.isin already yields a boolean ndarray; asarray keeps
	            # this working for any array-like result.
	            mask = np.asarray(columns.isin(list(self.cols)), dtype=bool)
	        else:
	            mask = np.zeros(n_features, dtype=bool)
	            mask[list(self.cols)] = True
	        self._mask = mask
	        return self

	    def transform(self, X):
	        """Reduce X to the selected columns.

	        Parameters
	        ----------
	        X : array-like, shape = (n_samples, n_features)
	            Input data matrix.

	        Returns
	        -------
	        Xr : array of shape (n_samples, n_selected_features)
	            Input data matrix with only the selected features.
	        """
	        check_is_fitted(self, '_mask')
	        return super().transform(X)

	    def inverse_transform(self, X):
	        """Reverse the column selection.

	        Parameters
	        ----------
	        X : array-like, shape = (n_samples, n_features)
	            Input transformed data matrix.

	        Returns
	        -------
	        Xr : array of shape (n_samples, n_original_features)
	            `X` with columns of zeros inserted where features would have
	            been removed by :meth:`transform`.
	        """
	        check_is_fitted(self, '_mask')
	        return super().inverse_transform(X)

	    def _check_params(self, X, y, columns=None):
	        """Validate ``self.cols`` against the training data.

	        Parameters
	        ----------
	        X : array-like, shape = (n_samples, n_features)
	            Data whose second dimension bounds the valid indices.

	        y : array-like
	            Unused; kept for signature compatibility.

	        columns : index-like or None (default = None)
	            Column labels of the original input. When None, the labels
	            are taken from ``X.columns`` if that attribute exists.

	        Raises
	        ------
	        ValueError
	            If ``cols`` mixes names and indices, holds non-integer
	            indices, holds an index outside ``[0, n_features)``, or holds
	            names while no column labels are available.
	        """
	        cols = self.cols
	        # None selects everything and an empty selection has nothing to
	        # check.
	        if cols is None or len(cols) == 0:
	            return
	        if columns is None:
	            columns = getattr(X, 'columns', None)
	        are_names = [isinstance(col, str) for col in cols]
	        if all(are_names):
	            if columns is None:
	                raise ValueError('X needs to be pandas dataframe if '
	                                 'cols are feature names.')
	            for col in cols:
	                if col not in columns:
	                    # Unknown names are only reported; they select nothing.
	                    warnings.warn('{} does not exist.'.format(col),
	                                  ColumnSelectorWarning)
	            return
	        if any(are_names) or not all(
	                isinstance(col, (int, np.integer)) for col in cols):
	            raise ValueError('cols should be all names or indices.')
	        n_features = X.shape[1]
	        for col in cols:
	            # Valid positions are 0 .. n_features - 1.
	            if not 0 <= col < n_features:
	                raise ValueError(
	                    'cols should be 0 <= col < n_features; got %r. '
	                    'Use cols=None to return all features.' % col)

	    def _get_support_mask(self):
	        """Return the boolean mask of selected features computed by fit."""
	        check_is_fitted(self, '_mask')
	        return self._mask