Skip to content

Instantly share code, notes, and snippets.

@geoHeil
Created August 6, 2017 10:26
Show Gist options
  • Save geoHeil/5caff5236b4850d673b2c9b0799dc2ce to your computer and use it in GitHub Desktop.
Save geoHeil/5caff5236b4850d673b2c9b0799dc2ce to your computer and use it in GitHub Desktop.
Scikit-learn multi-label encoder with automatic most-frequent imputation of unseen labels
class EncodeCategorical(TransformerMixin):
    """
    Label-encode a specified list of columns, or all columns if ``columns`` is None.

    Labels that were not seen during ``fit`` are imputed at transform time with
    the most frequent label of the corresponding training column.

    Attributes set by ``fit``:
        encoders: dict mapping column name -> fitted ``LabelEncoder``.
        fillNewLabels: ``pd.Series`` indexed by column name holding the
            (stringified) most frequent training label of each column.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.encoders = None
        self.fillNewLabels = None

    def fit(self, X, y=None, **kwargs):
        """
        Fit one label encoder per column.

        Expects a data frame with named columns to encode. ``y`` and
        ``kwargs`` are accepted for API compatibility and are not used;
        ``y`` is optional so that ``fit_transform(X)`` works without a target.

        Returns ``self``.
        """
        # Resolve the columns locally instead of overwriting ``self.columns``:
        # with ``columns=None`` a refit on a different frame must pick up that
        # frame's columns rather than the ones remembered from the first fit.
        columns = list(X.columns if self.columns is None else self.columns)

        self.encoders = {}
        replacements = []
        for column in columns:
            # Cast to str to avoid the "unorderable types: str() > float()"
            # error on columns that mix strings and numbers / NaN.
            labels = X[column].astype(str)
            self.encoders[column] = LabelEncoder().fit(labels)

            # Most frequent non-missing label, counted on the stringified
            # values so the replacement is guaranteed to be a fitted class.
            counts = labels[X[column].notna()].value_counts()
            if counts.empty:
                # Column holds only missing values: fall back to whatever the
                # stringified column contains (e.g. "nan").
                counts = labels.value_counts()
            replacements.append(counts.index[0] if not counts.empty else None)

        self.fillNewLabels = pd.Series(replacements, index=columns, dtype=object)
        return self

    def transform(self, data):
        """
        Use the fitted encoders to transform a data frame.

        Returns an encoded copy of ``data``; the input frame is left untouched.
        Columns without a fitted encoder are passed through unchanged.
        """
        # Work on a copy so the caller's frame is not modified as a side effect.
        data = data.copy()
        for column, encoder in self.encoders.items():
            values = data[column].astype(str)
            unseen = ~values.isin(encoder.classes_)
            if unseen.any():
                values = values.mask(unseen, str(self.fillNewLabels[column]))
            # Plain column assignment replaces the column (and its dtype);
            # ``.loc[:, column] = ...`` would write into the existing
            # object-dtype column on recent pandas versions.
            data[column] = encoder.transform(values)
        return data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment