Created
August 6, 2017 10:26
-
-
Save geoHeil/5caff5236b4850d673b2c9b0799dc2ce to your computer and use it in GitHub Desktop.
Scikit-learn multi-column label encoder with automatic most-frequent imputation of unseen labels
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class EncodeCategorical(TransformerMixin):
    """Label-encode data frame columns, imputing unseen labels.

    Encodes a specified list of columns, or all columns if ``columns`` is
    None. At transform time, labels that were not seen during ``fit`` are
    replaced by the most frequent value of that column (as observed during
    ``fit``) before being encoded.

    Attributes set by ``fit``:
        encoders: dict mapping column name -> fitted ``LabelEncoder``.
        fillNewLabels: ``pd.Series`` indexed by column name holding the
            most frequent value of each column.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.encoders = None
        self.fillNewLabels = None

    def fit(self, X, y=None, **kwargs):
        """Fit one label encoder per column.

        Expects a data frame with named columns to encode.

        Args:
            X: data frame containing the columns to encode.
            y: ignored; optional so that ``fit_transform(X)`` (which calls
                ``fit(X)`` without a target) works.
            **kwargs: ignored.

        Returns:
            self
        """
        # Encode all columns if columns is None
        if self.columns is None:
            self.columns = X.columns

        # Fit a label encoder for each column in the data frame
        self.encoders = {}
        most_frequent = []
        for column in self.columns:
            # Values are stringified to avoid the "unorderable types:
            # str() > float()" error on columns with mixed types.
            as_text = X[column].astype(str)
            self.encoders[column] = LabelEncoder().fit(as_text)

            # Collect the most frequent label per column as replacement for
            # unseen labels. value_counts() drops missing values, so for an
            # all-missing column fall back to the stringified values (the
            # encoder was fitted on those, so the result is a known class).
            counts = X[column].value_counts()
            if counts.empty:
                counts = as_text.value_counts()
            most_frequent.append(counts.index[0])

        self.fillNewLabels = pd.Series(most_frequent, index=self.columns)
        return self

    def transform(self, data):
        """Encode the fitted columns of ``data``.

        Uses the fitted encoders to transform a data frame. Values are
        compared as strings; any value not among an encoder's fitted
        classes is replaced by that column's most frequent label first.

        Args:
            data: data frame containing every column seen during ``fit``.

        Returns:
            A new data frame with the fitted columns replaced by their
            integer codes; ``data`` itself is left unmodified.
        """
        # Work on a copy: mutating the caller's frame would stringify its
        # columns, and a second transform() on the same frame would then
        # treat the already-encoded integers as unseen labels.
        data = data.copy()
        for column, encoder in self.encoders.items():
            replacement = str(self.fillNewLabels[column])
            as_text = data[column].astype(str)
            known = as_text.isin(encoder.classes_)
            as_text = as_text.where(known, replacement)
            # Plain column assignment (not .loc[:, column]) so the column
            # takes the integer dtype of the codes instead of being written
            # into the existing object-dtype column.
            data[column] = encoder.transform(as_text)
        return data
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment