Skip to content

Instantly share code, notes, and snippets.

@e-mon
Last active June 26, 2018 08:48
Show Gist options
  • Save e-mon/35e3ab798ef67e821b3752c0cfa793f3 to your computer and use it in GitHub Desktop.
Save e-mon/35e3ab798ef67e821b3752c0cfa793f3 to your computer and use it in GitHub Desktop.
# from https://www.kaggle.com/samratp/wordbatch-ridge-fm-frtl-target-encoding-lgbm/notebook
class TargetEncoder:
    """Smoothed target (mean) encoding for categorical columns.

    Adapted from
    https://www.kaggle.com/ogrellier/python-target-encoding-for-categorical-features

    Every category value is replaced by a blend of that category's mean
    target and the global target mean (the "prior")::

        weight  = 1 / (1 + exp(-(count - min_samples_leaf) / smoothing))
        encoded = prior * (1 - weight) + category_mean * weight

    so the larger a category's count, the less the prior is taken into
    account. Values that receive no statistic from the train data are
    encoded as the prior.
    """

    # Fixed internal column names used while grouping/merging. Using these
    # instead of the callers' series names keeps the encoder working when a
    # feature happens to be called 'index', 'average', or like the target.
    _KEY = 'key'
    _TARGET = 'target'
    _AVERAGE = 'average'

    def __repr__(self):
        return 'TargetEncoder'

    def __init__(self, smoothing=1, min_samples_leaf=1, noise_level=0, keep_original=False, suffix='enc'):
        """Store the encoder settings.

        smoothing        -- divisor inside the sigmoid weight (see class doc).
        min_samples_leaf -- count at which the weight equals 0.5.
        noise_level      -- scale of the multiplicative Gaussian noise applied
                            to the encoded values.
        keep_original    -- if True, ``encode`` writes to ``col + suffix``
                            and leaves ``col`` untouched; otherwise ``col``
                            is overwritten.
        suffix           -- default suffix used when ``keep_original`` is True.
        """
        self.smoothing = smoothing
        self.min_samples_leaf = min_samples_leaf
        self.noise_level = noise_level
        self.keep_original = keep_original
        self.suffix = suffix

    @staticmethod
    def add_noise(series, noise_level):
        """Return ``series`` scaled by ``1 + noise_level * N(0, 1)`` per element."""
        return series * (1 + noise_level * np.random.randn(len(series)))

    def encode(self, train, test, target, cols, suffix=None):
        """Target-encode ``cols`` of ``train`` and ``test``.

        Both frames are modified in place and also returned as
        ``(train, test)``. ``cols`` may be an iterable of column names or a
        single column name given as a string. ``suffix`` overrides the
        instance default for this call.
        """
        if suffix is None:
            suffix = self.suffix
        # A bare string would otherwise be iterated character by character.
        if isinstance(cols, str):
            cols = [cols]
        for col in cols:
            encoded_trn, encoded_tst = self.encode_column(train[col], test[col], target)
            destination = col + suffix if self.keep_original else col
            train[destination], test[destination] = encoded_trn, encoded_tst
        return train, test

    @classmethod
    def _apply_averages(cls, series, lookup, prior, out_name):
        """Map ``series`` through ``lookup`` (columns key/average), defaulting to ``prior``."""
        merged = pd.merge(series.to_frame(cls._KEY), lookup, on=cls._KEY, how='left')
        encoded = merged[cls._AVERAGE].rename(out_name).fillna(prior)
        # pd.merge does not keep the index so restore it
        encoded.index = series.index
        return encoded

    def encode_column(self, trn_series, tst_series, target):
        """Encode one feature and return ``(encoded_train, encoded_test)``.

        Statistics are computed from ``trn_series``/``target`` only and then
        applied to both series. The returned series keep the index of their
        inputs and are named ``'<trn_series.name>_mean'``.
        """
        # Label the two columns with internal names so that the callers'
        # series names can never collide with each other or with the
        # bookkeeping columns used below.
        temp = pd.concat([trn_series, target], axis=1, keys=[self._KEY, self._TARGET])
        # Compute target mean and number of samples per category
        stats = temp.groupby(by=self._KEY)[self._TARGET].agg(["mean", "count"])
        # Compute smoothing weight
        weight = 1 / (1 + np.exp(-(stats["count"] - self.min_samples_leaf) / self.smoothing))
        # Global mean of the target, used as prior and as fallback value
        prior = target.mean()
        # The bigger the count the less the prior is taken into account
        blended = prior * (1 - weight) + stats["mean"] * weight
        # Two-column frame (key, average), built once and shared by both merges
        lookup = blended.rename(self._AVERAGE).reset_index()

        # format() rather than '+' so non-string column labels also work
        out_name = '{}_mean'.format(trn_series.name)
        ft_trn_series = self._apply_averages(trn_series, lookup, prior, out_name)
        ft_tst_series = self._apply_averages(tst_series, lookup, prior, out_name)
        # Noise is drawn for train first, then test
        return self.add_noise(ft_trn_series, self.noise_level), self.add_noise(ft_tst_series, self.noise_level)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment