Skip to content

Instantly share code, notes, and snippets.

@JCardenasRdz
Created August 7, 2018 18:49
Show Gist options
  • Select an option

  • Save JCardenasRdz/71c6b555f62e0d2b31010bbae94d5b5d to your computer and use it in GitHub Desktop.

Select an option

Save JCardenasRdz/71c6b555f62e0d2b31010bbae94d5b5d to your computer and use it in GitHub Desktop.
def create_manufacturer(DataFrame, N_cases=3):
    """
    One-hot encode the manufacturer taken from the ``name`` column.

    The manufacturer is the text before the first space of each ``name``
    value. Manufacturers are ranked by how often they occur; the first
    ``N_cases`` keep their own column and every remaining one is pooled
    under the single label ``OTHER_MAN``, so the result has at most
    ``N_cases + 1`` columns.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame with a ``name`` column.
    N_cases : int, default 3
        Number of most frequent manufacturers that keep their own column.

    Returns
    -------
    pandas.DataFrame
        Result of ``pd.get_dummies`` on the pooled manufacturer labels,
        one row per input row.

    Notes
    -----
    Entries of ``name`` that are not strings (e.g. ``None`` / ``NaN``) have
    no manufacturer to extract; they are pooled under ``OTHER_MAN`` instead
    of raising ``AttributeError``.
    """
    # Non-string entries have no ``split``; mark them as missing here and
    # pool them below rather than crashing on the first missing name.
    man = DataFrame['name'].map(
        lambda x: x.split(' ')[0] if isinstance(x, str) else None
    )

    # value_counts() lists labels by descending frequency, so everything
    # from position N_cases onward is a label to be pooled.
    rare_cases = man.value_counts().index[N_cases:]

    # One boolean mask instead of a list-to-list replace: no throwaway list
    # of repeated 'OTHER_MAN' strings and a single membership test.
    pooled = man.isin(rare_cases) | man.isna()
    manufacturer_encoded = man.mask(pooled, 'OTHER_MAN')

    return pd.get_dummies(manufacturer_encoded)
class cat_cleaner(BaseEstimator, TransformerMixin):
    """
    Transformer that creates and one-hot encodes the manufacturer variable.

    Parameters
    ----------
    n_cases : int, default 2
        Number of most frequent manufacturers to encode individually;
        forwarded to ``create_manufacturer`` as ``N_cases``.

    Notes
    -----
    NOTE(review): ``fit`` stores nothing, so ``transform`` hands each frame
    to ``create_manufacturer``, which ranks manufacturers from that frame
    alone. Frames with different manufacturer frequencies can therefore
    yield different output columns -- confirm this is acceptable wherever
    the transformer is applied to more than one dataset.
    """

    def __init__(self, n_cases=2):
        self.n_cases = n_cases

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """
        Return the one-hot encoded manufacturers of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame passed on to ``create_manufacturer``.

        Returns
        -------
        The value returned by ``create_manufacturer(X, N_cases=self.n_cases)``.

        Raises
        ------
        TypeError
            If ``X`` is not a ``pandas.DataFrame``.
        """
        # Explicit check instead of ``assert``: assertions are removed when
        # Python runs with -O, which would silently skip the validation.
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                "cat_cleaner.transform expects a pandas DataFrame, got "
                + type(X).__name__
            )
        return create_manufacturer(X, N_cases=self.n_cases)
# (name, transformer) tuple for the manufacturer encoder.
# NOTE(review): presumably consumed as a named pipeline step -- confirm
# against the code that uses it.
cat_data_cleaner = (
    'cat_cleaner',
    cat_cleaner(n_cases=5),
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment