Created
August 7, 2018 18:49
-
-
Save JCardenasRdz/71c6b555f62e0d2b31010bbae94d5b5d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def create_manufacturer(DataFrame, N_cases=3):
    """
    Return the top ``N_cases`` manufacturers, plus a catch-all bucket,
    in one-hot encoding format.

    The manufacturer is the first space-separated token of each entry in
    the ``name`` column. The ``N_cases`` most frequent manufacturers keep
    their own indicator column; every other row is pooled under
    ``OTHER_MAN``.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame with a string-valued ``name`` column.
    N_cases : int, default 3
        How many of the most frequent manufacturers keep a dedicated column.

    Returns
    -------
    pandas.DataFrame
        Result of ``pd.get_dummies``: one indicator column per kept
        manufacturer, plus ``OTHER_MAN`` when at least one row falls
        outside the top ``N_cases``.

    Notes
    -----
    Rows whose name is missing (NaN/None) are pooled under ``OTHER_MAN``
    rather than raising ``AttributeError``.
    """
    # Vectorized string accessor: tolerates missing values, unlike a
    # per-row ``x.split(' ')`` call.
    man = DataFrame['name'].str.split(' ').str[0]
    # value_counts() sorts by descending frequency, so the head of its index
    # holds the most common manufacturers.
    top = man.value_counts().index[:N_cases]
    # A single boolean mask pools everything outside the top group; no
    # replacement list has to be built for the (possibly long) tail.
    manufacturer_encoded = man.where(man.isin(top), 'OTHER_MAN')
    return pd.get_dummies(manufacturer_encoded)
class cat_cleaner(BaseEstimator, TransformerMixin):
    """
    Creates and encodes the manufacturer variable.

    Inputs:
        n_cases = number of most frequent manufacturers to encode; it is
                  forwarded to ``create_manufacturer`` as ``N_cases``
    """

    def __init__(self, n_cases=2):
        self.n_cases = n_cases

    def fit(self, X, y=None):
        """Learn nothing from ``X``; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the one-hot manufacturer columns computed from ``X``."""
        assert isinstance(X, pd.DataFrame)
        # NOTE(review): fit() stores nothing, so the encoding is computed
        # from whichever frame is passed here -- confirm callers expect that.
        encoded = create_manufacturer(X, N_cases=self.n_cases)
        return encoded
# Create a (name, transformer) tuple for the categorical cleaner.
cat_data_cleaner = (
    'cat_cleaner',
    cat_cleaner(n_cases=5),
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment