Created
August 7, 2018 18:18
-
-
Save JCardenasRdz/321bf502fb1aa6d7a804bf98fcd7f650 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #%% 2. Create pipeline to clean continuous data | |
| from sklearn.base import TransformerMixin, BaseEstimator | |
| import pandas as pd | |
def center_and_scale_data(DataFrame):
    """
    Return a centered and scaled copy of ``DataFrame``.

    Each column has its own mean subtracted and is then divided by its own
    standard deviation (pandas' default ``Series.std``). The input frame is
    not modified.

    Columns whose standard deviation is zero (constant column) or undefined
    (e.g. a single row) are only centered: dividing by 0 or NaN would turn
    the whole column into NaN, so a unit divisor is used instead.

    Inputs:
    DataFrame = pandas DataFrame whose columns are all to be standardized

    Returns:
    new pandas DataFrame with the same index and columns
    """
    scaled = DataFrame.copy()
    for name in scaled.columns:
        column = DataFrame[name]
        spread = column.std()
        # Guard the division: a constant column gives 0/0 and a one-row
        # column gives 0/NaN, both of which would otherwise produce NaN.
        if pd.isna(spread) or spread == 0:
            spread = 1.0
        scaled[name] = (column - column.mean()) / spread
    return scaled
class cont_cleaner(BaseEstimator, TransformerMixin):
    """
    Extract a set of columns and optionally center and scale them.

    Inputs:
    cols = list of strings (columns to be extracted)
    center = boolean to decide if data should be centered and scaled

    Note: fit() learns nothing. When centering is enabled, the statistics
    are computed by center_and_scale_data from whichever frame is passed to
    transform(), not from the data seen in fit().
    """

    def __init__(self, cols, center=True):
        self.center = center
        self.cols = cols

    def fit(self, X, y=None):
        """Return self unchanged; no state is derived from X or y."""
        return self

    def transform(self, X):
        """
        Return the configured columns of ``X`` as a new DataFrame.

        Inputs:
        X = pandas DataFrame containing every column named in ``cols``

        Returns:
        the selected columns, centered and scaled when ``center`` is truthy,
        otherwise an unscaled copy of them

        Raises:
        TypeError if ``X`` is not a pandas DataFrame
        """
        # Explicit raise instead of ``assert``: assertions are stripped when
        # Python runs with -O, which would silently disable this check.
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                "cont_cleaner.transform expects a pandas DataFrame, got "
                + type(X).__name__
            )
        selected = X[self.cols]
        if self.center:
            return center_and_scale_data(selected)
        return selected.copy()
# Names of the continuous columns the cleaner should extract.
list_of_cont_variables = ['horsepower', 'cylinders', 'displacement']

# (name, transformer) tuple pairing a label with a configured cont_cleaner.
cont_data_cleaner = (
    'cont_cleaner',
    cont_cleaner(cols=list_of_cont_variables),
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment