Skip to content

Instantly share code, notes, and snippets.

@JCardenasRdz
Created August 7, 2018 18:18
Show Gist options
  • Select an option

  • Save JCardenasRdz/321bf502fb1aa6d7a804bf98fcd7f650 to your computer and use it in GitHub Desktop.

Select an option

Save JCardenasRdz/321bf502fb1aa6d7a804bf98fcd7f650 to your computer and use it in GitHub Desktop.
#%% 2. Create pipeline to clean continuous data
from sklearn.base import TransformerMixin, BaseEstimator
import pandas as pd
def center_and_scale_data(DataFrame):
    """
    Standardize every column of a DataFrame.

    Each column is shifted by its own mean and divided by its own sample
    standard deviation (the pandas default, ``ddof=1``). The input frame is
    not modified; a new frame with the same index and columns is returned.

    A column whose standard deviation is zero (constant column) or undefined
    (NaN, e.g. a frame with a single row) is only centered: dividing by that
    value would turn the whole column into NaN/inf, so a divisor of 1 is
    used instead.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame whose columns support ``mean``/``std`` and arithmetic
        (i.e. numeric columns).

    Returns
    -------
    pandas.DataFrame
        Copy of the input holding the centered and scaled values.
    """
    D = DataFrame.copy()
    for C in D.columns:
        S = DataFrame[C]
        spread = S.std()
        # Guard against 0/0 and x/NaN so degenerate columns come out as
        # centered values instead of being wiped out.
        if pd.isna(spread) or spread == 0:
            spread = 1.0
        D[C] = (S - S.mean()) / spread
    return D
class cont_cleaner(BaseEstimator, TransformerMixin):
    """
    Extract continuous columns from a DataFrame and optionally standardize them.

    Parameters
    ----------
    cols : list of str
        Names of the columns to extract.
    center : bool, default True
        When true, the extracted columns are centered and scaled; otherwise
        a plain copy of the extracted columns is returned.

    Attributes
    ----------
    mean_, std_ : dict or None
        Per-column mean and sample standard deviation learned by ``fit``.
        ``None`` when ``fit`` ran with ``center`` switched off.

    Notes
    -----
    The statistics are learned in ``fit`` and re-used in ``transform`` so
    that new data (a test split, a single row to predict on) is scaled with
    the training statistics rather than with its own.
    """

    def __init__(self, cols, center=True):
        self.center = center
        self.cols = cols

    @staticmethod
    def _require_dataframe(X):
        """Raise ``TypeError`` unless *X* is a pandas DataFrame."""
        # An explicit raise (not ``assert``) so the check survives ``python -O``.
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                "cont_cleaner expects a pandas DataFrame, got %s"
                % type(X).__name__
            )

    def fit(self, X, y=None):
        """
        Learn the per-column mean and standard deviation of ``cols``.

        Nothing is learned when ``center`` is false. ``y`` is ignored and
        only present for pipeline compatibility. Returns ``self``.
        """
        # Drop statistics from any earlier fit so stale values are never used.
        self.mean_ = None
        self.std_ = None
        if self.center:
            self._require_dataframe(X)
            means, stds = {}, {}
            for col in self.cols:
                column = X[col]
                spread = column.std()
                # A zero or undefined spread would turn the column into
                # NaN/inf at transform time; fall back to centering only.
                if pd.isna(spread) or spread == 0:
                    spread = 1.0
                means[col] = column.mean()
                stds[col] = spread
            self.mean_ = means
            self.std_ = stds
        return self

    def transform(self, X):
        """
        Return the ``cols`` columns of *X*, standardized when ``center`` is true.

        Uses the statistics learned by ``fit``. If ``fit`` has not been
        called (or was called with ``center`` off), the columns are
        standardized with their own statistics via ``center_and_scale_data``,
        which is what this transformer did before it kept fitted state.

        Raises
        ------
        TypeError
            If *X* is not a pandas DataFrame.
        """
        self._require_dataframe(X)
        selected = X[self.cols]
        if not self.center:
            return selected.copy()

        means = getattr(self, "mean_", None)
        stds = getattr(self, "std_", None)
        if means is None or stds is None:
            return center_and_scale_data(selected)

        final_data = selected.copy()
        for col in self.cols:
            final_data[col] = (selected[col] - means[col]) / stds[col]
        return final_data
# Continuous columns the cleaner should extract from the raw DataFrame.
list_of_cont_variables = [
    'horsepower',
    'cylinders',
    'displacement',
]

# Pair the step name with the transformer instance as a (name, transformer) tuple.
cont_data_cleaner = (
    'cont_cleaner',
    cont_cleaner(cols=list_of_cont_variables),
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment