Last active
April 7, 2018 13:33
-
-
Save franc3000/f3b79c299b26bf628d060df7d070f728 to your computer and use it in GitHub Desktop.
Feature engineering automation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from sklearn.pipeline import Pipeline | |
from sklearn.preprocessing import StandardScaler, Binarizer | |
from sklearn.base import TransformerMixin, BaseEstimator | |
""" | |
PyData Chicago | |
Franklin Sarkett | |
[email protected] | |
Work Hard Once | |
Strategy and Automation applied to building machine learning models | |
""" | |
class DataFrameColumnExtractor(TransformerMixin, BaseEstimator):
    """Pull a single column out of a DataFrame, returning it as a DataFrame.

    Parameters
    ----------
    column : label
        Name of the column that ``transform`` selects from its input frame.
    """

    def __init__(self, column):
        self.column = column

    def transform(self, df):
        """Return ``df[[self.column]]`` as an independent one-column frame.

        Any selected column whose values are all missing is filled with 0.
        The input frame is never modified.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame containing ``self.column``.

        Returns
        -------
        pandas.DataFrame
            A copy of the selected column, still two-dimensional.
        """
        # Work on an explicit copy: assigning into the bare ``df[[...]]``
        # selection is chained assignment (SettingWithCopyWarning) and the
        # zero-fill below must not be tied to the caller's frame.
        df_col = df[[self.column]].copy()
        # If all values are NaN, replace them with 0 (presumably so later
        # steps that compute statistics receive finite numbers).
        for c in df_col.columns:
            if df_col[c].isnull().all():
                df_col[c] = df_col[c].fillna(0)
        return df_col

    def fit(self, *_):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self
class DataFrameImputer(TransformerMixin, BaseEstimator):
    """Impute missing values with medians learned during ``fit``.

    For a DataFrame, each column's missing entries are replaced with that
    column's median. For a Series, missing entries are replaced with the
    median of the Series.

    Attributes
    ----------
    fill : scalar or pandas.Series
        Replacement value(s). It is 0 until ``fit`` is called; afterwards it
        is a Series indexed by column name (DataFrame input) or a single
        scalar (Series input).
    """

    def __init__(self):
        self.fill = 0

    def fit(self, df, y=None):
        """Compute the median(s) later used to fill missing values.

        Parameters
        ----------
        df : pandas.DataFrame or pandas.Series
            Data to learn the medians from; NaN entries are skipped.
        y : ignored
            Accepted so the transformer can be used as a pipeline step.

        Returns
        -------
        DataFrameImputer
            ``self``.

        Raises
        ------
        ValueError
            If ``df`` is neither a DataFrame nor a Series.
        """
        if not isinstance(df, (pd.DataFrame, pd.Series)):
            raise ValueError('var `df` type is not a DataFrame or Series, it is a {}'.format(type(df)))
        if isinstance(df, pd.Series):
            # A Series has no ``.columns`` and iterating it yields values,
            # not labels, so it needs a single scalar fill value instead.
            self.fill = df.median(skipna=True)
        else:
            self.fill = pd.Series(
                [df[c].median(skipna=True) for c in df],
                index=df.columns,
            )
        return self

    def transform(self, df, y=None):
        """Return ``df`` with missing values replaced by the learned fill.

        A column whose learned median is itself NaN (no observed values
        during ``fit``) is left unfilled.
        """
        return df.fillna(self.fill)
class StandardScalerLimitTransformer(TransformerMixin, BaseEstimator):
    """Clamp values to the closed range ``[min_value, max_value]``.

    Parameters
    ----------
    min_value : number, default -3
        Values below this are replaced with ``min_value``.
    max_value : number, default 3
        Values above this are replaced with ``max_value``.
    """

    def __init__(self, min_value=-3, max_value=3):
        self.min_value = min_value
        self.max_value = max_value

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a clamped copy of ``X``.

        Parameters
        ----------
        X : array-like
            Values to limit.
        y : ignored
            Accepted so the transformer can be used as a pipeline step.

        Returns
        -------
        Clamped data; ``X`` itself is left untouched. NaN entries stay NaN.
        """
        # np.clip builds a new result rather than overwriting X in place, so
        # the caller's data (or an upstream step's output) is never mutated.
        return np.clip(X, self.min_value, self.max_value)
def build_transformed_dataset(df, y):
    """Append imputed, scaled and clamped feature columns to ``df``.

    Two identical processing chains are built, one for ``SquareFootage`` and
    one for ``TaxAssessedValue``: extract the column, impute missing values,
    standard-scale, then limit extremes. Both chains are combined in a
    ``DataFrameFeatureUnion``, fitted on ``df``, and the result is
    concatenated column-wise onto the original frame.

    NOTE(review): ``DataFrameFeatureUnion`` is defined outside this section;
    this assumes its ``fit_transform`` returns something ``pd.concat`` can
    join with ``df`` -- confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame containing the two source columns.
    y : any
        Passed through untouched.

    Returns
    -------
    tuple
        ``(df_out, y)`` where ``df_out`` is ``df`` with the new columns added.
    """

    def make_chain(first_step_name, source_column):
        # Every chain shares the same tail; only the extractor step differs.
        return Pipeline([
            (first_step_name, DataFrameColumnExtractor(source_column)),
            ('df', DataFrameImputer()),
            ('scaler', StandardScaler()),
            ('minmaxlimit', StandardScalerLimitTransformer()),
        ])

    branches = [
        ('sqft', make_chain('Sqft', 'SquareFootage')),
        ('tav', make_chain('TaxAssessedValue', 'TaxAssessedValue')),
    ]

    # Fit and apply every branch of the union in one pass.
    union = DataFrameFeatureUnion(branches)
    engineered = union.fit_transform(df)

    # axis=1: add the engineered features as extra columns.
    combined = pd.concat([df, engineered], axis=1)
    return combined, y
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment