Created
November 9, 2019 04:19
-
-
Save devanshuDesai/d3bdc9270395490cae3b690632445e0e to your computer and use it in GitHub Desktop.
An example of a feature engineering + model pipeline I made
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.compose import ColumnTransformer | |
from sklearn.preprocessing import FunctionTransformer | |
from sklearn.preprocessing import OneHotEncoder | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.model_selection import train_test_split | |
from sklearn.model_selection import GridSearchCV | |
from sklearn.model_selection import StratifiedKFold | |
from sklearn.pipeline import Pipeline | |
from sklearn.decomposition import PCA | |
from sklearn.linear_model import LogisticRegression | |
import os | |
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import warnings | |
# Suppress every warning category for the remainder of the script.
warnings.filterwarnings(action="ignore")
plt.style.use('ggplot')

# Raw data set; `bike` is loaded here but not referenced again in the
# visible part of this script.
bike = pd.read_csv(filepath_or_buffer='bikeshare.csv')

# NOTE(review): `X` and `y` are not defined anywhere in the visible source.
# Presumably they are the feature frame and target column derived from
# `bike` -- confirm and define them before this line, otherwise the split
# raises NameError.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)
def transform_time(X):
    """Turn every datetime-like column of X into POSIX timestamps.

    X is wrapped in a DataFrame, each column is parsed with
    ``pd.to_datetime`` and every parsed value is replaced by the float
    returned from its ``timestamp()`` method (seconds), so the result can
    be consumed as an ordinary numeric feature.

    Returns a DataFrame with the same columns as ``pd.DataFrame(X)``.
    """
    def _column_to_epoch(column):
        # Parse first, then convert value by value.
        parsed = pd.to_datetime(column)
        return parsed.apply(lambda stamp: stamp.timestamp())

    frame = pd.DataFrame(X)
    return frame.apply(_column_to_epoch)
def as_is(df):
    """Identity transform: hand back the very object that was passed in.

    Lets columns flow through a FunctionTransformer untouched.
    """
    return df
""" | |
Handling Categorical Data - Nominal / Ordinal
""" | |
# Baseline preprocessing: timestamps become seconds, the two categorical
# columns are one-hot encoded, and the listed numeric columns are forwarded
# unchanged. Any column not named here is dropped.
col_base = ColumnTransformer(
    transformers=[
        (
            'convert_time',
            FunctionTransformer(func=transform_time, validate=False),
            ['starttime'],
        ),
        (
            'ohe',
            OneHotEncoder(),
            ['usertype', 'gender'],
        ),
        (
            'as_is',
            FunctionTransformer(func=as_is, validate=True),
            [
                'start station latitude', 'start station longitude',
                'end station latitude', 'end station longitude',
                'birth year', 'bikeid',
            ],
        ),
    ],
    remainder='drop',
)

# Baseline model: the preprocessing above followed by a default
# logistic regression, fitted on the training split.
base_pipe = Pipeline(
    steps=[
        ('transform', col_base),
        ('model', LogisticRegression()),
    ]
)
base_pipe.fit(X_train, y_train)
""" | |
Feature Engineering | |
""" | |
def get_time_features(vals):
    """Derive calendar features from a single datetime column.

    Parameters
    ----------
    vals : pandas.DataFrame or 2-D array-like
        Only the first column is read; it is parsed with ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        Five columns, in this order: day of week, month, hour, weekday flag
        (1 when day of week < 5, else 0) and workhour flag (1 when the hour
        is 8 or 17, else 0).
    """
    # Wrapping in a DataFrame lets plain arrays work as well as DataFrames.
    stamps = pd.to_datetime(pd.DataFrame(vals).iloc[:, 0])
    dow = stamps.dt.dayofweek
    month = stamps.dt.month
    hour = stamps.dt.hour
    weekday = (dow < 5).astype(int)
    # NOTE(review): membership in [8, 17] flags exactly the hours 8 and 17.
    # If the whole 8-17 interval was intended, use hour.between(8, 17).
    workhour = hour.isin([8, 17]).astype(int)
    # workhour used to be computed and then left out of the result.
    return pd.concat([dow, month, hour, weekday, workhour], axis=1)
def get_hav_distance(coords, radius_km=6373.0):
    """Return the Haversine (great-circle) distance for each row of coords.

    Parameters
    ----------
    coords : pandas.DataFrame or 2-D array-like
        Four columns in degrees, read by position: start latitude,
        start longitude, end latitude, end longitude.
    radius_km : float, default 6373.0
        Sphere radius used for the distance (approximate radius of the
        earth in km); the result is in the same unit.

    Returns
    -------
    pandas.DataFrame
        One column holding the distance for every input row.
    """
    coords = pd.DataFrame(coords).astype(float)
    s_lat, s_lng, e_lat, e_lng = (
        np.deg2rad(coords.iloc[:, idx]) for idx in range(4)
    )

    half_dlat = (e_lat - s_lat) / 2
    half_dlng = (e_lng - s_lng) / 2
    # The parentheses matter: without them the second term is parsed as a
    # separate statement and is never added to the first.
    d = (np.sin(half_dlat) ** 2
         + np.cos(s_lat) * np.cos(e_lat) * np.sin(half_dlng) ** 2)
    # Guard against rounding pushing d marginally outside arcsin's domain.
    d = d.clip(0.0, 1.0)

    out = 2 * radius_km * np.arcsin(np.sqrt(d))
    return pd.DataFrame(out)
def get_age(vals, reference_year=2019):
    """Convert birth years to ages relative to a reference year.

    Parameters
    ----------
    vals : pandas.DataFrame or array-like
        Birth years; wrapped in a DataFrame before the subtraction.
    reference_year : int, default 2019
        Year the age is measured in.

    Returns
    -------
    pandas.DataFrame
        ``reference_year - birth year`` for every input cell.
    """
    return reference_year - pd.DataFrame(vals)
# Trip distance sub-pipeline: Haversine distance first, then standardization.
dist = Pipeline(
    steps=[
        ('calc_dist', FunctionTransformer(func=get_hav_distance)),
        ('standardize', StandardScaler()),
    ]
)

# Route each group of raw columns to its own feature-engineering step.
col = ColumnTransformer(
    transformers=[
        (
            'convert_date',
            FunctionTransformer(func=get_time_features, validate=False),
            ['starttime'],
        ),
        (
            'ohe',
            OneHotEncoder(),
            ['usertype', 'gender'],
        ),
        (
            'distance',
            dist,
            [
                'start station latitude', 'start station longitude',
                'end station latitude', 'end station longitude',
            ],
        ),
        (
            'convert_age',
            FunctionTransformer(func=get_age),
            ['birth year'],
        ),
    ]
)
""" | |
Using PCA to reduce dimensionality and produce a more generalizable model
""" | |
# Engineered features followed by PCA; the number of components is chosen
# by the 'mle' setting, which is paired here with the full SVD solver.
reduce = Pipeline(
    steps=[
        ('transform', col),
        ('pca', PCA(svd_solver='full', n_components='mle')),
    ]
)
""" | |
Using 3-fold cross validation to find best parameters for logistic regression | |
""" | |
# Hyper-parameters to try for the logistic regression step.
lg_param_grid = {'penalty': ['l1', 'l2'],
                 'C': np.logspace(-4, 4, 3),
                 'solver': ['liblinear']}
kfold = StratifiedKFold(n_splits=3)

# Search over the complete preprocessing + model pipeline. Transforming
# X_train once up front and cross-validating only the classifier would fit
# the scaler and PCA on rows that later serve as validation folds, leaking
# information into the scores. With the pipeline inside the search, every
# fold re-fits the preprocessing on its own training portion.
search_pipe = Pipeline([('reduce', reduce), ('model', LogisticRegression())])
lr = GridSearchCV(search_pipe,
                  # Pipeline steps take parameters as '<step>__<param>'.
                  param_grid={'model__' + name: values
                              for name, values in lg_param_grid.items()},
                  scoring="accuracy", cv=kfold,
                  n_jobs=4, verbose=1)
lr.fit(X_train, y_train)

# Final wrapper for all transformations + best model. GridSearchCV re-fits
# the best candidate on the whole of X_train (refit defaults to True), so
# this pipeline is ready for prediction. The module-level `reduce` is only
# cloned by the search and stays unfitted.
final_pipe = lr.best_estimator_
clf_best = final_pipe.named_steps['model']
print(clf_best)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment