Created
July 1, 2020 09:52
-
-
Save gvyshnya/1db09b61267233d8a1dbf6b2bd1dcb3d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
import pdpipe as pdp

# ... data reading code goes here
# NOTE: `df` must be defined by the data-reading step above before the
# pipeline is applied.

# ---------------------------------------------------------------------------
# Stage 1: type coercion and feature engineering
# ---------------------------------------------------------------------------
# pdp.ApplyByCols applies its function element-wise (once per cell), so the
# callables below receive a single value rather than a whole column.

# Coerce the coordinate columns to numeric values.
pipeline_1 = pdp.ApplyByCols(
    ['lat', 'lon', 'lat_inspection_location', 'lon_inspection_location'],
    pd.to_numeric,
)

# Make it explicit that the bare lat/lon pair refers to the quote location.
pipeline_1 += pdp.ColRename(
    {'lat': 'lat_quote_location', 'lon': 'lon_quote_location'}
)

# Parse the timestamp columns and strip any timezone information.
# TODO: need to check out timezone handling -- tz_localize(None) discards the
# offset rather than converting to a common zone.
pipeline_1 += pdp.ApplyByCols(
    ['lead_created_at', 'booking_created', 'booking_appointment_date'],
    lambda value: pd.to_datetime(value).tz_localize(None),
)

# mean_price: midpoint of the quoted price range.
pipeline_1 += pdp.ApplyToRows(
    lambda row: (row['min_price'] + row['max_price']) * 0.5,
    colname='mean_price',
)

# distance_nan: flag rows whose distance is missing (original column kept).
pipeline_1 += pdp.ApplyByCols(
    ['distance'],
    pd.isna,
    result_columns=['distance_nan'],
    drop=False,
)

# Bin the distance by the given edges; drop=False keeps the raw column.
# The binned column is selected below under the name 'distance_bin'.
pipeline_1 += pdp.Bin({'distance': [0, 2, 4, 10, 20, 50, 100]}, drop=False)

# c_age: difference between the lead-creation year and the 'year' column.
pipeline_1 += pdp.ApplyToRows(
    lambda row: row['lead_created_at'].year - row['year'],
    colname='c_age',
)

# apply the transformation pipeline
df = pipeline_1.apply(df)

# ---------------------------------------------------------------------------
# Stage 2: keep only the ML-relevant features
# ---------------------------------------------------------------------------
selected_cols = [
    'make', 'model', 'trim', 'year', 'c_age', 'day_appointment_quote',
    'day_appointment_created', 'mean_price',
    'distance_nan', 'distance_bin', 'booking_location', 'city', 'show_up',
]

# Every column not explicitly selected is dropped from the ML-ready dataframe.
# Building the list from df.columns means a selected name that is absent from
# the frame is silently ignored rather than raising.
cols_to_drop = [col for col in df.columns if col not in selected_cols]

pipeline_drop_non_ml_cols = pdp.ColDrop(cols_to_drop)
df = pipeline_drop_non_ml_cols.apply(df)

# ---------------------------------------------------------------------------
# Stage 3: one-hot encode the categorical features
# ---------------------------------------------------------------------------
list_dummie = [
    'distance_nan', 'distance_bin', 'make', 'model', 'trim',
    'booking_location', 'city',
]
pipeline_3 = pdp.OneHotEncode(list_dummie)
XY_onehot = pipeline_3.apply(df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment