gvyshnya · July 1, 2020 09:52
diff --git a/PdPipe Pipeline Example b/PdPipe Pipeline Example
 import numpy as np
 	import pandas as pd
 	import pdpipe as pdp
 	
 	# ... data reading code goes here
 	
 	# set up a transformation pipeline
    # Declare every transformation stage in one place; the stages are listed
    # in the order they are meant to run.
    pipeline_1 = pdp.PdPipeline([
        # Parse the coordinate columns with pd.to_numeric.
        # NOTE(review): pdpipe documents ApplyByCols as element-wise, so the
        # lambda presumably receives one cell value at a time - confirm.
        pdp.ApplyByCols(
            ['lat', 'lon', 'lat_inspection_location', 'lon_inspection_location'],
            lambda raw: pd.to_numeric(raw),
        ),
        # Rename the plain lat/lon columns to their *_quote_location names.
        pdp.ColRename({
            'lat': 'lat_quote_location',
            'lon': 'lon_quote_location',
        }),
        # Parse the timestamp columns and pass them through tz_localize(None).
        # TODO: need to check out timezone handling here.
        pdp.ApplyByCols(
            ['lead_created_at', 'booking_created', 'booking_appointment_date'],
            lambda raw: pd.to_datetime(raw).tz_localize(None),
        ),
        # mean_price: midpoint between min_price and max_price.
        pdp.ApplyToRows(
            lambda row: (row['min_price'] + row['max_price']) * 0.5,
            colname='mean_price',
        ),
        # distance_nan: result of pd.isna on 'distance'; the source column is
        # kept (drop=False).
        pdp.ApplyByCols(
            ['distance'],
            pd.isna,
            result_columns=['distance_nan'],
            drop=False,
        ),
        # Bin 'distance' on the given edges, keeping the source column.
        # The 'distance_bin' feature selected below is presumably produced by
        # this stage - verify against the pdpipe Bin documentation.
        pdp.Bin({'distance': [0, 2, 4, 10, 20, 50, 100]}, drop=False),
        # c_age: year of lead creation minus the 'year' column.
        pdp.ApplyToRows(
            lambda row: row['lead_created_at'].year - row['year'],
            colname='c_age',
        ),
    ])

    # Run the transformation pipeline over the dataframe.
    df = pipeline_1.apply(df)

    # Features that are relevant for ML.
    selected_cols = [
        'make',
        'model',
        'trim',
        'year',
        'c_age',
        'day_appointment_quote',
        'day_appointment_created',
        'mean_price',
        'distance_nan',
        'distance_bin',
        'booking_location',
        'city',
        'show_up',
    ]

    # Everything that is not a selected feature gets dropped.
    cols_to_drop = [col for col in df.columns if col not in selected_cols]

    pipeline_drop_non_ml_cols = pdp.ColDrop(cols_to_drop)
    df = pipeline_drop_non_ml_cols.apply(df)

    # One-hot encode the listed columns.
    list_dummie = [
        'distance_nan',
        'distance_bin',
        'make',
        'model',
        'trim',
        'booking_location',
        'city',
    ]
    pipeline_3 = pdp.OneHotEncode(list_dummie)

    XY_onehot = pipeline_3.apply(df)
	import numpy as np
	import pandas as pd
	import pdpipe as pdp

	# ... data reading code goes here

	# set up a transformation pipeline
	# Declare every transformation stage in one place; the stages are listed
	# in the order they are meant to run.
	pipeline_1 = pdp.PdPipeline([
		# Parse the coordinate columns with pd.to_numeric.
		# NOTE(review): pdpipe documents ApplyByCols as element-wise, so the
		# lambda presumably receives one cell value at a time - confirm.
		pdp.ApplyByCols(
			['lat', 'lon', 'lat_inspection_location', 'lon_inspection_location'],
			lambda raw: pd.to_numeric(raw),
		),
		# Rename the plain lat/lon columns to their *_quote_location names.
		pdp.ColRename({
			'lat': 'lat_quote_location',
			'lon': 'lon_quote_location',
		}),
		# Parse the timestamp columns and pass them through tz_localize(None).
		# TODO: need to check out timezone handling here.
		pdp.ApplyByCols(
			['lead_created_at', 'booking_created', 'booking_appointment_date'],
			lambda raw: pd.to_datetime(raw).tz_localize(None),
		),
		# mean_price: midpoint between min_price and max_price.
		pdp.ApplyToRows(
			lambda row: (row['min_price'] + row['max_price']) * 0.5,
			colname='mean_price',
		),
		# distance_nan: result of pd.isna on 'distance'; the source column is
		# kept (drop=False).
		pdp.ApplyByCols(
			['distance'],
			pd.isna,
			result_columns=['distance_nan'],
			drop=False,
		),
		# Bin 'distance' on the given edges, keeping the source column.
		# The 'distance_bin' feature selected below is presumably produced by
		# this stage - verify against the pdpipe Bin documentation.
		pdp.Bin({'distance': [0, 2, 4, 10, 20, 50, 100]}, drop=False),
		# c_age: year of lead creation minus the 'year' column.
		pdp.ApplyToRows(
			lambda row: row['lead_created_at'].year - row['year'],
			colname='c_age',
		),
	])

	# Run the transformation pipeline over the dataframe.
	df = pipeline_1.apply(df)

	# Features that are relevant for ML.
	selected_cols = [
		'make',
		'model',
		'trim',
		'year',
		'c_age',
		'day_appointment_quote',
		'day_appointment_created',
		'mean_price',
		'distance_nan',
		'distance_bin',
		'booking_location',
		'city',
		'show_up',
	]

	# Everything that is not a selected feature gets dropped.
	cols_to_drop = [col for col in df.columns if col not in selected_cols]

	pipeline_drop_non_ml_cols = pdp.ColDrop(cols_to_drop)
	df = pipeline_drop_non_ml_cols.apply(df)

	# One-hot encode the listed columns.
	list_dummie = [
		'distance_nan',
		'distance_bin',
		'make',
		'model',
		'trim',
		'booking_location',
		'city',
	]
	pipeline_3 = pdp.OneHotEncode(list_dummie)

	XY_onehot = pipeline_3.apply(df)