Created
September 20, 2018 13:53
-
-
Save marnixkoops/0002f46c1f92244d435e0acd0d3e6545 to your computer and use it in GitHub Desktop.
LightGBM framework
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ############################################################################################ | |
| # [+] SETUP | |
| ############################################################################################ | |
| import numpy as np | |
| import pandas as pd | |
| import gc | |
| import glob | |
| import os | |
| from contextlib import contextmanager | |
| from datetime import date, datetime, timedelta | |
| from tqdm import tqdm | |
| from workalendar.europe import Netherlands, Belgium | |
| import time | |
| import lightgbm as lgb | |
| from sklearn.model_selection import KFold, StratifiedKFold, TimeSeriesSplit | |
| from sklearn import metrics | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| sns.set_style('darkgrid') | |
| import warnings | |
| warnings.simplefilter("ignore", UserWarning) | |
| pd.options.mode.chained_assignment = None # default='warn' | |
@contextmanager
def timer(title):
    """Time a code section and print its duration when the block exits.

    Args:
        title: label to include in the printed summary line.
    """
    start = time.time()
    yield
    elapsed = time.time() - start
    # Minute-resolution timestamp keeps the log line compact
    stamp = datetime.now().replace(second=0, microsecond=0)
    print(' Finished {} in {:.0f}s at {}'.format(title, elapsed, stamp))
| ############################################################################################ | |
| # [+] LOAD DATA | |
| ############################################################################################ | |
def load_data():
    """Load the prepared actuals and product metadata from disk.

    Returns:
        tuple: ``(actual_df, product_df)`` where ``actual_df`` is the
        per-product daily demand actuals read from './data/actual_df.h5'
        and ``product_df`` is product metadata read from
        './data/product.csv' with constant/unimportant columns dropped.

    Note:
        The one-off preparation steps that originally produced
        './data/actual_df.h5' (combining CSV shards via the removed
        `DataFrame.as_matrix()` API, filtering to products with >= 365
        on-stock days, dtype downcasting) existed here only as dead
        commented-out code and have been removed; rerun the data-prep
        pipeline if the HDF file is missing.
    """
    # Read prepared actuals data
    print('[+] Reading actuals data ...')
    actual_df = pd.read_hdf('./data/actual_df.h5', 'actual_df')
    # Read product data
    print('[+] Reading product data ...')
    product_df = pd.read_csv('./data/product.csv')  # Load product information
    # Drop columns (product_class_id is constant, product_size_id is unimportant)
    product_df.drop(['product_class_id', 'product_size_id'], axis=1, inplace=True)
    return actual_df, product_df
| ############################################################################################ | |
| # [+] FEATURE ENGINEERING | |
| ############################################################################################ | |
def process_features(actual_df, product_df):
    """Engineer calendar, holiday and aggregation features into one matrix.

    Args:
        actual_df: per product/date demand actuals; must contain at least
            'product_id', 'date' and 'actual_raw' (mutated in place with the
            derived date columns).
        product_df: product metadata keyed on 'product_id', containing at
            least 'product_type_id', 'team_id', 'subproduct_type_id',
            'brand_id', 'manufacturer_id' and 'product_group_id'.

    Returns:
        The merged, dtype-downcast ``demand_df`` feature matrix, which is
        also persisted to './data/demand_df.h5'.
    """
    # Date-based seasonality features
    print('[+] Generating seasonality features ...')
    actual_df['date'] = pd.to_datetime(actual_df['date'])
    actual_df['year'] = actual_df['date'].dt.year
    actual_df['quarter'] = actual_df['date'].dt.quarter
    actual_df['month'] = actual_df['date'].dt.month
    actual_df['weekday'] = actual_df['date'].dt.weekday
    actual_df['dayofmonth'] = actual_df['date'].dt.day
    # NOTE(review): Series.dt.week is deprecated/removed in newer pandas;
    # the replacement is .dt.isocalendar().week (dtype differs — verify
    # before upgrading pandas).
    actual_df['weekofyear'] = actual_df['date'].dt.week
    actual_df['dayofyear'] = actual_df['date'].dt.dayofyear
    # Holiday / event features from workalendar for NL and BE
    print('[+] Generating holiday & event features ...')
    holiday_nl = Netherlands()
    holiday_be = Belgium()
    years_in_df = np.unique(actual_df['year'])
    # FIX: build one holiday frame per calendar year present in the data and
    # concatenate them all. The original hard-coded indices [0], [1] and [2],
    # which raised an IndexError with fewer than three years of data and
    # silently dropped holidays beyond the third year. Using pd.concat also
    # avoids the deprecated DataFrame.append API.
    holiday_df_nl = pd.concat(
        [pd.DataFrame(holiday_nl.holidays(year)) for year in years_in_df],
        ignore_index=True)
    holiday_df_be = pd.concat(
        [pd.DataFrame(holiday_be.holidays(year)) for year in years_in_df],
        ignore_index=True)
    holiday_df_nl.columns = ['date', 'holiday_nl']  # Set columns
    holiday_df_be.columns = ['date', 'holiday_be']
    holiday_df = pd.merge(holiday_df_nl, holiday_df_be, on=[
        'date'], how='outer')  # Combine NL and BE holidays
    holiday_df['date'] = pd.to_datetime(holiday_df['date'])
    # Encode all holidays as a single 0/1 dummy per country (individual
    # holiday categories were deliberately not encoded)
    holiday_df[['holiday_nl', 'holiday_be']] = holiday_df[[
        'holiday_nl', 'holiday_be']].notnull().astype('int8')
    # Merge generated features into one matrix
    print('[+] Merging features into one dataframe ...')
    demand_df = pd.merge(actual_df, product_df, on='product_id', how='left')
    demand_df = pd.merge(demand_df, holiday_df, on='date', how='left')
    del holiday_df
    # Dates without a holiday get 0
    demand_df[['holiday_nl', 'holiday_be']] = demand_df[[
        'holiday_nl', 'holiday_be']].fillna(value=0).astype('int8')
    # Shifted holiday dummies to capture pre-/post-holiday demand effects.
    # NOTE(review): shift() moves over raw row order across ALL products,
    # not per product or per date — presumably a per-date shift was
    # intended; verify before relying on these features.
    for shift in [1, 2, 3, 4, 5]:
        demand_df['holiday_nl_min_{}'.format(
            shift)] = demand_df['holiday_nl'].shift(-shift).fillna(value=0).astype('int8')
        demand_df['holiday_nl_plus_{}'.format(
            shift)] = demand_df['holiday_nl'].shift(shift).fillna(value=0).astype('int8')
        demand_df['holiday_be_min_{}'.format(
            shift)] = demand_df['holiday_be'].shift(-shift).fillna(value=0).astype('int8')
        demand_df['holiday_be_plus_{}'.format(
            shift)] = demand_df['holiday_be'].shift(shift).fillna(value=0).astype('int8')
    # Aggregation features at several granularities
    print('[+] Generating all-time product aggregation features ...')
    num_aggregations = {
        'actual_raw': ['max', 'mean', 'median', 'sum', 'std']}
    full_agg_df = demand_df.groupby('product_id').agg(num_aggregations)
    full_agg_df.columns = ["_".join(agg_feature) for agg_feature in full_agg_df.columns.ravel()]
    full_agg_df.reset_index(drop=False, inplace=True)
    print('[+] Generating monthly product aggregation features ...')
    month_agg_df = demand_df.groupby(['product_id', 'month']).agg(num_aggregations)
    month_agg_df.columns = ["_month_".join(agg_feature)
                            for agg_feature in month_agg_df.columns.ravel()]
    month_agg_df.reset_index(drop=False, inplace=True)
    print('[+] Generating weekly product aggregation features ...')
    week_agg_df = demand_df.groupby(['product_id', 'weekofyear']).agg(num_aggregations)
    week_agg_df.columns = ["_week_".join(agg_feature)
                           for agg_feature in week_agg_df.columns.ravel()]
    week_agg_df.reset_index(drop=False, inplace=True)
    print('[+] Generating monthly product_type aggregation features ...')
    pt_month_agg_df = demand_df.groupby(['product_type_id', 'month']).agg(num_aggregations)
    pt_month_agg_df.columns = ["_pt_month_".join(agg_feature)
                               for agg_feature in pt_month_agg_df.columns.ravel()]
    pt_month_agg_df.reset_index(drop=False, inplace=True)
    # Join the aggregates back onto the row-level matrix
    demand_df = pd.merge(demand_df, full_agg_df, how='left', on='product_id')
    demand_df = pd.merge(demand_df, month_agg_df, how='left', on=['product_id', 'month'])
    demand_df = pd.merge(demand_df, week_agg_df, how='left', on=['product_id', 'weekofyear'])
    demand_df = pd.merge(demand_df, pt_month_agg_df, how='left', on=['product_type_id', 'month'])
    del full_agg_df, month_agg_df, week_agg_df, pt_month_agg_df
    gc.collect()
    # (An unused time-window helper that lived here was removed as dead code.)
    # Downcast to the smallest safe dtypes to minimize the RAM footprint
    print('[+] Setting datatypes ...')
    int8_features = [
        'quarter', 'month', 'weekday', 'dayofmonth', 'weekofyear',
        'holiday_nl', 'holiday_be']
    int16_features = [
        'year', 'dayofyear', 'actual_raw', 'team_id', 'subproduct_type_id', 'actual_raw_max', 'actual_raw_median', 'actual_raw_sum',
        'actual_raw_month_max', 'actual_raw_month_median', 'actual_raw_month_sum', 'actual_raw_week_max',
        'actual_raw_week_median', 'actual_raw_week_sum', 'actual_raw_pt_month_max', 'actual_raw_pt_month_median', 'actual_raw_pt_month_sum']
    float16_features = [
        'actual_raw_mean', 'actual_raw_std', 'actual_raw_month_mean', 'actual_raw_month_std', 'actual_raw_week_mean',
        'actual_raw_week_std', 'actual_raw_pt_month_mean', 'actual_raw_pt_month_std']
    int32_features = [
        'product_id', 'product_type_id', 'brand_id', 'manufacturer_id',
        'product_group_id']
    demand_df[int8_features] = demand_df[int8_features].apply(
        lambda col: col.astype('int8'))
    demand_df[int16_features] = demand_df[int16_features].apply(
        lambda col: col.astype('int16'))
    demand_df[float16_features] = demand_df[float16_features].apply(
        lambda col: col.astype('float16'))
    demand_df[int32_features] = demand_df[int32_features].apply(
        lambda col: col.astype('int32'))
    # Persist the processed matrix, sorted chronologically
    print('[+] Writing processed dataframe to disk ...')
    demand_df.sort_values(by='date', inplace=True, ascending=True)
    demand_df.to_hdf('./data/demand_df.h5', 'demand_df')
    return demand_df
| ############################################################################################ | |
| # [+] LOAD PREPARED DATA | |
| ############################################################################################ | |
def load_prepared_data():
    """Read the fully processed demand matrix from disk and summarize it.

    Returns:
        The prepared ``demand_df`` feature matrix with a fresh integer index.
    """
    print('[+] Loading prepared data from disk ...')
    demand_df = pd.read_hdf('./data/demand_df.h5', 'demand_df')
    demand_df.reset_index(drop=True, inplace=True)
    # Report the matrix dimensions and the distinct products it covers
    n_samples, n_features = demand_df.shape
    n_products = demand_df['product_id'].nunique()
    print('\n Prepared data has {} samples {} unique products and {} features'.
          format(n_samples, n_products, n_features))
    print('\n Included features: {}'.format(demand_df.columns.values))
    gc.collect()
    return demand_df
| ############################################################################################ | |
| # [+] SUBSET DATA | |
| ############################################################################################ | |
def subset_data(df, n_days=731):
    """Trim unwanted feature columns and restrict the data to recent history.

    Args:
        df: demand dataframe with a 'date' column plus the columns listed in
            ``drop_features`` (the drop mutates this frame in place).
        n_days: days before the latest observed date to keep (inclusive).

    Returns:
        A chronologically sorted copy containing only the last ``n_days``
        days of observations.
    """
    # Remove features we do not want the model to see
    print('[+] Excluding features for modelling ...')
    drop_features = ['team_id', 'manufacturer_id',
                     'product_group_id', 'holiday_nl', 'holiday_be']
    df.drop(drop_features, axis=1, inplace=True)
    print('Dropped the following features from the matrix: {} '.format(drop_features))
    # Keep only the most recent n_days of observations
    print('[+] Subsetting to {} days of data ...'.format(n_days))
    cutoff = np.max(df['date']) - timedelta(n_days)
    recent = df.loc[df['date'] >= cutoff].copy()
    recent.sort_values(by='date', inplace=True, ascending=True)
    first_day = recent['date'].min().replace(second=0, microsecond=0).date()
    last_day = recent['date'].max().replace(second=0, microsecond=0).date()
    print('Data now ranges from {} to {}'.format(first_day, last_day))
    return recent
| ############################################################################################ | |
| # [+] LightGBM BOOSTING | |
| ############################################################################################ | |
def train_model(df, debug):
    """Train a LightGBM regressor on demand with shuffled K-fold CV.

    Args:
        df: prepared demand dataframe; must contain 'date' and the target
            column 'actual_raw' plus the engineered feature columns.
        debug: when True, train on only the last 1000 rows for a fast dry run.

    Returns:
        Tuple ``(df, target, oof_preds, num_folds, features, params, kfolds,
        feature_importance_df, booster)`` where ``oof_preds`` holds the
        out-of-fold predictions aligned with the rows of ``df`` and
        ``booster`` is the model from the final fold only.
    """
    print('[+] Setting up LightGBM ...')
    target = df['actual_raw'].copy()
    if debug:  # Only use a small subset for fast debugging
        df = df.tail(1000).copy()
        target = target.tail(1000).copy()
    # Everything except the date and the target is a candidate feature
    features = [f for f in df.columns if f not in ['date', 'actual_raw']]
    num_features = ['actual_raw', 'actual_raw_max',
                    'actual_raw_mean', 'actual_raw_median', 'actual_raw_sum', 'actual_raw_std']
    cat_features = [cf for cf in features if cf not in num_features]
    num_folds = 10
    # FIX: keep out-of-fold predictions in float64. The original buffer was
    # int16, which truncated LightGBM's float predictions (and could
    # overflow for large demand values), corrupting every validation metric
    # computed from them downstream.
    oof_preds = np.zeros(df.shape[0])
    # NOTE(review): shuffled KFold on time-ordered data leaks future
    # information into training folds — presumably intentional here (a
    # TimeSeriesSplit comparison happens in process_results); verify.
    kfolds = KFold(shuffle=True, n_splits=num_folds, random_state=2018)
    feature_importance_df = pd.DataFrame()
    params = {
        'nthread': -1,
        'boosting_type': 'gbdt',
        'objective': 'mean_squared_error',
        'learning_rate': 0.05,
        'num_leaves': 32,
        'max_depth': 32,
        'subsample': 0.8,
        'verbose': -1}
    for n_fold, (train_idx, valid_idx) in enumerate(kfolds.split(df)):
        train_x, train_y = df[features].iloc[train_idx], target.iloc[train_idx]
        valid_x, valid_y = df[features].iloc[valid_idx], target.iloc[valid_idx]
        lgb_train = lgb.Dataset(
            train_x, train_y, feature_name=features, categorical_feature=cat_features)
        lgb_valid = lgb.Dataset(
            valid_x, valid_y, feature_name=features, categorical_feature=cat_features)
        gc.collect()
        print('[+] Started training fold {} at {}'.format(
            n_fold + 1, datetime.now().replace(second=0, microsecond=0)))
        booster = lgb.train(
            params,
            lgb_train,
            10000,
            valid_sets=[lgb_train, lgb_valid],
            categorical_feature=cat_features,
            early_stopping_rounds=100,
            verbose_eval=100)
        # Predict the held-out fold with the best early-stopped iteration;
        # these slots form the out-of-fold estimate for the whole frame.
        oof_preds[valid_idx] = booster.predict(
            valid_x, num_iteration=booster.best_iteration)
        print('Fold {} Validation MSE: {:.5}'.format(
            n_fold + 1,
            metrics.mean_squared_error(valid_y,
                                       oof_preds[valid_idx])))
        # Collect per-fold gain and split importances for later plotting
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = features
        fold_importance_df['gain'] = booster.feature_importance(
            importance_type='gain')
        fold_importance_df['split'] = booster.feature_importance(
            importance_type='split')
        fold_importance_df['fold'] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        del train_x, train_y, valid_x, valid_y, lgb_train, lgb_valid
    return df, target, oof_preds, num_folds, features, params, kfolds, feature_importance_df, booster
| ############################################################################################ | |
| # [+] RESULTS | |
| ############################################################################################ | |
def process_results(df, target, oof_preds, num_folds, features, params, kfolds, feature_importance_df, booster, runtag):
    """Score OOF predictions, save importance plots, and append a detailed
    comparison against the baseline WA forecast to 'LGBM_log.txt'.

    Args:
        df: training dataframe with 'date', 'product_id' and 'actual_raw'
            (a 'lgbm_forecast' column is added in place).
        target: true demand values aligned with ``oof_preds``.
        oof_preds: out-of-fold predictions from train_model.
        num_folds, booster: passed through unused in this function body.
        features, params, kfolds, feature_importance_df: training artifacts
            logged / plotted below.
        runtag: run identifier embedded in the model id and file names.

    Returns:
        (forecast_df, split_plot, gain_plot). NOTE: display_importances has
        no return statement, so split_plot and gain_plot are always None.
    """
    # Overall out-of-fold metrics over the whole training frame
    print('[+] Calculating metrics ...')
    overall_mse = metrics.mean_squared_error(target, oof_preds)
    overall_mae = metrics.mean_absolute_error(target, oof_preds)
    # Model id doubles as a human-readable scoreboard entry and figure name
    model_id = runtag + '_MSE_{:.5}_MAE_{:.5}'.format(overall_mse, overall_mae)
    print("Overall MSE: {:.5}, Overall MAE: {:.5}".format(overall_mse, overall_mae))
    # Compare to baseline WA model: weekly WA forecast scaled to daily (/7),
    # inner-joined on (date, product_id) with the LGBM OOF forecast
    forecast_df = pd.read_hdf('./data/wa_forecast_df.h5')
    forecast_df.columns = ['product_id', 'wa_forecast', 'date']
    forecast_df['wa_forecast'] = forecast_df['wa_forecast'] / 7
    df['lgbm_forecast'] = oof_preds
    forecast_df = pd.merge(forecast_df, df[['date', 'product_id', 'actual_raw', 'lgbm_forecast']],
                           how='inner', on=['date', 'product_id'])
    # Feature importances (one PNG per importance type)
    print('[+] Saving feature importance plots to disk ...')

    # NOTE(review): the `df=feature_importance_df` default aliases the outer
    # frame, and `cols`/`model_id` are read from the enclosing scope; the
    # function only saves a figure and returns None.
    def display_importances(df=feature_importance_df, type='split', runtag=runtag):
        # Order features by their mean importance across folds
        cols = feature_importance_df[[
            'feature', '{}'.format(type)
        ]].groupby('feature').mean().sort_values(
            by='{}'.format(type), ascending=False).index
        best_features = df.loc[df.feature.isin(cols)]
        plt.figure(figsize=(12, 12))
        sns.barplot(
            y='feature',
            x=type,
            data=best_features.sort_values(by=type, ascending=False))
        plt.title(
            '{} mean {} importance (Over folds + Std dev)'.format(model_id, type))
        plt.tight_layout()
        plt.savefig('./fig/{}_imp_{}.png'.format(model_id, type))
    split_plot = display_importances(feature_importance_df, type='split')
    gain_plot = display_importances(feature_importance_df, type='gain')
    # Write results to log
    # Used to calculate metrics over different windows through time
    tsfolds = TimeSeriesSplit(n_splits=20)
    print('[+] Writing model info and results to log ...')
    # Append-mode log: each run adds one delimited entry with the run's
    # configuration, overall metrics, and per-fold LGBM-vs-WA comparisons.
    with open("LGBM_log.txt", "a+") as text_file:
        print("\n[++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++]", file=text_file)
        print(" Model ID: {}".format(model_id), file=text_file)
        print("\n Date Range: from {} to {}".format(df['date'].min().replace(second=0, microsecond=0).date(),
                                                    df['date'].max().replace(second=0, microsecond=0).date()), file=text_file)
        print("\n Metrics \n Overall MSE: {:.5}, Overall MAE: {:.5}".format(
            overall_mse, overall_mae), file=text_file)
        print("\n Parameters: \n {}".format(params), file=text_file)
        print("\n Features: \n {}".format(features), file=text_file)
        print("\n Cross-Validation: \n {}".format(kfolds), file=text_file)
        print("\n Date Range: from {} to {}".format(forecast_df['date'].min().replace(second=0, microsecond=0).date(),
                                                    forecast_df['date'].max().replace(second=0, microsecond=0).date()), file=text_file)
        # NOTE(review): kfolds is re-split over forecast_df here, which can
        # have a different length than the df used in training, so these
        # folds need not match the training folds — verify intent.
        for n_fold, (train_idx, valid_idx) in tqdm(enumerate(kfolds.split(forecast_df))):
            print('K-Fold {} LGBM OOF MSE: {:.5}'.format(n_fold + 1,
                                                         metrics.mean_squared_error(forecast_df['actual_raw'].iloc[valid_idx],
                                                                                    forecast_df['lgbm_forecast'].iloc[valid_idx])), file=text_file)
        print("\n", file=text_file)
        for n_fold, (train_idx, valid_idx) in tqdm(enumerate(kfolds.split(forecast_df))):
            print('K-Fold {} LGBM OOF MAE: {:.5}'.format(n_fold + 1,
                                                         metrics.mean_absolute_error(forecast_df['actual_raw'].iloc[valid_idx],
                                                                                     forecast_df['lgbm_forecast'].iloc[valid_idx])), file=text_file)
        print("\n", file=text_file)
        for n_fold, (train_idx, valid_idx) in tqdm(enumerate(kfolds.split(forecast_df))):
            print('K-Fold {} WA OOF MSE: {:.5f}'.format(n_fold + 1,
                                                        metrics.mean_squared_error(forecast_df['actual_raw'].iloc[valid_idx],
                                                                                   forecast_df['wa_forecast'].iloc[valid_idx])), file=text_file)
        print("\n", file=text_file)
        for n_fold, (train_idx, valid_idx) in tqdm(enumerate(kfolds.split(forecast_df))):
            print('K-Fold {} WA OOF MAE: {:.5f}'.format(n_fold + 1,
                                                        metrics.mean_absolute_error(forecast_df['actual_raw'].iloc[valid_idx],
                                                                                    forecast_df['wa_forecast'].iloc[valid_idx])), file=text_file)
        print("\n Fold Comparison: {}".format(tsfolds), file=text_file)
        print("\n Date Range: from {} to {}".format(forecast_df['date'].min().replace(second=0, microsecond=0).date(),
                                                    forecast_df['date'].max().replace(second=0, microsecond=0).date()), file=text_file)
        # Chronological windows: compare LGBM vs WA per time-series fold
        for n_fold, (train_idx, valid_idx) in tqdm(enumerate(tsfolds.split(forecast_df))):
            print('TS-Fold {} from {} to {} LGBM MSE: {:.5} WA MSE: {:.5}'.format(
                n_fold + 1, forecast_df['date'].iloc[valid_idx].min().replace(second=0,
                                                                              microsecond=0).date(),
                forecast_df['date'].iloc[valid_idx].max().replace(second=0, microsecond=0).date(),
                metrics.mean_squared_error(
                    forecast_df['actual_raw'].iloc[valid_idx], forecast_df['lgbm_forecast'].iloc[valid_idx]),
                metrics.mean_squared_error(forecast_df['actual_raw'].iloc[valid_idx], forecast_df['wa_forecast'].iloc[valid_idx])), file=text_file)
            print('TS-Fold {} from {} to {} LGBM MAE: {:.5f} WA MSE: {:.5f}'.format(
                n_fold + 1, forecast_df['date'].iloc[valid_idx].min().replace(second=0,
                                                                              microsecond=0).date(),
                forecast_df['date'].iloc[valid_idx].max().replace(second=0, microsecond=0).date(),
                metrics.mean_absolute_error(
                    forecast_df['actual_raw'].iloc[valid_idx], forecast_df['lgbm_forecast'].iloc[valid_idx]),
                metrics.mean_absolute_error(forecast_df['actual_raw'].iloc[valid_idx], forecast_df['wa_forecast'].iloc[valid_idx])), file=text_file)
        print("[++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++]\n", file=text_file)
    return forecast_df, split_plot, gain_plot
| ############################################################################################ | |
| # [+] lightGBM RUNNER | |
| ############################################################################################ | |
# Setup
# Tag this run with a minute-resolution timestamp used in log entries and
# figure file names
runtag = 'LGBM_{}'.format(datetime.now().replace(second=0, microsecond=0))
debug = False  # set True to train on only the last 1000 rows
# Prepare and save data to disk
with timer('loading data'):
    actual_df, product_df = load_data()
    gc.collect()
with timer('processing features'):
    df = process_features(actual_df, product_df)
    # Free the raw inputs; only the merged matrix is needed from here on
    del actual_df, product_df
    gc.collect()
# Load prepared data and train model
# (re-reads the matrix just written, so this section can be run standalone)
with timer('\n loading prepared data'):
    df = load_prepared_data()
    gc.collect()
with timer('\n subsetting data'):
    df = subset_data(df, n_days=731)
    gc.collect()
with timer('\n training model'):
    df, target, oof_preds, num_folds, features, params, kfolds, feature_importance_df, booster = train_model(
        df, debug)
    gc.collect()
with timer('\n processing results'):
    forecast_df, split_plot, gain_plot = process_results(
        df, target, oof_preds, num_folds, features, params, kfolds, feature_importance_df, booster, runtag)
    gc.collect()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment