# LightGBM framework
############################################################################################
# [+] SETUP
############################################################################################
import numpy as np
import pandas as pd
import gc
import glob
import os
from contextlib import contextmanager
from datetime import date, datetime, timedelta
from tqdm import tqdm
from workalendar.europe import Netherlands, Belgium
import time
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold, TimeSeriesSplit
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import warnings
warnings.simplefilter("ignore", UserWarning)
pd.options.mode.chained_assignment = None # default='warn'
@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print(' Finished {} in {:.0f}s at {}'.format(
        title, time.time() - t0, datetime.now().replace(second=0, microsecond=0)))
############################################################################################
# [+] LOAD DATA
############################################################################################
def load_data():
    # Prepare Actual DF from shards
    # datapath = './data/actuals/'  # Combine actuals CSV shards into one file
    # files = glob.glob(os.path.join(datapath, '*.csv'))
    #
    # np_array_list = []
    # for file_ in tqdm(files):
    #     df = pd.read_csv(file_, index_col=None, header=0)
    #     np_array_list.append(df.as_matrix())
    #
    # comb_np_array = np.vstack(np_array_list)
    # actual_df = pd.DataFrame(comb_np_array)
    # del files, np_array_list, comb_np_array, df
    #
    # actual_df.columns = ['product_id', 'date', 'actual', 'on_stock', 'actual_raw', 'actual_intermediate']
    # actual_df['actual_raw'].fillna(actual_df['actual'], inplace=True)  # Create column with unsmoothed actuals
    # actual_df.drop(['actual', 'actual_intermediate'], axis=1, inplace=True)
    # # Check if we have only whole numbers to safely convert to int
    # np.array_equal(actual_df['actual_raw'].values, actual_df['actual_raw'].astype('int').values)
    # actual_df['product_id'] = actual_df['product_id'].astype('int')
    # actual_df['actual_raw'] = actual_df['actual_raw'].astype('int')
    # actual_df['on_stock'] = actual_df['on_stock'].astype('int')
    #
    # # Subset actuals to products with at least 365 on_stock observations
    # n_prods_before = actual_df['product_id'].nunique()
    # actual_counts = actual_df[['product_id', 'on_stock']].groupby('product_id').agg('sum')
    # actual_counts = actual_counts[actual_counts['on_stock'] > 364].copy().reset_index()
    # np.min(actual_counts['on_stock'].values)  # Double check, should be 365
    # actual_df = actual_df[actual_df['product_id'].isin(actual_counts['product_id'])].copy()
    # n_prods_after = actual_df['product_id'].nunique()
    # print('Removing {} of {} products with <365 on stock observations'.format(n_prods_before - n_prods_after, n_prods_before))
    #
    # # Save actuals
    # actual_df.to_hdf('./data/actual_df.h5', 'actual_df')
    # Clean up actuals data, remove oos samples and set datatypes
    # actual_df = pd.read_hdf('./data/actual_df.h5', 'actual_df')
    # actual_df = actual_df[actual_df['on_stock'] == 1].copy()
    # actual_df[['actual_raw']] = actual_df[['actual_raw']].astype('int16')
    # actual_df[['product_id']] = actual_df[['product_id']].astype('int32')
    # actual_df.reset_index(drop=True, inplace=True)
    # actual_df['date'] = pd.to_datetime(actual_df['date'])
    # actual_df.drop('on_stock', axis=1, inplace=True)
    # actual_df.to_hdf('./data/actual_df.h5', 'actual_df')
    # Read prepared actuals data
    print('[+] Reading actuals data ...')
    actual_df = pd.read_hdf('./data/actual_df.h5', 'actual_df')
    # Read product data
    print('[+] Reading product data ...')
    product_df = pd.read_csv('./data/product.csv')  # Load product information
    # Drop columns (product_class_id is constant, product_size_id is unimportant)
    product_df.drop(['product_class_id', 'product_size_id'], axis=1, inplace=True)
    return actual_df, product_df
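
# NOTE: the commented-out shard loader above relies on the deprecated
# DataFrame.as_matrix(). A minimal alternative sketch using pd.concat, assuming
# the same './data/actuals/' CSV shard layout; this helper is illustrative and
# is not called by the pipeline below:
def load_actual_shards(datapath='./data/actuals/'):
    files = glob.glob(os.path.join(datapath, '*.csv'))
    shards = [pd.read_csv(file_, index_col=None, header=0) for file_ in tqdm(files)]
    return pd.concat(shards, axis=0, ignore_index=True)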
############################################################################################
# [+] FEATURE ENGINEERING
############################################################################################
def process_features(actual_df, product_df):
    # Generate Date based features
    print('[+] Generating seasonality features ...')
    actual_df['date'] = pd.to_datetime(actual_df['date'])
    actual_df['year'] = actual_df['date'].dt.year
    actual_df['quarter'] = actual_df['date'].dt.quarter
    actual_df['month'] = actual_df['date'].dt.month
    actual_df['weekday'] = actual_df['date'].dt.weekday
    actual_df['dayofmonth'] = actual_df['date'].dt.day
    actual_df['weekofyear'] = actual_df['date'].dt.week
    actual_df['dayofyear'] = actual_df['date'].dt.dayofyear
    # Generate Holiday / Event Features
    print('[+] Generating holiday & event features ...')
    holiday_nl = Netherlands()  # Create holiday objects from workalendar
    holiday_be = Belgium()
    years_in_df = np.unique(actual_df['year'])
    holiday_arr_nl = []
    holiday_arr_be = []
    for year in years_in_df:  # Generate holidays based on years present in the data
        holiday_arr_nl.append(holiday_nl.holidays(year))
        holiday_arr_be.append(holiday_be.holidays(year))
    # Combine the generated holidays per year into one frame; concatenating the
    # full lists works for any number of years, unlike appending the first
    # three entries explicitly
    holiday_df_nl = pd.concat([pd.DataFrame(arr) for arr in holiday_arr_nl])
    holiday_df_be = pd.concat([pd.DataFrame(arr) for arr in holiday_arr_be])
    holiday_df_nl.columns = ['date', 'holiday_nl']  # Set columns
    holiday_df_be.columns = ['date', 'holiday_be']
    holiday_df = pd.merge(holiday_df_nl, holiday_df_be, on=[
        'date'], how='outer')  # Combine NL and BE holidays
    holiday_df['date'] = pd.to_datetime(holiday_df['date'])
    # Encode different holidays as categories
    # holiday_df[['holiday_nl', 'holiday_be']] = holiday_df[['holiday_nl', 'holiday_be']].fillna(value='None')
    # holiday_df['holiday_nl'] = holiday_df['holiday_nl'].astype('category')
    # holiday_df['holiday_be'] = holiday_df['holiday_be'].astype('category')
    # holiday_df['holiday_nl_cat'] = holiday_df['holiday_nl'].cat.codes
    # holiday_df['holiday_be_cat'] = holiday_df['holiday_be'].cat.codes
    # Black friday (Only 2017 for now, nothing much happened in 2016)
    # holiday_df = holiday_df.append({'date': '2017-11-24', 'holiday_nl': 'Black Friday', 'holiday_be': 'Black Friday'}, ignore_index=True)
    # holiday_df['date'] = pd.to_datetime(holiday_df['date'])
    # holiday_df.tail(3)
    # Encode all holidays as a single holiday dummy
    holiday_df[['holiday_nl', 'holiday_be']] = holiday_df[[
        'holiday_nl', 'holiday_be']].notnull().astype('int8')
    # Merge generated features into matrix
    print('[+] Merging features into one dataframe ...')
    demand_df = pd.merge(actual_df, product_df, on='product_id', how='left')
    demand_df = pd.merge(demand_df, holiday_df, on='date', how='left')
    # Set Non Holidays to 0
    demand_df[['holiday_nl', 'holiday_be']] = demand_df[[
        'holiday_nl', 'holiday_be']].fillna(value=0).astype('int8')
    # Add shifted holiday features. The shifts are computed on a daily calendar
    # and merged back by date; shifting the merged demand_df row-wise would
    # move flags across products that share a date instead of across days
    calendar_df = pd.DataFrame({'date': pd.date_range(
        demand_df['date'].min(), demand_df['date'].max(), freq='D')})
    calendar_df = pd.merge(calendar_df, holiday_df, on='date', how='left')
    calendar_df[['holiday_nl', 'holiday_be']] = calendar_df[[
        'holiday_nl', 'holiday_be']].fillna(value=0).astype('int8')
    for shift in [1, 2, 3, 4, 5]:
        for country in ['nl', 'be']:
            calendar_df['holiday_{}_min_{}'.format(country, shift)] = calendar_df[
                'holiday_{}'.format(country)].shift(-shift).fillna(value=0).astype('int8')
            calendar_df['holiday_{}_plus_{}'.format(country, shift)] = calendar_df[
                'holiday_{}'.format(country)].shift(shift).fillna(value=0).astype('int8')
    demand_df = pd.merge(demand_df, calendar_df.drop(
        ['holiday_nl', 'holiday_be'], axis=1), on='date', how='left')
    del holiday_df, calendar_df
    # Generate Aggregation Features
    print('[+] Generating all-time product aggregation features ...')
    num_aggregations = {
        'actual_raw': ['max', 'mean', 'median', 'sum', 'std']}
    full_agg_df = demand_df.groupby('product_id').agg(num_aggregations)
    full_agg_df.columns = ['_'.join(agg_feature) for agg_feature in full_agg_df.columns.ravel()]
    full_agg_df.reset_index(drop=False, inplace=True)
    print('[+] Generating monthly product aggregation features ...')
    month_agg_df = demand_df.groupby(['product_id', 'month']).agg(num_aggregations)
    month_agg_df.columns = ['_month_'.join(agg_feature)
                            for agg_feature in month_agg_df.columns.ravel()]
    month_agg_df.reset_index(drop=False, inplace=True)
    print('[+] Generating weekly product aggregation features ...')
    week_agg_df = demand_df.groupby(['product_id', 'weekofyear']).agg(num_aggregations)
    week_agg_df.columns = ['_week_'.join(agg_feature)
                           for agg_feature in week_agg_df.columns.ravel()]
    week_agg_df.reset_index(drop=False, inplace=True)
    print('[+] Generating monthly product_type aggregation features ...')
    pt_month_agg_df = demand_df.groupby(['product_type_id', 'month']).agg(num_aggregations)
    pt_month_agg_df.columns = ['_pt_month_'.join(agg_feature)
                               for agg_feature in pt_month_agg_df.columns.ravel()]
    pt_month_agg_df.reset_index(drop=False, inplace=True)
    # Add aggregation features to dataframe
    # Check if aggregation is done correctly and clean up
    # np.equal(agg_df.shape[0], demand_df['product_id'].nunique())
    demand_df = pd.merge(demand_df, full_agg_df, how='left', on='product_id')
    demand_df = pd.merge(demand_df, month_agg_df, how='left', on=['product_id', 'month'])
    demand_df = pd.merge(demand_df, week_agg_df, how='left', on=['product_id', 'weekofyear'])
    demand_df = pd.merge(demand_df, pt_month_agg_df, how='left', on=['product_type_id', 'month'])
    del full_agg_df, month_agg_df, week_agg_df, pt_month_agg_df
    gc.collect()
    # Generate Window Aggregation Features
    def get_timewindow(df, dt, minus, periods, freq='D'):  # Currently unused helper
        return df[pd.date_range(
            dt - timedelta(days=minus), periods=periods, freq=freq)]
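    # A minimal sketch (commented out) of per-product rolling-window demand
    # features to go with the banner above; the window sizes are assumptions,
    # and enabling the lines would add columns beyond the dtype lists below:
    # for window in [7, 28]:
    #     demand_df['actual_raw_roll_mean_{}'.format(window)] = (
    #         demand_df.sort_values('date')
    #                  .groupby('product_id')['actual_raw']
    #                  .transform(lambda s: s.rolling(window, min_periods=1).mean()))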
    # Set smallest datatypes possible to minimize RAM footprint
    print('[+] Setting datatypes ...')
    int8_features = [
        'quarter', 'month', 'weekday', 'dayofmonth', 'weekofyear',
        'holiday_nl', 'holiday_be']
    int16_features = [
        'year', 'dayofyear', 'actual_raw', 'team_id', 'subproduct_type_id',
        'actual_raw_max', 'actual_raw_median', 'actual_raw_sum',
        'actual_raw_month_max', 'actual_raw_month_median', 'actual_raw_month_sum',
        'actual_raw_week_max', 'actual_raw_week_median', 'actual_raw_week_sum',
        'actual_raw_pt_month_max', 'actual_raw_pt_month_median', 'actual_raw_pt_month_sum']
    float16_features = [
        'actual_raw_mean', 'actual_raw_std', 'actual_raw_month_mean', 'actual_raw_month_std',
        'actual_raw_week_mean', 'actual_raw_week_std',
        'actual_raw_pt_month_mean', 'actual_raw_pt_month_std']
    int32_features = [
        'product_id', 'product_type_id', 'brand_id', 'manufacturer_id',
        'product_group_id']
    demand_df[int8_features] = demand_df[int8_features].apply(
        lambda col: col.astype('int8'))
    demand_df[int16_features] = demand_df[int16_features].apply(
        lambda col: col.astype('int16'))
    demand_df[float16_features] = demand_df[float16_features].apply(
        lambda col: col.astype('float16'))
    demand_df[int32_features] = demand_df[int32_features].apply(
        lambda col: col.astype('int32'))
    # Write processed data to disk
    print('[+] Writing processed dataframe to disk ...')
    demand_df.sort_values(by='date', inplace=True, ascending=True)
    demand_df.to_hdf('./data/demand_df.h5', 'demand_df')
    return demand_df
############################################################################################
# [+] LOAD PREPARED DATA
############################################################################################
def load_prepared_data():
    print('[+] Loading prepared data from disk ...')
    df = pd.read_hdf('./data/demand_df.h5', 'demand_df')
    df.reset_index(drop=True, inplace=True)
    print('\n Prepared data has {} samples, {} unique products and {} features'.
          format(df.shape[0], df['product_id'].nunique(), df.shape[1]))
    print('\n Included features: {}'.format(df.columns.values))
    gc.collect()
    return df
############################################################################################
# [+] SUBSET DATA
############################################################################################
def subset_data(df, n_days=731):
    # Drop undesired columns
    print('[+] Excluding features for modelling ...')
    drop_features = ['team_id', 'manufacturer_id',
                     'product_group_id', 'holiday_nl', 'holiday_be']
    df.drop(drop_features, axis=1, inplace=True)
    print('Dropped the following features from the matrix: {}'.format(drop_features))
    # Drop undesired samples
    print('[+] Subsetting to {} days of data ...'.format(n_days))
    first_train_date = np.max(df['date']) - timedelta(n_days)
    df = df[df['date'] >= first_train_date].copy()
    df.sort_values(by='date', inplace=True, ascending=True)
    print('Data now ranges from {} to {}'.format(
        df['date'].min().date(), df['date'].max().date()))
    return df
############################################################################################
# [+] LightGBM BOOSTING
############################################################################################
def train_model(df, debug):
    print('[+] Setting up LightGBM ...')
    target = df['actual_raw'].copy()
    if debug:  # Only use a small subset for fast debugging
        df = df.tail(1000).copy()
        target = target.tail(1000).copy()
    features = [f for f in df.columns if f not in ['date', 'actual_raw']]
    # All aggregation features (actual_raw_*) are numeric; everything else
    # (ids, calendar parts and holiday dummies) is treated as categorical
    num_features = [f for f in features if f.startswith('actual_raw')]
    cat_features = [cf for cf in features if cf not in num_features]
    num_folds = 10
    # Keep out-of-fold predictions as floats; an int16 buffer would silently
    # truncate the regression output before scoring
    oof_preds = np.zeros(df.shape[0], dtype='float32')
    kfolds = KFold(shuffle=True, n_splits=num_folds, random_state=2018)
    feature_importance_df = pd.DataFrame()
    params = {
        'nthread': -1,
        'boosting_type': 'gbdt',
        'objective': 'mean_squared_error',
        'learning_rate': 0.05,
        'num_leaves': 32,
        'max_depth': 32,
        'subsample': 0.8,
        'verbose': -1}
    for n_fold, (train_idx, valid_idx) in enumerate(kfolds.split(df)):
        train_x, train_y = df[features].iloc[train_idx], target.iloc[train_idx]
        valid_x, valid_y = df[features].iloc[valid_idx], target.iloc[valid_idx]
        lgb_train = lgb.Dataset(
            train_x, train_y, feature_name=features, categorical_feature=cat_features)
        lgb_valid = lgb.Dataset(
            valid_x, valid_y, feature_name=features, categorical_feature=cat_features)
        gc.collect()
        print('[+] Started training fold {} at {}'.format(
            n_fold + 1, datetime.now().replace(second=0, microsecond=0)))
        booster = lgb.train(
            params,
            lgb_train,
            10000,
            valid_sets=[lgb_train, lgb_valid],
            categorical_feature=cat_features,
            early_stopping_rounds=100,
            verbose_eval=100)
        oof_preds[valid_idx] = booster.predict(
            valid_x, num_iteration=booster.best_iteration)
        print('Fold {} Validation MSE: {:.5}'.format(
            n_fold + 1,
            metrics.mean_squared_error(valid_y, oof_preds[valid_idx])))
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = features
        fold_importance_df['gain'] = booster.feature_importance(
            importance_type='gain')
        fold_importance_df['split'] = booster.feature_importance(
            importance_type='split')
        fold_importance_df['fold'] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        del train_x, train_y, valid_x, valid_y, lgb_train, lgb_valid
    return df, target, oof_preds, num_folds, features, params, kfolds, feature_importance_df, booster
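
# NOTE: KFold(shuffle=True) mixes past and future rows within each training
# fold, which can leak future information on time-ordered demand data. A
# hedged alternative sketch using the TimeSeriesSplit imported above (the
# fold count is an assumption); the body of the training loop stays the same:
# tscv = TimeSeriesSplit(n_splits=10)
# for n_fold, (train_idx, valid_idx) in enumerate(tscv.split(df)):
#     ...  # df is sorted by date, so each fold validates on later dates only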
############################################################################################
# [+] RESULTS
############################################################################################
def process_results(df, target, oof_preds, num_folds, features, params, kfolds, feature_importance_df, booster, runtag):
    # Metrics
    print('[+] Calculating metrics ...')
    overall_mse = metrics.mean_squared_error(target, oof_preds)
    overall_mae = metrics.mean_absolute_error(target, oof_preds)
    model_id = runtag + '_MSE_{:.5}_MAE_{:.5}'.format(overall_mse, overall_mae)
    print('Overall MSE: {:.5}, Overall MAE: {:.5}'.format(overall_mse, overall_mae))
    # Compare to baseline WA model
    forecast_df = pd.read_hdf('./data/wa_forecast_df.h5')
    forecast_df.columns = ['product_id', 'wa_forecast', 'date']
    forecast_df['wa_forecast'] = forecast_df['wa_forecast'] / 7  # Convert weekly forecast to daily
    df['lgbm_forecast'] = oof_preds
    forecast_df = pd.merge(forecast_df, df[['date', 'product_id', 'actual_raw', 'lgbm_forecast']],
                           how='inner', on=['date', 'product_id'])
    # Feature importances
    print('[+] Saving feature importance plots to disk ...')
    def display_importances(importance_df, imp_type='split'):
        cols = importance_df[['feature', imp_type]].groupby(
            'feature').mean().sort_values(by=imp_type, ascending=False).index
        best_features = importance_df.loc[importance_df.feature.isin(cols)]
        plt.figure(figsize=(12, 12))
        sns.barplot(
            y='feature',
            x=imp_type,
            data=best_features.sort_values(by=imp_type, ascending=False))
        plt.title(
            '{} mean {} importance (Over folds + Std dev)'.format(model_id, imp_type))
        plt.tight_layout()
        plt.savefig('./fig/{}_imp_{}.png'.format(model_id, imp_type))
        return plt.gcf()  # Return the figure so the plots can be reused by the caller
    split_plot = display_importances(feature_importance_df, imp_type='split')
    gain_plot = display_importances(feature_importance_df, imp_type='gain')
    # Write results to log
    # Used to calculate metrics over different windows through time
    tsfolds = TimeSeriesSplit(n_splits=20)
    print('[+] Writing model info and results to log ...')
    with open("LGBM_log.txt", "a+") as text_file:
        print("\n[++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++]", file=text_file)
        print(" Model ID: {}".format(model_id), file=text_file)
        print("\n Date Range: from {} to {}".format(
            df['date'].min().date(), df['date'].max().date()), file=text_file)
        print("\n Metrics \n Overall MSE: {:.5}, Overall MAE: {:.5}".format(
            overall_mse, overall_mae), file=text_file)
        print("\n Parameters: \n {}".format(params), file=text_file)
        print("\n Features: \n {}".format(features), file=text_file)
        print("\n Cross-Validation: \n {}".format(kfolds), file=text_file)
        print("\n Date Range: from {} to {}".format(
            forecast_df['date'].min().date(), forecast_df['date'].max().date()), file=text_file)
        for n_fold, (train_idx, valid_idx) in tqdm(enumerate(kfolds.split(forecast_df))):
            print('K-Fold {} LGBM OOF MSE: {:.5}'.format(
                n_fold + 1, metrics.mean_squared_error(
                    forecast_df['actual_raw'].iloc[valid_idx],
                    forecast_df['lgbm_forecast'].iloc[valid_idx])), file=text_file)
        print("\n", file=text_file)
        for n_fold, (train_idx, valid_idx) in tqdm(enumerate(kfolds.split(forecast_df))):
            print('K-Fold {} LGBM OOF MAE: {:.5}'.format(
                n_fold + 1, metrics.mean_absolute_error(
                    forecast_df['actual_raw'].iloc[valid_idx],
                    forecast_df['lgbm_forecast'].iloc[valid_idx])), file=text_file)
        print("\n", file=text_file)
        for n_fold, (train_idx, valid_idx) in tqdm(enumerate(kfolds.split(forecast_df))):
            print('K-Fold {} WA OOF MSE: {:.5f}'.format(
                n_fold + 1, metrics.mean_squared_error(
                    forecast_df['actual_raw'].iloc[valid_idx],
                    forecast_df['wa_forecast'].iloc[valid_idx])), file=text_file)
        print("\n", file=text_file)
        for n_fold, (train_idx, valid_idx) in tqdm(enumerate(kfolds.split(forecast_df))):
            print('K-Fold {} WA OOF MAE: {:.5f}'.format(
                n_fold + 1, metrics.mean_absolute_error(
                    forecast_df['actual_raw'].iloc[valid_idx],
                    forecast_df['wa_forecast'].iloc[valid_idx])), file=text_file)
        print("\n Fold Comparison: {}".format(tsfolds), file=text_file)
        print("\n Date Range: from {} to {}".format(
            forecast_df['date'].min().date(), forecast_df['date'].max().date()), file=text_file)
        for n_fold, (train_idx, valid_idx) in tqdm(enumerate(tsfolds.split(forecast_df))):
            print('TS-Fold {} from {} to {} LGBM MSE: {:.5} WA MSE: {:.5}'.format(
                n_fold + 1,
                forecast_df['date'].iloc[valid_idx].min().date(),
                forecast_df['date'].iloc[valid_idx].max().date(),
                metrics.mean_squared_error(
                    forecast_df['actual_raw'].iloc[valid_idx], forecast_df['lgbm_forecast'].iloc[valid_idx]),
                metrics.mean_squared_error(
                    forecast_df['actual_raw'].iloc[valid_idx], forecast_df['wa_forecast'].iloc[valid_idx])), file=text_file)
            print('TS-Fold {} from {} to {} LGBM MAE: {:.5f} WA MAE: {:.5f}'.format(
                n_fold + 1,
                forecast_df['date'].iloc[valid_idx].min().date(),
                forecast_df['date'].iloc[valid_idx].max().date(),
                metrics.mean_absolute_error(
                    forecast_df['actual_raw'].iloc[valid_idx], forecast_df['lgbm_forecast'].iloc[valid_idx]),
                metrics.mean_absolute_error(
                    forecast_df['actual_raw'].iloc[valid_idx], forecast_df['wa_forecast'].iloc[valid_idx])), file=text_file)
        print("[++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++]\n", file=text_file)
    return forecast_df, split_plot, gain_plot
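
# NOTE: the booster from the final fold is returned but never persisted; a
# minimal sketch using LightGBM's native serialization (the path is an
# assumption, not part of the original workflow):
# booster.save_model('./models/{}.txt'.format(runtag))
# booster = lgb.Booster(model_file='./models/{}.txt'.format(runtag))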
############################################################################################
# [+] lightGBM RUNNER
############################################################################################
# Setup
runtag = 'LGBM_{}'.format(datetime.now().strftime('%Y-%m-%d_%H%M'))  # Filesystem-safe timestamp (no colons)
debug = False
# Prepare and save data to disk
with timer('loading data'):
    actual_df, product_df = load_data()
gc.collect()
with timer('processing features'):
    df = process_features(actual_df, product_df)
    del actual_df, product_df
gc.collect()
# Load prepared data and train model
with timer('\n loading prepared data'):
    df = load_prepared_data()
gc.collect()
with timer('\n subsetting data'):
    df = subset_data(df, n_days=731)
gc.collect()
with timer('\n training model'):
    df, target, oof_preds, num_folds, features, params, kfolds, feature_importance_df, booster = train_model(
        df, debug)
gc.collect()
with timer('\n processing results'):
    forecast_df, split_plot, gain_plot = process_results(
        df, target, oof_preds, num_folds, features, params, kfolds, feature_importance_df, booster, runtag)
gc.collect()
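
# NOTE: the runner above executes on import; a hedged sketch of the usual
# guard if this file is ever imported as a module (main() is a hypothetical
# wrapper for the steps above, not defined in this script):
# if __name__ == '__main__':
#     main()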