import featuretools as ft

def feature_matrix_from_entityset(es_dict, feature_defs):
    """Run deep feature synthesis from an entityset and feature definitions.
    Saves the feature matrix based on the partition number."""
    # Extract the entityset
    es = es_dict['es']
    # Calculate the feature matrix from the pre-defined features
    feature_matrix = ft.calculate_feature_matrix(feature_defs,
                                                 entityset=es)
    # Save keyed by partition number so partitions can be reassembled later
    # (assumes `es_dict` carries its partition number under the 'num' key)
    feature_matrix.to_csv('fm_p%d.csv' % es_dict['num'])
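
# Usage sketch (hypothetical, not from the original gist): assumes a list
# `partitions` of entityset dictionaries and `feature_defs` produced by an
# earlier ft.dfs(..., features_only=True) call.
for es_dict in partitions:
    feature_matrix_from_entityset(es_dict, feature_defs)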

import pandas as pd

def total_previous_month(numeric, datetime, time):
    """Return total of `numeric` column in the month prior to `time`."""
    df = pd.DataFrame({'value': numeric, 'time': datetime})
    previous_month = time.month - 1
    previous_year = time.year
    # Handle January by wrapping to December of the previous year
    if previous_month == 0:
        previous_month = 12
        previous_year = time.year - 1
    # Sum the values that fall in the previous month
    df = df[(df['time'].dt.month == previous_month) &
            (df['time'].dt.year == previous_year)]
    return df['value'].sum()
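
# Quick check of the helper (illustrative values, not from the original gist)
values = pd.Series([1.0, 2.0, 3.0])
times = pd.Series(pd.to_datetime(['2018-01-15', '2018-01-20', '2018-02-01']))
# The month prior to 2018-02-10 is January 2018, so this should print 3.0
print(total_previous_month(values, times, pd.Timestamp('2018-02-10')))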

import numpy as np

# Hyperparameter grid
param_grid = {
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'num_leaves': list(range(20, 150)),
    'learning_rate': list(np.logspace(np.log10(0.005), np.log10(0.5),
                                      base=10, num=1000)),
    'subsample_for_bin': list(range(20000, 300000, 20000)),
    'min_child_samples': list(range(20, 500, 5)),
    'reg_alpha': list(np.linspace(0, 1)),
}
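
import random

# One random-search draw from the grid above (a sketch of how the grid is
# consumed; the seed is an assumption, not from the original gist)
random.seed(50)
params = {key: random.sample(values, 1)[0]
          for key, values in param_grid.items()}
print(params)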

import lightgbm as lgb

def objective(hyperparameters, iteration):
    """Objective function for grid and random search. Returns
    the cross validation score from a set of hyperparameters."""
    # Number of estimators will be found using early stopping
    if 'n_estimators' in hyperparameters.keys():
        del hyperparameters['n_estimators']
    # Cross validation with early stopping (`train_set` is an lgb.Dataset
    # assumed to be built earlier); score is the best mean validation AUC
    cv_results = lgb.cv(hyperparameters, train_set, num_boost_round=10000,
                        nfold=10, early_stopping_rounds=100, metrics='auc', seed=42)
    score = cv_results['auc-mean'][-1]
    return [score, hyperparameters, iteration]
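
import random

# Hypothetical random search driver: evaluate random draws from `param_grid`
# with the objective above (the loop bookkeeping is assumed, not original code)
random.seed(50)
random_results = []
for i in range(100):
    hyperparameters = {key: random.sample(values, 1)[0]
                       for key, values in param_grid.items()}
    random_results.append(objective(hyperparameters, i))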

from hyperopt import fmin, tpe, Trials

MAX_EVALS = 500
# Trials object to record every evaluation of the search
bayes_trials = Trials()
# Optimize: minimize the objective over the space with Tree Parzen Estimation
best = fmin(fn=objective, space=space, algo=tpe.suggest,
            max_evals=MAX_EVALS, trials=bayes_trials)
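
# The trials object records every evaluation; sorting by loss puts the best
# first (assumes the Bayesian objective returns a dict with a 'loss' key,
# as hyperopt requires)
bayes_trials_results = sorted(bayes_trials.results, key=lambda x: x['loss'])
print(bayes_trials_results[:2])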

import ast
import pandas as pd
import lightgbm as lgb

# Read in results and sort with best on top
results = pd.read_csv('gbm_trials.csv')
results.sort_values('loss', ascending=True, inplace=True)
results.reset_index(inplace=True, drop=True)
# Extract the ideal number of estimators and hyperparameters by column name
# (the params column is read back as a string, so evaluate it into a dict)
best_bayes_estimators = int(results.loc[0, 'estimators'])
best_bayes_params = ast.literal_eval(results.loc[0, 'params'])
# Re-create the best model and train on the training data
best_bayes_model = lgb.LGBMClassifier(n_estimators=best_bayes_estimators,
                                      **best_bayes_params)
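
from sklearn.metrics import roc_auc_score

# Hypothetical final fit and evaluation; the train/test arrays are assumed
# from an earlier split, not defined in the original gist
best_bayes_model.fit(train_features, train_labels)
preds = best_bayes_model.predict_proba(test_features)[:, 1]
print('Test ROC AUC: {:.4f}'.format(roc_auc_score(test_labels, preds)))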

# Write to the csv file ('a' means append); this block is meant to run
# inside the objective function after each evaluation
of_connection = open(out_file, 'a')
writer = csv.writer(of_connection)
writer.writerow([loss, params, iteration, n_estimators, run_time])
of_connection.close()

import csv
# File to save first results
out_file = 'gbm_trials.csv'
of_connection = open(out_file, 'w')
writer = csv.writer(of_connection)
# Write the headers to the file
writer.writerow(['loss', 'params', 'iteration', 'estimators', 'train_time'])
of_connection.close()

from hyperopt.pyll.stochastic import sample

# Sample from the full space
example = sample(space)
# Dictionary get method with default: 'goss' entries carry no nested subsample
subsample = example['boosting_type'].get('subsample', 1.0)
# Assign top-level keys so the dictionary is flat for the model constructor
example['boosting_type'] = example['boosting_type']['boosting_type']
example['subsample'] = subsample

from hyperopt import hp

# boosting type domain: subsample is conditional on the boosting type because
# 'goss' does not support bagging (hyperopt labels must be unique, so each
# conditional subsample gets its own label)
boosting_type = {'boosting_type': hp.choice('boosting_type',
                                            [{'boosting_type': 'gbdt',
                                              'subsample': hp.uniform('gbdt_subsample', 0.5, 1)},
                                             {'boosting_type': 'dart',
                                              'subsample': hp.uniform('dart_subsample', 0.5, 1)},
                                             {'boosting_type': 'goss',
                                              'subsample': 1.0}])}
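
from hyperopt.pyll.stochastic import sample

# Sketch of embedding the conditional sub-domain in a larger space and
# sampling from it (the second entry is illustrative; the original space
# contained many more hyperparameters)
space = {'boosting_type': boosting_type['boosting_type'],
         'num_leaves': hp.quniform('num_leaves', 20, 150, 1)}
print(sample(space))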