import featuretools as ft

def feature_matrix_from_entityset(es_dict, feature_defs):
    """Run deep feature synthesis from an entityset and feature definitions.
    Saves the feature matrix based on the partition number."""
    # Extract the entityset
    es = es_dict['es']
    # Calculate the feature matrix from the pre-defined features
    feature_matrix = ft.calculate_feature_matrix(feature_defs,
                                                 entityset=es)
    # Save keyed by partition number so partitions can be reassembled later
    # (assumes `es_dict` carries its partition number under the 'num' key)
    feature_matrix.to_csv('fm_p%d.csv' % es_dict['num'])
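
# Usage sketch (hypothetical, not from the original gist): assumes a list
# `partitions` of entityset dictionaries and `feature_defs` produced by an
# earlier ft.dfs(..., features_only=True) call.
for es_dict in partitions:
    feature_matrix_from_entityset(es_dict, feature_defs)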

import pandas as pd

def total_previous_month(numeric, datetime, time):
    """Return total of `numeric` column in the month prior to `time`."""
    df = pd.DataFrame({'value': numeric, 'time': datetime})
    previous_month = time.month - 1
    previous_year = time.year
    # Handle January by wrapping to December of the previous year
    if previous_month == 0:
        previous_month = 12
        previous_year = time.year - 1
    # Sum the values that fall in the previous month
    df = df[(df['time'].dt.month == previous_month) &
            (df['time'].dt.year == previous_year)]
    return df['value'].sum()
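
# Quick check of the helper (illustrative values, not from the original gist)
values = pd.Series([1.0, 2.0, 3.0])
times = pd.Series(pd.to_datetime(['2018-01-15', '2018-01-20', '2018-02-01']))
# The month prior to 2018-02-10 is January 2018, so this should print 3.0
print(total_previous_month(values, times, pd.Timestamp('2018-02-10')))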

import numpy as np

# Hyperparameter grid
param_grid = {
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'num_leaves': list(range(20, 150)),
    'learning_rate': list(np.logspace(np.log10(0.005), np.log10(0.5),
                                      base=10, num=1000)),
    'subsample_for_bin': list(range(20000, 300000, 20000)),
    'min_child_samples': list(range(20, 500, 5)),
    'reg_alpha': list(np.linspace(0, 1)),
}
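
import random

# One random-search draw from the grid above (a sketch of how the grid is
# consumed; the seed is an assumption, not from the original gist)
random.seed(50)
params = {key: random.sample(values, 1)[0]
          for key, values in param_grid.items()}
print(params)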

import lightgbm as lgb

def objective(hyperparameters, iteration):
    """Objective function for grid and random search. Returns
    the cross validation score from a set of hyperparameters."""
    # Number of estimators will be found using early stopping
    if 'n_estimators' in hyperparameters.keys():
        del hyperparameters['n_estimators']
    # Cross validation with early stopping (`train_set` is an lgb.Dataset
    # assumed to be built earlier); score is the best mean validation AUC
    cv_results = lgb.cv(hyperparameters, train_set, num_boost_round=10000,
                        nfold=10, early_stopping_rounds=100, metrics='auc', seed=42)
    score = cv_results['auc-mean'][-1]
    return [score, hyperparameters, iteration]
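
import random

# Hypothetical random search driver: evaluate random draws from `param_grid`
# with the objective above (the loop bookkeeping is assumed, not original code)
random.seed(50)
random_results = []
for i in range(100):
    hyperparameters = {key: random.sample(values, 1)[0]
                       for key, values in param_grid.items()}
    random_results.append(objective(hyperparameters, i))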

from hyperopt import fmin, tpe, Trials

MAX_EVALS = 500
# Trials object to record every evaluation of the search
bayes_trials = Trials()
# Optimize: minimize the objective over the space with Tree Parzen Estimation
best = fmin(fn=objective, space=space, algo=tpe.suggest,
            max_evals=MAX_EVALS, trials=bayes_trials)
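
# The trials object records every evaluation; sorting by loss puts the best
# first (assumes the Bayesian objective returns a dict with a 'loss' key,
# as hyperopt requires)
bayes_trials_results = sorted(bayes_trials.results, key=lambda x: x['loss'])
print(bayes_trials_results[:2])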

import ast
import pandas as pd
import lightgbm as lgb

# Read in results and sort with best on top
results = pd.read_csv('gbm_trials.csv')
results.sort_values('loss', ascending=True, inplace=True)
results.reset_index(inplace=True, drop=True)
# Extract the ideal number of estimators and hyperparameters by column name
# (the params column is read back as a string, so evaluate it into a dict)
best_bayes_estimators = int(results.loc[0, 'estimators'])
best_bayes_params = ast.literal_eval(results.loc[0, 'params'])
# Re-create the best model and train on the training data
best_bayes_model = lgb.LGBMClassifier(n_estimators=best_bayes_estimators,
                                      **best_bayes_params)
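
from sklearn.metrics import roc_auc_score

# Hypothetical final fit and evaluation; the train/test arrays are assumed
# from an earlier split, not defined in the original gist
best_bayes_model.fit(train_features, train_labels)
preds = best_bayes_model.predict_proba(test_features)[:, 1]
print('Test ROC AUC: {:.4f}'.format(roc_auc_score(test_labels, preds)))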

# Write to the csv file ('a' means append); this block is meant to run
# inside the objective function after each evaluation
of_connection = open(out_file, 'a')
writer = csv.writer(of_connection)
writer.writerow([loss, params, iteration, n_estimators, run_time])
of_connection.close()

import csv
# File to save first results
out_file = 'gbm_trials.csv'
of_connection = open(out_file, 'w')
writer = csv.writer(of_connection)
# Write the headers to the file
writer.writerow(['loss', 'params', 'iteration', 'estimators', 'train_time'])
of_connection.close()

from hyperopt.pyll.stochastic import sample

# Sample from the full space
example = sample(space)
# Dictionary get method with default: 'goss' entries carry no nested subsample
subsample = example['boosting_type'].get('subsample', 1.0)
# Assign top-level keys so the dictionary is flat for the model constructor
example['boosting_type'] = example['boosting_type']['boosting_type']
example['subsample'] = subsample

from hyperopt import hp

# boosting type domain: subsample is conditional on the boosting type because
# 'goss' does not support bagging (hyperopt labels must be unique, so each
# conditional subsample gets its own label)
boosting_type = {'boosting_type': hp.choice('boosting_type',
                                            [{'boosting_type': 'gbdt',
                                              'subsample': hp.uniform('gbdt_subsample', 0.5, 1)},
                                             {'boosting_type': 'dart',
                                              'subsample': hp.uniform('dart_subsample', 0.5, 1)},
                                             {'boosting_type': 'goss',
                                              'subsample': 1.0}])}
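
from hyperopt.pyll.stochastic import sample

# Sketch of embedding the conditional sub-domain in a larger space and
# sampling from it (the second entry is illustrative; the original space
# contained many more hyperparameters)
space = {'boosting_type': boosting_type['boosting_type'],
         'num_leaves': hp.quniform('num_leaves', 20, 150, 1)}
print(sample(space))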