import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from collections import defaultdict
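# Experiment: track how the gap between validation and test R^2 changes as the
# randomized hyperparameter search budget grows, i.e. how strongly the selected
# model overfits the validation set it was chosen on. Gradient boosting on the
# Boston housing data, averaged over several independent trials.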
# Get Data
(x, y), (x_test, y_test) = tf.keras.datasets.boston_housing.load_data()
# Process Data
np.random.seed(41)
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.15)
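# Fit the scaler on the training split only, then apply the same transform to val and test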
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)
x_test = scaler.transform(x_test)
# Run Experiment
def sample_params(param_grid):
    """Draw one random value for each hyperparameter in the grid."""
    return {key: np.random.choice(val) for key, val in param_grid.items()}
def optimize_val_performance(x_train, y_train, x_val, y_val, param_grid, iterations=1000):
    """Randomized search: fit `iterations` models and return the one with the best validation R^2."""
    best_score = -np.inf
    best_model = None
    models = []
    for ix in range(iterations):
        model = GradientBoostingRegressor(**sample_params(param_grid))
        models.append(model)
        model.fit(x_train, y_train)
        score = model.score(x_val, y_val)
        if score > best_score:
            best_score = score
            best_model = ix
    return models[best_model]
param_grid = {
    "learning_rate": np.logspace(-3, -1, 10),
    "n_estimators": np.linspace(1, 10, 10).astype(np.int64),
    "subsample": np.linspace(0.5, 1, 10),
    "max_depth": np.linspace(1, 10, 10).astype(np.int64),
    "max_features": np.linspace(0.5, 1, 10),
}
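# 5 independent trials; search budgets are 50 values log-spaced from 10 to ~3162 iterations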
n_trials = 5
randomized_search_iterations = np.logspace(1, 3.5, 50).astype(np.int64)
val_scores = defaultdict(list)
test_scores = defaultdict(list)
for trial in range(1, n_trials + 1):
    print("trial", trial)
    for n_iter in randomized_search_iterations:
        print("Sampling {} hyperparameter configurations...".format(n_iter))
        model = optimize_val_performance(x_train, y_train, x_val, y_val, param_grid, n_iter)
        val_scores[n_iter].append(model.score(x_val, y_val))
        test_scores[n_iter].append(model.score(x_test, y_test))
# Gather Results
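# Average scores over trials for each search budget, and compute the per-trial
# validation-test gap (mean and standard deviation across trials).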
mean_val_scores_dict = {key: np.mean(scores) for key, scores in val_scores.items()}
mean_val_scores = np.array([mean_val_scores_dict.get(i) for i in randomized_search_iterations])
mean_test_scores_dict = {key: np.mean(scores) for key, scores in test_scores.items()}
mean_test_scores = np.array([mean_test_scores_dict.get(i) for i in randomized_search_iterations])
diffs = {key: np.array(val_scores[key]) - np.array(test_scores[key]) for key in val_scores.keys()}
mean_diffs_dict = {key: np.mean(diff) for key, diff in diffs.items()}
sd_diffs_dict = {key: np.std(diff) for key, diff in diffs.items()}
mean_diffs = np.array([mean_diffs_dict.get(i) for i in randomized_search_iterations])
sd_diffs = np.array([sd_diffs_dict.get(i) for i in randomized_search_iterations])
# Visualize Results
fig = plt.figure(figsize=(20, 10))
fig.text(0.5, 0.04, "Randomized Search Iterations", ha="center", fontsize=14)

# Left panel: mean validation and test R2 vs. search budget
plt.subplot(121)
plt.plot(randomized_search_iterations, mean_val_scores, label="Validation")
plt.plot(randomized_search_iterations, mean_test_scores, label="Test")
plt.ylabel("Mean Coefficient of Determination (R2)", fontsize=14)
plt.legend()

# Right panel: mean validation-test gap with a +/- 1 standard deviation band
plt.subplot(122)
plt.plot(randomized_search_iterations, mean_diffs)
plt.fill_between(randomized_search_iterations, mean_diffs + sd_diffs, mean_diffs - sd_diffs,
                 color="gray", alpha=0.2)
plt.ylabel("Validation-Test R2 Performance Gap", fontsize=14)
plt.show()