# chunk 0: imports, the Boston housing data, and a small MLP regressor
import numpy as np
import pandas as pd
import tensorflow as tf
from pprint import pprint
print(tf.__version__)
tf.random.set_seed(42)

# load the data, give the targets a trailing axis, and cast everything to float32 tensors
(x_tr, y_tr), (x_te, y_te) = tf.keras.datasets.boston_housing.load_data()
y_tr, y_te = map(lambda x: np.expand_dims(x, -1), (y_tr, y_te))
x_tr, y_tr, x_te, y_te = map(lambda x: tf.cast(x, tf.float32), (x_tr, y_tr, x_te, y_te))


class MLP(tf.keras.Model):
    """A multi-layer perceptron: a stack of Dense hidden layers plus a linear output layer."""
    def __init__(self, num_hidden_units, num_targets, hidden_activation='relu', **kwargs):
        super().__init__(**kwargs)
        if isinstance(num_hidden_units, int):
            num_hidden_units = [num_hidden_units]
        self.feature_extractor = tf.keras.Sequential([
            tf.keras.layers.Dense(unit, activation=hidden_activation)
            for unit in num_hidden_units])
        self.last_linear = tf.keras.layers.Dense(num_targets, activation='linear')

    @tf.function
    def call(self, x):
        features = self.feature_extractor(x)
        outputs = self.last_linear(features)
        return outputs
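# Illustrative check (not in the original gist): the model builds its weights on the
# first call, and a forward pass on a small slice should yield one output per row.
_demo_mlp = MLP(4, 1)
print(_demo_mlp(x_tr[:2]).shape)  # expected: (2, 1)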
# chunk 1: manual parameter update with a hard-coded step size
# gradients = tape.gradient(loss, model.variables)
# for g, v in zip(gradients, model.variables):
#     v.assign_add(tf.constant([-0.05], dtype=tf.float32) * g)

# chunk 2: the same update with the learning rate pulled out into a constant
# eta = tf.constant(.05, dtype=tf.float32)
# gradients = tape.gradient(loss, model.variables)
# for grad, var in zip(gradients, model.variables):
#     update = - eta * grad
#     var.assign_add(update)
# chunk 3: a minimal gradient descent optimizer with an apply_gradients interface
class GradientDescent(object):
    def __init__(self, lr=.01):
        self._lr = tf.Variable(lr, dtype=tf.float32)

    def apply_gradients(self, grads_and_vars):
        # vanilla update: var <- var - lr * grad
        for grad, var in grads_and_vars:
            update = - self._lr * grad
            var.assign_add(update)
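# Illustrative check (not part of the original gist): one update on a toy variable
# with loss w^2, so the gradient at w=3 is 6 and the step moves w by -lr * 6.
_w = tf.Variable(3.)
with tf.GradientTape() as _tape:
    _toy_loss = tf.square(_w)
GradientDescent(lr=.1).apply_gradients(zip(_tape.gradient(_toy_loss, [_w]), [_w]))
print(_w.numpy())  # expected: 3.0 - 0.1 * 6.0 = 2.4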
# chunk 4: full-batch training and evaluation steps, plus a training loop
@tf.function
def train_step(model, optimizer, x, y):
    # compute the MSE, then let the optimizer apply one gradient update
    with tf.GradientTape() as tape:
        loss = tf.reduce_mean(tf.square(y - model(x)))
    gradients = tape.gradient(loss, model.variables)
    optimizer.apply_gradients(zip(gradients, model.variables))
    return loss


@tf.function
def test_step(model, x, y):
    return tf.reduce_mean(tf.square(y - model(x)))


def train(model, optimizer, n_epochs=10000, his_freq=100):
    history = []
    for epoch in range(1, n_epochs + 1):
        tr_loss = train_step(model, optimizer, x_tr, y_tr)
        te_loss = test_step(model, x_te, y_te)
        if not epoch % his_freq:
            history.append({'epoch': epoch,
                            'training_loss': tr_loss.numpy(),
                            'testing_loss': te_loss.numpy()})
    return model, pd.DataFrame(history)
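# Illustrative check (not in the original gist): a single full-batch step returns
# the MSE at the pre-update parameters as a scalar tensor.
print(train_step(MLP(4, 1), GradientDescent(1e-3), x_tr, y_tr).numpy())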
# chunk 5: sweep the learning rate over several orders of magnitude
def lr_experiments(learning_rates):
    experiments = []
    for lr in learning_rates:
        model, history = train(MLP(4, 1), GradientDescent(lr))
        history['lr'] = lr
        experiments.append(history)
    experiments = pd.concat(experiments, axis=0)
    return experiments


experiments = lr_experiments(learning_rates=[10 ** -i for i in range(3, 10)])
ax = (experiments
      .pivot(index='epoch', columns='lr', values='testing_loss')
      .plot(kind='line', logy=True, figsize=(12, 6)))
ax.get_figure().savefig('ch4_plot_1.png')
print(experiments.groupby('lr')['testing_loss'].min())
# chunk 6: inspect the trained model -- the hidden features, and how the output
# decomposes into features @ last-layer weights plus the last-layer bias
model, history = train(MLP(4, 1), GradientDescent(1e-3))
print(model.layers[0](x_te))
print(model.layers[0](x_te) @ model.layers[1].variables[0])
print(model.layers[1].variables[1])
# chunk 7: gradient descent with gradient norm clipping
class GradientDescent(object):
    def __init__(self, lr=.01, clipnorm=None):
        self._lr = tf.Variable(lr, dtype=tf.float32)
        self.clipnorm = clipnorm

    def apply_gradients(self, grads_and_vars):
        for grad, var in grads_and_vars:
            # rescale any gradient whose L2 norm exceeds clipnorm before stepping
            if self.clipnorm:
                grad = tf.clip_by_norm(grad, self.clipnorm)
            update = - self._lr * grad
            var.assign_add(update)


model, history = train(MLP(4, 1), GradientDescent(1e-3, clipnorm=2))
ax = history.plot(x='epoch', logy=True, figsize=(12, 6))
ax.get_figure().savefig('ch4_plot_2.png')
print(history['testing_loss'].min())
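# Illustrative check (not in the original gist): tf.clip_by_norm rescales a tensor
# whose L2 norm exceeds the threshold, which is what keeps a single exploding
# gradient from throwing the parameters far off course.
print(tf.clip_by_norm(tf.constant([3., 4.]), 2.).numpy())  # norm 5 -> rescaled to norm 2: [1.2 1.6]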
# chunk 8: a minimal mini-batch Dataset iterator over in-memory tensors
class Dataset(object):
    def __init__(self, tensors, batch_size=32, shuffle=True):
        self.tensors = tensors if isinstance(tensors, (list, tuple)) else (tensors, )
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.total = self.tensors[0].shape[0]
        assert all(self.total == tensor.shape[0] for tensor in self.tensors), \
            'Tensors should have matching length'
        self.n_steps = self.total // self.batch_size
        self._indices = tf.range(self.total)

    def __iter__(self):
        self._i = 0
        if self.shuffle:
            self._indices = tf.random.shuffle(self._indices)
        return self

    def __next__(self):
        # note: the trailing partial batch is dropped
        if self._i >= self.n_steps:
            raise StopIteration
        start = self._i * self.batch_size
        end = start + self.batch_size
        indices = self._indices[start: end]
        samples = tuple(tf.gather(tensor, indices) for tensor in self.tensors)
        self._i += 1
        return samples
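# Illustrative check (not part of the original gist): a toy Dataset yields two
# batches of four aligned rows and drops the final partial batch of two.
for _xb, _yb in Dataset((tf.range(10), tf.range(10)), batch_size=4, shuffle=False):
    print(_xb.numpy(), _yb.numpy())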
# chunk 9: mini-batch training loop driven by the Dataset iterator
train_dataset = Dataset((x_tr, y_tr))
test_dataset = Dataset((x_te, y_te))


def train(model, optimizer, n_epochs, his_freq=10):
    # batches come from the module-level Dataset objects defined above
    history = []
    for epoch in range(1, n_epochs + 1):
        tr_loss = []
        for x, y in train_dataset:
            tr_loss.append(train_step(model, optimizer, x, y).numpy())
        te_loss = []
        for x, y in test_dataset:
            te_loss.append(test_step(model, x, y).numpy())
        te_loss_full = test_step(model, x_te, y_te)
        if not epoch % his_freq:
            history.append({'epoch': epoch,
                            'training_loss': np.mean(tr_loss),
                            'testing_loss': np.mean(te_loss),
                            'testing_loss_full': te_loss_full.numpy()})
    return model, pd.DataFrame(history)


model, history = train(MLP(4, 1), GradientDescent(1e-3, 2), n_epochs=2000)
ax = history.plot(x='epoch', kind='line', figsize=(12, 6))
ax.get_figure().savefig('ch4_plot_3.png')
print(history.testing_loss_full.min())
# chunk 10: gradient descent with momentum
class Momentum(object):
    def __init__(self, lr=.01, beta=.9, clipnorm=None):
        self._lr = tf.Variable(lr, dtype=tf.float32)
        self._beta = tf.Variable(beta, dtype=tf.float32)
        self.clipnorm = clipnorm

    def init_moments(self, var_list):
        # one moment (velocity) slot per variable, keyed by the variable's unique id
        self._moments = {var._unique_id: tf.Variable(tf.zeros_like(var))
                         for var in var_list}

    def apply_gradients(self, grads_and_vars):
        for grad, var in grads_and_vars:
            if self.clipnorm:
                grad = tf.clip_by_norm(grad, self.clipnorm)
            # m <- beta * m - lr * grad, then var <- var + m
            m = self._moments[var._unique_id]
            m.assign(self._beta * m - self._lr * grad)
            var.assign_add(m)
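# Illustrative check (not in the original gist): with a constant gradient of 1 the
# velocity accumulates across steps, so the second step is larger than the first.
_w = tf.Variable(1.)
_mom = Momentum(lr=.1, beta=.9)
_mom.init_moments([_w])
for _ in range(2):
    _mom.apply_gradients([(tf.constant(1.), _w)])
print(_w.numpy())  # step 1 moves w by -0.1, step 2 by -0.19, so roughly 0.71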
# chunk 11
model = MLP(4, 1)
model.build(input_shape=(32, 13))
optimizer = Momentum(lr=1e-3, beta=.98, clipnorm=2)
optimizer.init_moments(model.variables)
model, history = train(model, optimizer, n_epochs=1000)
ax = history.plot(x='epoch', kind='line', figsize=(12, 6))
ax.get_figure().savefig('ch4_plot_4.png')
print(history.testing_loss_full.min())
# chunk 12: the Adam optimizer with a bias-corrected step size
class Adam(object):
    def __init__(self, lr=.01, beta_1=.9, beta_2=.999, epsilon=1e-8, clipnorm=None):
        self._lr = tf.Variable(lr, dtype=tf.float32)
        self._beta_1 = tf.Variable(beta_1, dtype=tf.float32)
        self._beta_2 = tf.Variable(beta_2, dtype=tf.float32)
        self._epsilon = tf.constant(epsilon, dtype=tf.float32)
        self.clipnorm = clipnorm
        self._t = tf.Variable(0, dtype=tf.float32)

    def init_moments(self, var_list):
        # first (m) and second (v) moment estimates, one slot per variable
        self._m = {var._unique_id: tf.Variable(tf.zeros_like(var))
                   for var in var_list}
        self._v = {var._unique_id: tf.Variable(tf.zeros_like(var))
                   for var in var_list}

    def apply_gradients(self, grads_and_vars):
        self._t.assign_add(tf.constant(1., self._t.dtype))
        for grad, var in grads_and_vars:
            if self.clipnorm:
                grad = tf.clip_by_norm(grad, self.clipnorm)
            m = self._m[var._unique_id]
            v = self._v[var._unique_id]
            m.assign(self._beta_1 * m + (1. - self._beta_1) * grad)
            v.assign(self._beta_2 * v + (1. - self._beta_2) * tf.square(grad))
            # fold the bias correction for m and v into the step size
            lr = self._lr * tf.sqrt(1 - tf.pow(self._beta_2, self._t)) / (1 - tf.pow(self._beta_1, self._t))
            update = -lr * m / (tf.sqrt(v) + self._epsilon)
            var.assign_add(update)
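# Illustrative check (not part of the original gist): thanks to the bias correction,
# the very first Adam step moves the variable by roughly -lr in the direction of the
# gradient, regardless of the gradient's magnitude.
_w = tf.Variable(0.)
_adam = Adam(lr=.1)
_adam.init_moments([_w])
_adam.apply_gradients([(tf.constant(100.), _w)])
print(_w.numpy())  # approximately -0.1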
# chunk 13
model = MLP(4, 1)
model.build(input_shape=(32, 13))
optimizer = Adam(lr=1e-3, beta_1=.9, beta_2=.999, epsilon=1e-8, clipnorm=2)
optimizer.init_moments(model.variables)
model, history = train(model, optimizer, n_epochs=1000)
ax = history.plot(x='epoch', kind='line', figsize=(12, 6))
ax.get_figure().savefig('ch4_plot_5.png')
print(history.testing_loss_full.min())
# chunk 14: Adam implemented by subclassing tf.keras.optimizers.Optimizer
class Adam(tf.keras.optimizers.Optimizer):
    def __init__(self, learning_rate=.001, beta_1=.9, beta_2=.999, epsilon=1e-8, name='Adam', **kwargs):
        super().__init__(name, **kwargs)
        self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
        self._set_hyper('beta_1', beta_1)
        self._set_hyper('beta_2', beta_2)
        self.epsilon = epsilon or tf.keras.backend.epsilon()

    def _create_slots(self, var_list):
        # per-variable slots for the first (m) and second (v) moment estimates
        for var in var_list:
            self.add_slot(var, 'm')
        for var in var_list:
            self.add_slot(var, 'v')

    def _resource_apply_dense(self, grad, var):
        dtype = var.dtype.base_dtype
        t = tf.cast(self.iterations + 1, dtype)
        lr = self._decayed_lr(dtype)
        beta_1 = self._get_hyper('beta_1', dtype)
        beta_2 = self._get_hyper('beta_2', dtype)
        epsilon = tf.convert_to_tensor(self.epsilon, dtype)
        m = self.get_slot(var, 'm')
        v = self.get_slot(var, 'v')
        m = m.assign(beta_1 * m + (1. - beta_1) * grad)
        v = v.assign(beta_2 * v + (1. - beta_2) * tf.square(grad))
        # bias-corrected step size, the same update as the stand-alone Adam above
        lr = lr * tf.sqrt(1 - tf.pow(beta_2, t)) / (1 - tf.pow(beta_1, t))
        update = -lr * m / (tf.sqrt(v) + epsilon)
        var_update = var.assign_add(update)
        updates = [var_update, m, v]
        return tf.group(*updates)

    def get_config(self):
        config = super().get_config()
        config.update({
            'learning_rate': self._serialize_hyperparameter('learning_rate'),
            'beta_1': self._serialize_hyperparameter('beta_1'),
            'beta_2': self._serialize_hyperparameter('beta_2'),
            'epsilon': self.epsilon,
        })
        return config
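# Illustrative sketch (not in the original gist): since this Adam subclasses
# tf.keras.optimizers.Optimizer and implements the dense update, it also plugs
# into the standard compile/fit API, not just the manual training loop below.
_keras_model = tf.keras.Sequential([tf.keras.layers.Dense(4, activation='relu'),
                                    tf.keras.layers.Dense(1)])
_keras_model.compile(optimizer=Adam(learning_rate=1e-3), loss='mse')
_keras_model.fit(x_tr, y_tr, epochs=5, verbose=0)
print(_keras_model.evaluate(x_te, y_te, verbose=0))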
# chunk 15
model = MLP(4, 1)
optimizer = Adam(lr=1e-3, beta_1=.9, beta_2=.999, epsilon=1e-8)
model, history = train(model, optimizer, n_epochs=1000)
ax = history.plot(x='epoch', kind='line', figsize=(12, 6))
ax.get_figure().savefig('ch4_plot_5.png')
print(history.testing_loss_full.min())