@ryancheunggit
Created January 30, 2020 00:06
# chunk 0
import numpy as np
import pandas as pd
import tensorflow as tf
from pprint import pprint
print(tf.__version__)
tf.random.set_seed(42)
(x_tr, y_tr), (x_te, y_te) = tf.keras.datasets.boston_housing.load_data()
y_tr, y_te = map(lambda x: np.expand_dims(x, -1), (y_tr, y_te))
x_tr, y_tr, x_te, y_te = map(lambda x: tf.cast(x, tf.float32), (x_tr, y_tr, x_te, y_te))
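# Note (added for context): with the default 0.2 test split, Boston housing gives
# 404 training and 102 test examples, each with 13 numeric features; targets are
# expanded to shape (n, 1) above so they broadcast cleanly against model outputs.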
class MLP(tf.keras.Model):
    def __init__(self, num_hidden_units, num_targets, hidden_activation='relu', **kwargs):
        super().__init__(**kwargs)
        if isinstance(num_hidden_units, int): num_hidden_units = [num_hidden_units]
        self.feature_extractor = tf.keras.Sequential([tf.keras.layers.Dense(unit, activation=hidden_activation)
                                                      for unit in num_hidden_units])
        self.last_linear = tf.keras.layers.Dense(num_targets, activation='linear')

    @tf.function
    def call(self, x):
        features = self.feature_extractor(x)
        outputs = self.last_linear(features)
        return outputs
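# Quick shape check (illustrative, not part of the original gist): the model maps a
# (batch, 13) feature matrix to a (batch, 1) prediction, e.g.
#     MLP(4, 1)(x_tr[:2]).shape  # -> (2, 1)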
# chunk 1
# gradients = tape.gradient(loss, model.variables)
# for g, v in zip(gradients, model.variables):
#     v.assign_add(tf.constant([-0.05], dtype=tf.float32) * g)
# chunk 2
# eta = tf.constant(.05, dtype=tf.float32)
# gradients = tape.gradient(loss, model.variables)
# for grad, var in zip(gradients, model.variables):
#     update = - eta * grad
#     var.assign_add(update)
# chunk 3
class GradientDescent(object):
    def __init__(self, lr=.01):
        self._lr = tf.Variable(lr, dtype=tf.float32)

    def apply_gradients(self, grads_and_vars):
        for grad, var in grads_and_vars:
            update = - self._lr * grad
            var.assign_add(update)
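# For reference, the update rule implemented above is plain gradient descent:
#     theta <- theta - lr * dL/dtheta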
# chunk 4
@tf.function
def train_step(model, optimizer, x, y):
    with tf.GradientTape() as tape:
        loss = tf.reduce_mean(tf.square(y - model(x)))
    gradients = tape.gradient(loss, model.variables)
    optimizer.apply_gradients(zip(gradients, model.variables))
    return loss


@tf.function
def test_step(model, x, y):
    return tf.reduce_mean(tf.square(y - model(x)))


def train(model, optimizer, n_epochs=10000, his_freq=100):
    history = []
    for epoch in range(1, n_epochs + 1):
        tr_loss = train_step(model, optimizer, x_tr, y_tr)
        te_loss = test_step(model, x_te, y_te)
        if not epoch % his_freq:
            history.append({'epoch': epoch,
                            'training_loss': tr_loss.numpy(),
                            'testing_loss': te_loss.numpy()})
    return model, pd.DataFrame(history)
# chunk 5
def lr_experiments(learning_rates):
    experiments = []
    for lr in learning_rates:
        model, history = train(MLP(4, 1), GradientDescent(lr))
        history['lr'] = lr
        experiments.append(history)
    experiments = pd.concat(experiments, axis=0)
    return experiments
experiments = lr_experiments(learning_rates=[10 ** -i for i in range(3, 10)])
ax = experiments.\
    pivot(index='epoch', columns='lr', values='testing_loss').\
    plot(kind='line', logy=True, figsize=(12, 6))
ax.get_figure().savefig('ch4_plot_1.png')
print(experiments.groupby('lr')['testing_loss'].min())
# chunk 6
model, history = train(MLP(4, 1), GradientDescent(1e-3))
print(model.layers[0](x_te))
print(model.layers[0](x_te) @ model.layers[1].variables[0])
print(model.layers[1].variables[1])
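# The three prints above inspect the fitted model: the hidden-layer activations on the
# test set, their projection through the output kernel, and the output bias. Presumably
# the point is that at this learning rate the ReLU units have died (all-zero activations),
# so the predictions collapse to the output bias alone, motivating gradient clipping next.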
# chunk 7
class GradientDescent(object):
    def __init__(self, lr=.01, clipnorm=None):
        self._lr = tf.Variable(lr, dtype=tf.float32)
        self.clipnorm = clipnorm

    def apply_gradients(self, grads_and_vars):
        for grad, var in grads_and_vars:
            if self.clipnorm: grad = tf.clip_by_norm(grad, self.clipnorm)
            update = - self._lr * grad
            var.assign_add(update)
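# Note on clipping (for reference): tf.clip_by_norm rescales a gradient only when its
# L2 norm exceeds `clipnorm`, i.e. grad <- grad * clipnorm / ||grad|| in that case, so
# the update direction is preserved while its magnitude is capped.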
model, history = train(MLP(4, 1), GradientDescent(1e-3, clipnorm=2))
ax = history.plot(x='epoch', logy=True, figsize=(12, 6))
ax.get_figure().savefig('ch4_plot_2.png')
print(history['testing_loss'].min())
# chunk 8
class Dataset(object):
    def __init__(self, tensors, batch_size=32, shuffle=True):
        self.tensors = tensors if isinstance(tensors, (list, tuple)) else (tensors, )
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.total = self.tensors[0].shape[0]
        assert all(self.total == tensor.shape[0] for tensor in self.tensors), 'Tensors should have matching length'
        self.n_steps = self.total // self.batch_size
        self._indices = tf.range(self.total)

    def __iter__(self):
        self._i = 0
        if self.shuffle:
            self._indices = tf.random.shuffle(self._indices)
        return self

    def __next__(self):
        if self._i >= self.n_steps:
            raise StopIteration
        else:
            start = self._i * self.batch_size
            end = start + self.batch_size
            indices = self._indices[start: end]
            samples = tuple(tf.gather(tensor, indices) for tensor in self.tensors)
            self._i += 1
            return samples
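# Quick check of the batching logic (illustrative, not part of the original gist): 404
# training rows with batch_size=32 yield 12 mini-batches per epoch, each of shape
# (32, 13) for x and (32, 1) for y; the remaining 404 % 32 = 20 rows of a shuffled
# epoch are dropped, e.g.
#     shapes = [(xb.shape, yb.shape) for xb, yb in Dataset((x_tr, y_tr))]
#     len(shapes)  # -> 12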
# chunk 9
train_dataset = Dataset((x_tr, y_tr))
test_dataset = Dataset((x_te, y_te))
def train(model, optimizer, n_epochs, batch_size=32, his_freq=10):
    history = []
    for epoch in range(1, n_epochs + 1):
        tr_loss = []
        for x, y in train_dataset:
            tr_loss.append(train_step(model, optimizer, x, y).numpy())
        te_loss = []
        for x, y in test_dataset:
            te_loss.append(test_step(model, x, y).numpy())
        te_loss_full = test_step(model, x_te, y_te)
        if not epoch % his_freq:
            history.append({'epoch': epoch,
                            'training_loss': np.mean(tr_loss),
                            'testing_loss': np.mean(te_loss),
                            'testing_loss_full': te_loss_full.numpy()})
    return model, pd.DataFrame(history)
model, history = train(MLP(4, 1), GradientDescent(1e-3, 2), n_epochs=2000)
ax = history.plot(x='epoch', kind='line', figsize=(12, 6))
ax.get_figure().savefig('ch4_plot_3.png')
print(history.testing_loss_full.min())
# chunk 10
class Momentum(object):
    def __init__(self, lr=.01, beta=.9, clipnorm=None):
        self._lr = tf.Variable(lr, dtype=tf.float32)
        self._beta = tf.Variable(beta, dtype=tf.float32)
        self.clipnorm = clipnorm

    def init_moments(self, var_list):
        self._moments = {var._unique_id: tf.Variable(tf.zeros_like(var))
                         for var in var_list}

    def apply_gradients(self, grads_and_vars):
        for grad, var in grads_and_vars:
            if self.clipnorm: grad = tf.clip_by_norm(grad, self.clipnorm)
            m = self._moments[var._unique_id]
            m.assign(self._beta * m - self._lr * grad)
            var.assign_add(m)
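# For reference, the update implemented above is classical momentum:
#     m <- beta * m - lr * grad
#     theta <- theta + m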
# chunk 11
model = MLP(4, 1)
model.build(input_shape=(32,13))
optimizer = Momentum(lr=1e-3, beta=.98, clipnorm=2)
optimizer.init_moments(model.variables)
model, history = train(model, optimizer, n_epochs=1000)
ax = history.plot(x='epoch', kind='line', figsize=(12, 6))
ax.get_figure().savefig('ch4_plot_4.png')
print(history.testing_loss_full.min())
# chunk 12
class Adam(object):
    def __init__(self, lr=.01, beta_1=.9, beta_2=.999, epsilon=1e-8, clipnorm=None):
        self._lr = tf.Variable(lr, dtype=tf.float32)
        self._beta_1 = tf.Variable(beta_1, dtype=tf.float32)
        self._beta_2 = tf.Variable(beta_2, dtype=tf.float32)
        self._epsilon = tf.constant(epsilon, dtype=tf.float32)
        self.clipnorm = clipnorm
        self._t = tf.Variable(0, dtype=tf.float32)

    def init_moments(self, var_list):
        self._m = {var._unique_id: tf.Variable(tf.zeros_like(var))
                   for var in var_list}
        self._v = {var._unique_id: tf.Variable(tf.zeros_like(var))
                   for var in var_list}

    def apply_gradients(self, grads_and_vars):
        self._t.assign_add(tf.constant(1., self._t.dtype))
        for grad, var in grads_and_vars:
            if self.clipnorm: grad = tf.clip_by_norm(grad, self.clipnorm)
            m = self._m[var._unique_id]
            v = self._v[var._unique_id]
            m.assign(self._beta_1 * m + (1. - self._beta_1) * grad)
            v.assign(self._beta_2 * v + (1. - self._beta_2) * tf.square(grad))
            lr = self._lr * tf.sqrt(1 - tf.pow(self._beta_2, self._t)) / (1 - tf.pow(self._beta_1, self._t))
            update = -lr * m / (tf.sqrt(v) + self._epsilon)
            var.assign_add(update)
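# For reference, the update implemented above is Adam (Kingma & Ba, 2014):
#     m <- beta_1 * m + (1 - beta_1) * grad
#     v <- beta_2 * v + (1 - beta_2) * grad^2
#     lr_t = lr * sqrt(1 - beta_2^t) / (1 - beta_1^t)    # bias-corrected step size
#     theta <- theta - lr_t * m / (sqrt(v) + epsilon)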
# chunk 13
model = MLP(4, 1)
model.build(input_shape=(32,13))
optimizer = Adam(lr=1e-3, beta_1=.9, beta_2=.999, epsilon=1e-8, clipnorm=2)
optimizer.init_moments(model.variables)
model, history = train(model, optimizer, n_epochs=1000)
ax = history.plot(x='epoch', kind='line', figsize=(12, 6))
ax.get_figure().savefig('ch4_plot_5.png')
print(history.testing_loss_full.min())
# chunk 14
class Adam(tf.keras.optimizers.Optimizer):
    def __init__(self, learning_rate=.001, beta_1=.9, beta_2=.999, epsilon=1e-8, name='Adam', **kwargs):
        super().__init__(name, **kwargs)
        self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
        self._set_hyper('beta_1', beta_1)
        self._set_hyper('beta_2', beta_2)
        self.epsilon = epsilon or tf.keras.backend.epsilon()

    def _create_slots(self, var_list):
        for var in var_list:
            self.add_slot(var, 'm')
        for var in var_list:
            self.add_slot(var, 'v')

    def _resource_apply_dense(self, grad, var):
        dtype = var.dtype.base_dtype
        t = tf.cast(self.iterations + 1, dtype)
        lr = self._decayed_lr(dtype)
        beta_1 = self._get_hyper('beta_1', dtype)
        beta_2 = self._get_hyper('beta_2', dtype)
        epsilon = tf.convert_to_tensor(self.epsilon, dtype)
        m = self.get_slot(var, 'm')
        v = self.get_slot(var, 'v')
        m = m.assign(beta_1 * m + (1. - beta_1) * grad)
        v = v.assign(beta_2 * v + (1. - beta_2) * tf.square(grad))
        lr = lr * tf.sqrt(1 - tf.pow(beta_2, t)) / (1 - tf.pow(beta_1, t))
        update = -lr * m / (tf.sqrt(v) + epsilon)
        var_update = var.assign_add(update)
        updates = [var_update, m, v]
        return tf.group(*updates)

    def get_config(self):
        config = super().get_config()
        config.update({
            'learning_rate': self._serialize_hyperparameter('learning_rate'),
            'beta_1': self._serialize_hyperparameter('beta_1'),
            'beta_2': self._serialize_hyperparameter('beta_2'),
            'epsilon': self.epsilon,
        })
        return config
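# The subclass above plugs into the tf.keras.optimizers.Optimizer (TF 2.x OptimizerV2) API:
# _create_slots allocates per-variable state ('m' and 'v'), _resource_apply_dense performs a
# single parameter update given a dense gradient, and get_config makes the optimizer
# serializable, so it can be used anywhere a built-in Keras optimizer can.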
# chunk 15
model = MLP(4, 1)
optimizer = Adam(lr=1e-3, beta_1=.9, beta_2=.999, epsilon=1e-8)
model, history = train(model, optimizer, n_epochs=1000)
ax = history.plot(x='epoch', kind='line', figsize=(12, 6))
ax.get_figure().savefig('ch4_plot_6.png')
print(history.testing_loss_full.min())