import itertools
import types

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd.misc.flatten import flatten
from autograd.misc.optimizers import unflatten_optimizer
def iterize(iterable):
    """Turn a scalar, sequence, or generator into an iterator;
    scalars repeat forever."""
    if type(iterable) in (tuple, list):
        iterator = iter(iterable)
    elif np.isscalar(iterable):
        iterator = itertools.repeat(iterable)
    elif isinstance(iterable, types.GeneratorType):
        iterator = iterable
    else:
        raise NotImplementedError("iterize: unhandled input type: "
                                  + str(type(iterable)))
    return iterator
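# A small usage sketch (not part of the original gist): iterize lets a
# hyperparameter like step_size be a constant, a list, or a generator,
# all consumed the same way by the optimizers below.
def _demo_iterize():
    constant = iterize(0.01)                # scalar -> repeats forever
    schedule = iterize([0.1, 0.01])         # list -> plain iterator
    print(next(constant), next(constant))   # 0.01 0.01
    print(next(schedule), next(schedule))   # 0.1 0.01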
def init_he_normal(size, ins, outs):
    """He (Kaiming) normal initialization."""
    s = np.sqrt(2 / ins)
    return npr.normal(0, s, size=size)
def init_he_uniform(size, ins, outs):
    """He (Kaiming) uniform initialization."""
    s = np.sqrt(6 / ins)
    return npr.uniform(-s, s, size=size)
def init_dense(size_in, size_out, init=init_he_uniform):
    """Create [weights, biases] for a dense layer."""
    return [init((size_in, size_out), size_in, size_out),
            np.zeros((1, size_out))]
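# An illustrative sketch (the sizes and the helper name are assumptions, not
# from the original gist): a plain MLP as a list of [W, b] layers, which is
# the nested layout the flatten-based helpers below expect.
def _demo_init_mlp(sizes=(784, 128, 10)):
    return [init_dense(ins, outs)
            for ins, outs in zip(sizes[:-1], sizes[1:])]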
def accuracy(p, t):
    """Fraction of samples whose argmax prediction matches the target."""
    correct = np.argmax(p, axis=-1) == np.argmax(t, axis=-1)
    return np.mean(correct)
def crossentropy(p, t, eps=1e-6):
    """Element-wise binary cross-entropy, summed over the last axis."""
    p = np.clip(p, eps, 1 - eps)
    f = np.sum(-t * np.log(p) - (1 - t) * np.log(1 - p), axis=-1)
    return np.mean(f)
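# A minimal sanity check (shapes are illustrative): since crossentropy is the
# element-wise binary form summed over the last axis, it also accepts
# one-hot targets.
def _demo_metrics():
    t = np.eye(3)                    # one-hot targets for 3 samples
    p = softmax(npr.randn(3, 3))     # random predictions
    print("acc:", accuracy(p, t), "ce:", crossentropy(p, t))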
def dropout(x, amount):
    """Inverted dropout: drop `amount` of the activations and rescale the
    survivors so the expected output matches the input."""
    p = 1 - amount
    mask = (npr.rand(*x.shape) < p) / p
    return x * mask
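# A quick check (illustrative): because the mask is divided by the keep
# probability, the mean of the output stays close to the mean of the input,
# so no rescaling is needed at test time.
def _demo_dropout():
    x = np.ones((10000, 4))
    y = dropout(x, amount=0.3)
    print(np.mean(y))  # roughly 1.0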
def layernorm(x, eps=1e-5):
    """Normalize each sample to zero mean and unit variance over its
    features (the last axis)."""
    mean = np.mean(x, axis=-1, keepdims=True)
    center = x - mean
    var = np.var(center, axis=-1, keepdims=True) + eps
    std = np.sqrt(var)
    return center / std
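# A minimal sketch (sizes are illustrative): after layernorm, each sample has
# roughly zero mean and unit variance across its features, regardless of the
# input's scale and offset.
def _demo_layernorm():
    y = layernorm(npr.randn(8, 32) * 5 + 3)
    print(np.mean(y, axis=-1)[:2], np.std(y, axis=-1)[:2])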
def softmax(x):
    """Numerically stable softmax over the last axis."""
    alpha = np.max(x, axis=-1, keepdims=True)
    num = np.exp(x - alpha)
    den = np.sum(num, axis=-1, keepdims=True)
    return num / den
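# A tiny numerical check (values are illustrative): subtracting the row max
# keeps exp() from overflowing on large logits, and each row still sums to 1.
def _demo_softmax():
    p = softmax(np.array([[1000.0, 1001.0, 1002.0]]))
    print(p, np.sum(p))  # finite values summing to 1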
def l1l2(params, l1=0.0, l2=0.0):
    """L1/L2 penalty summed over all parameters."""
    x, _ = flatten(params)
    f = 0.0
    if l1:
        f = f + np.sum(l1 * np.abs(x))
    if l2:
        f = f + np.sum(l2 * np.square(x))
    return f
def l1l2avg(params, l1=0.0, l2=0.0):
    """L1/L2 penalty averaged within each layer, then summed over layers."""
    f = 0.0
    for layer in params:
        x, _ = flatten(layer)
        if l1:
            f = f + np.mean(l1 * np.abs(x))
        if l2:
            f = f + np.mean(l2 * np.square(x))
    return f
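# Usage sketch (the coefficients are illustrative): add either penalty to the
# training loss; l1l2 sums over every parameter, while l1l2avg averages
# within each layer so the penalty does not grow with layer size.
def _demo_regularizers():
    params = _demo_init_mlp()
    print(l1l2(params, l2=1e-4), l1l2avg(params, l2=1e-4))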
def slow_starter(x):
    """Learning-rate shape on [0, 1]; equals |sin(pi * x**2)|:
    a slow warm-up with a late peak."""
    return np.sqrt((1 - np.cos(2 * np.pi * np.square(x))) / 2)
def wave_clr(x):
    """Cyclical learning-rate shape on [0, 1]: raised-cosine ramp from 0 to 1."""
    return 0.5 * (1 - np.cos(np.pi * x))
def sine_clr(x):
    """Cyclical learning-rate shape on [0, 1]: half-sine bump."""
    return np.sin(np.pi * x)
def typical_scheduler(f, num_epochs, num_batches, num_restarts=1, base_rate=1.0):
    """Yield one learning rate per batch: training progress in [0, 1] is
    scaled by num_restarts, wrapped at each restart, and passed through
    the shape function f."""
    num_epochs = max(num_epochs, 2)  # FIXME
    num_steps = num_epochs * num_batches
    for epoch in range(num_epochs):
        for batch in range(num_batches):
            x = (epoch * num_batches + batch) / (num_steps - 1)
            x = x * num_restarts
            # Wrap around at each restart, except at the very last step.
            if batch != num_batches - 1 or epoch != num_epochs - 1:
                x = x % 1
            yield f(x) * base_rate
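# A short sketch of how the pieces combine (all counts are illustrative):
# typical_scheduler drives a shape function like sine_clr and yields one
# learning rate per batch.
def _demo_schedule():
    rates = list(typical_scheduler(sine_clr, num_epochs=3,
                                   num_batches=4, base_rate=0.1))
    print(len(rates), rates[:3])  # 12 rates following a half-sine shape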
def batchize(x, t, batch_size, num_epochs=1):
    """Shuffle once per epoch, then yield (x, t) minibatches."""
    for epoch in range(num_epochs):
        indices = np.arange(len(x))
        npr.shuffle(indices)
        for i in range(0, len(x), batch_size):
            batch_indices = indices[i:i + batch_size]
            yield x[batch_indices], t[batch_indices]
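# Usage sketch (shapes are illustrative): the last batch of an epoch may be
# smaller than batch_size when the dataset size isn't a multiple of it.
def _demo_batchize():
    x, t = npr.randn(10, 3), npr.randn(10, 1)
    for bx, bt in batchize(x, t, batch_size=4):
        print(bx.shape, bt.shape)  # (4, 3) (4, 1) twice, then (2, 3) (2, 1)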
def mse(P, T):
    """Mean squared error (with the conventional factor of 1/2)."""
    return np.mean(np.square(P - T) / 2)
def huber(P, T, delta=1.0):
    """Huber loss: quadratic for |residual| <= delta, linear beyond."""
    R = P - T
    error = np.where(np.abs(R) <= delta,
                     np.square(R) / 2,
                     delta * (np.abs(R) - delta / 2))
    return np.mean(error)
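# A quick comparison (values are illustrative): huber matches mse on small
# residuals but grows only linearly on large ones, so outliers pull less.
def _demo_losses():
    P, T = np.array([0.1, 5.0]), np.array([0.0, 0.0])
    print(mse(P, T), huber(P, T))  # huber is much smaller on the outlier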
@unflatten_optimizer
def madam(grad, x, callback=None, num_iters=100,
          step_size=0.001, b1=0.81, b2=0.99005, eps=1e-8,
          step_log=None, gradients=None):
    """The Adam optimizer, with tweaked defaults
    and allowing a generator or iterable for step_size."""
    step_size = iterize(step_size)
    m = np.zeros(len(x))
    v = np.zeros(len(x))
    for i in range(num_iters):
        g = grad(x, i)
        if callback: callback(x, i, g)
        m = (1 - b1) * g + b1 * m  # First moment estimate.
        v = (1 - b2) * (g**2) + b2 * v  # Second moment estimate.
        mhat = m / (1 - b1**(i + 1))  # Bias correction.
        vhat = v / (1 - b2**(i + 1))
        step = next(step_size) * mhat / (np.sqrt(vhat) + eps)
        x = x - step
        if step_log is not None:
            step_log.append(step.astype(np.float32))
        if gradients is not None:
            gradients.append(g.astype(np.float32))
    return x
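# Usage sketch on a hypothetical toy problem (not from the original gist):
# because of @unflatten_optimizer, madam is called with a gradient function
# of (params, iteration) plus the initial parameters, here minimizing a
# two-dimensional quadratic.
def _demo_madam():
    from autograd import grad
    def loss(params, i):
        return np.square(params[0] - 3.0) + np.square(params[1] + 1.0)
    x = madam(grad(loss), np.zeros(2), num_iters=2000, step_size=0.01)
    print(x)  # approaches [3.0, -1.0]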
@unflatten_optimizer
def mom(grad, x, callback=None, num_iters=100,
        step_size=1.0, mu=0.9,
        step_log=None, gradients=None):
    """SGD with plain momentum and compensated step sizes,
    plus the rest of the gizmos."""
    step_size = iterize(step_size)
    v = np.zeros(len(x))
    for i in range(num_iters):
        g = grad(x, i)
        if callback: callback(x, i, g)
        v = mu * v - next(step_size) * g
        step = -v * (1 - mu)  # Scale by (1 - mu) to compensate for momentum.
        x = x - step
        if step_log is not None:
            step_log.append(step.astype(np.float32))
        if gradients is not None:
            gradients.append(g.astype(np.float32))
    return x
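# An end-to-end sketch tying the pieces together (all sizes, data, and
# hyperparameters are illustrative, not from the original gist): a one-layer
# softmax classifier on random data, trained with mom, a sine_clr schedule,
# and an L2 penalty.
def _demo_training():
    from autograd import grad
    x, t = npr.randn(64, 8), np.eye(4)[npr.randint(4, size=64)]
    params = init_dense(8, 4)
    batches = list(batchize(x, t, batch_size=16, num_epochs=10))
    rates = typical_scheduler(sine_clr, 10, 4, base_rate=0.5)
    def loss(params, i):
        bx, bt = batches[i]
        W, b = params
        p = softmax(np.dot(bx, W) + b)
        return crossentropy(p, bt) + l1l2(params, l2=1e-5)
    params = mom(grad(loss), params, num_iters=len(batches), step_size=rates)
    W, b = params
    print("final accuracy:", accuracy(softmax(np.dot(x, W) + b), t))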