@notwa
Last active April 5, 2018 05:14
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd.misc.flatten import flatten
from autograd.misc.optimizers import unflatten_optimizer
import itertools, types

def iterize(iterable):
    """Wrap a constant, sequence, or generator into an iterator;
    scalars are repeated indefinitely."""
    if type(iterable) in (tuple, list):
        iterator = iter(iterable)
    elif np.isscalar(iterable):
        iterator = itertools.repeat(iterable)
    elif isinstance(iterable, types.GeneratorType):
        iterator = iterable
    else:
        raise NotImplementedError("iterize: unhandled input type: " + str(type(iterable)))
    return iterator
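
# For illustration (not from the original gist): iterize(0.01) yields 0.01
# forever, iterize([1e-3, 1e-4]) steps through the list once, and a generator
# (e.g. the one returned by typical_scheduler below) is passed through as-is.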

def init_he_normal(size, ins, outs):
    """He (Kaiming) normal initialization."""
    s = np.sqrt(2 / ins)
    return npr.normal(0, s, size=size)

def init_he_uniform(size, ins, outs):
    """He (Kaiming) uniform initialization."""
    s = np.sqrt(6 / ins)
    return npr.uniform(-s, s, size=size)

def init_dense(size_in, size_out, init=init_he_uniform):
    """Construct [weights, biases] for a fully-connected layer."""
    return [init((size_in, size_out), size_in, size_out),
            np.zeros((1, size_out))]

def accuracy(p, t):
    """Fraction of samples whose predicted class (argmax) matches the target class."""
    correct = np.argmax(p, axis=-1) == np.argmax(t, axis=-1)
    return np.mean(correct)

def crossentropy(p, t, eps=1e-6):
    """Binary cross-entropy, summed over the last axis and averaged over samples."""
    p = np.clip(p, eps, 1 - eps)
    f = np.sum(-t * np.log(p) - (1 - t) * np.log(1 - p), axis=-1)
    return np.mean(f)

def dropout(x, amount):
    """Inverted dropout: drop a fraction `amount` of activations and rescale the rest."""
    p = 1 - amount
    mask = (npr.rand(*x.shape) < p) / p
    return x * mask

def layernorm(x, eps=1e-5):
    """Normalize to zero mean and unit variance along axis 0."""
    mean = np.mean(x, axis=0)
    center = x - mean
    var = np.var(center, axis=0) + eps
    std = np.sqrt(var)
    return center / std

def softmax(x):
    """Numerically-stable softmax over the last axis."""
    alpha = np.max(x, axis=-1, keepdims=True)
    num = np.exp(x - alpha)
    den = np.sum(num, axis=-1, keepdims=True)
    return num / den

def l1l2(params, l1=0.0, l2=0.0):
    """L1/L2 regularization penalty summed over all parameters."""
    x, _ = flatten(params)
    f = 0.0
    if l1:
        f = f + np.sum(l1 * np.abs(x))
    if l2:
        f = f + np.sum(l2 * np.square(x))
    return f

def l1l2avg(params, l1=0.0, l2=0.0):
    """L1/L2 regularization penalty averaged within each layer, summed across layers."""
    f = 0.0
    for layer in params:
        x, _ = flatten(layer)
        if l1:
            f = f + np.mean(l1 * np.abs(x))
        if l2:
            f = f + np.mean(l2 * np.square(x))
    return f

def slow_starter(x):
    """Rate shape equal to |sin(pi * x**2)|: ramps up slowly,
    peaks near x = 0.71, and returns to 0 at x = 1."""
    return np.sqrt((1 - np.cos(2 * np.pi * np.square(x))) / 2)

def wave_clr(x):
    """Half-cosine ramp from 0 at x = 0 to 1 at x = 1."""
    return 0.5 * (1 - np.cos(np.pi * x))

def sine_clr(x):
    """Sine arch: 0 at x = 0 and x = 1, peaking at x = 0.5."""
    return np.sin(np.pi * x)

def typical_scheduler(f, num_epochs, num_batches, num_restarts=1, base_rate=1.0):
    """Yield one learning rate per batch by sweeping x through
    the shaping function f, wrapping x for warm restarts."""
    num_epochs = max(num_epochs, 2)  # FIXME
    for epoch in range(num_epochs):
        for batch in range(num_batches):
            x = (epoch + batch / (num_batches - 1)) / (num_epochs - 1)
            x = x * num_restarts
            if batch != num_batches - 1 and epoch != num_epochs - 1:
                x = x % 1
            yield f(x) * base_rate

def batchize(x, t, batch_size, num_epochs=1):
    """Yield shuffled (inputs, targets) minibatches for the given number of epochs."""
    for _ in range(num_epochs):
        indices = np.arange(len(x))
        npr.shuffle(indices)
        for i in range(0, len(x), batch_size):
            batch_indices = indices[i:i + batch_size]
            yield x[batch_indices], t[batch_indices]

def mse(P, T):
    """Mean squared error with the conventional factor of 1/2."""
    return np.mean(np.square(P - T) / 2)

def huber(P, T, delta=1.0):
    """Huber loss: quadratic for residuals within delta, linear beyond."""
    R = P - T
    error = np.where(np.abs(R) <= delta,
                     np.square(R) / 2,
                     delta * (np.abs(R) - delta / 2))
    return np.mean(error)

@unflatten_optimizer
def madam(grad, x, callback=None, num_iters=100,
          step_size=0.001, b1=0.81, b2=0.99005, eps=1e-8,
          step_log=None, gradients=None):
    """The Adam optimizer, with tweaked defaults
    and allowing a generator or iterable for step_size."""
    step_size = iterize(step_size)
    m = np.zeros(len(x))
    v = np.zeros(len(x))
    for i in range(num_iters):
        g = grad(x, i)
        if callback: callback(x, i, g)
        m = (1 - b1) * g + b1 * m       # First moment estimate.
        v = (1 - b2) * (g**2) + b2 * v  # Second moment estimate.
        mhat = m / (1 - b1**(i + 1))    # Bias correction.
        vhat = v / (1 - b2**(i + 1))
        step = next(step_size) * mhat / (np.sqrt(vhat) + eps)
        x = x - step
        if step_log is not None:
            step_log.append(step.astype(np.float32))
        if gradients is not None:
            gradients.append(g.astype(np.float32))
    return x

@unflatten_optimizer
def mom(grad, x, callback=None, num_iters=100,
        step_size=1.0, mu=0.9,
        step_log=None, gradients=None):
    """SGD with plain momentum and compensated step sizes,
    plus the rest of the gizmos."""
    step_size = iterize(step_size)
    v = np.zeros(len(x))
    for i in range(num_iters):
        g = grad(x, i)
        if callback: callback(x, i, g)
        v = mu * v - next(step_size) * g
        step = -v * (1 - mu)
        x = x - step
        if step_log is not None:
            step_log.append(step.astype(np.float32))
        if gradients is not None:
            gradients.append(g.astype(np.float32))
    return x
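
if __name__ == "__main__":
    # Usage sketch (not part of the original gist): train a tiny two-layer
    # softmax classifier on synthetic data to show how the pieces above fit
    # together. Every shape and hyperparameter below is an arbitrary choice.
    from autograd import grad

    npr.seed(42)
    n, ins, hid, outs = 256, 8, 32, 4
    X = npr.randn(n, ins)
    T = np.eye(outs)[npr.randint(0, outs, size=n)]  # one-hot targets

    params = [init_dense(ins, hid), init_dense(hid, outs)]

    def predict(params, x):
        (W1, b1), (W2, b2) = params
        h = np.tanh(np.dot(x, W1) + b1)
        return softmax(np.dot(h, W2) + b2)

    batch_size, num_epochs = 32, 10
    num_batches = n // batch_size
    batches = list(batchize(X, T, batch_size, num_epochs=num_epochs))

    def objective(params, i):
        x, t = batches[i]
        return crossentropy(predict(params, x), t) + l1l2(params, l2=1e-4)

    # one learning rate per minibatch, shaped by the wave_clr curve above.
    rates = typical_scheduler(wave_clr, num_epochs, num_batches, base_rate=0.05)

    params = madam(grad(objective), params,
                   num_iters=num_epochs * num_batches, step_size=rates)

    print("training accuracy:", accuracy(predict(params, X), T))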