Training GANs with Optimism using TensorFlow (https://github.com/vsyrgkanis/optimistic_GAN_training/)
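The AdamirrorOptimizer below is a drop-in variant of tf.train.AdamOptimizer that applies the optimistic update from the linked paper: each iteration first adds back the previous Adam step and then subtracts twice the new one,

    theta_{t+1} = theta_t + lr_t * m_{t-1} / (sqrt(v_{t-1}) + eps) - 2 * lr_t * m_t / (sqrt(v_t) + eps)

where m_t and v_t are the usual Adam moment estimates and lr_t is the bias-corrected learning rate of the current step (the implementation scales the added-back term with lr_t rather than lr_{t-1}). The optimizer module is saved as adamirror.py; a test file adapted from TensorFlow's Adam tests follows it.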
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow.python.eager import context
from tensorflow.python.framework import ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.training import optimizer

class AdamirrorOptimizer(optimizer.Optimizer):
  """Optimizer implementing the optimistic ("mirrored") variant of Adam.

  Each step first adds back the previous Adam step and then subtracts twice
  the new one, both scaled with the current bias-corrected learning rate:
    theta <- theta + lr_t * m_{t-1} / (sqrt(v_{t-1}) + eps)
                   - 2 * lr_t * m_t / (sqrt(v_t) + eps)
  """

  def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
               use_locking=False, name="Adamirror"):
    super(AdamirrorOptimizer, self).__init__(use_locking, name)
    self._lr = learning_rate
    self._beta1 = beta1
    self._beta2 = beta2
    self._epsilon = epsilon
    # Tensor versions of the constructor arguments, created in _prepare().
    self._lr_t = None
    self._beta1_t = None
    self._beta2_t = None
    self._epsilon_t = None

  def _get_beta_accumulators(self):
    if context.executing_eagerly():
      graph = None
    else:
      graph = ops.get_default_graph()
    return (self._get_non_slot_variable("beta1_power", graph=graph),
            self._get_non_slot_variable("beta2_power", graph=graph))

  def _create_slots(self, var_list):
    # Create the beta1 and beta2 accumulators on the same device as the first
    # variable. Sort the var_list to make sure this device is consistent across
    # workers (these need to go on the same PS, otherwise some updates are
    # silently ignored).
    first_var = min(var_list, key=lambda x: x.name)
    self._create_non_slot_variable(initial_value=self._beta1,
                                   name="beta1_power",
                                   colocate_with=first_var)
    self._create_non_slot_variable(initial_value=self._beta2,
                                   name="beta2_power",
                                   colocate_with=first_var)
    # Create slots for the first and second moments.
    for v in var_list:
      self._zeros_slot(v, "m", self._name)
      self._zeros_slot(v, "v", self._name)

  def _prepare(self):
    self._lr_t = ops.convert_to_tensor(self._lr, name="learning_rate")
    self._beta1_t = ops.convert_to_tensor(self._beta1, name="beta1")
    self._beta2_t = ops.convert_to_tensor(self._beta2, name="beta2")
    self._epsilon_t = ops.convert_to_tensor(self._epsilon, name="epsilon")

  def _apply_dense(self, grad, var):
    beta1_power, beta2_power = self._get_beta_accumulators()
    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    lr = lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)

    m = self.get_slot(var, "m")
    v = self.get_slot(var, "v")

    # Optimistic step, part 1: add back the previous Adam step (scaled with the
    # current bias-corrected learning rate).
    old_update = lr * m / (math_ops.sqrt(v) + epsilon_t)
    var_update = state_ops.assign_add(var, old_update, use_locking=self._use_locking)
    with ops.control_dependencies([var_update]):
      # Part 2: update the moment estimates and subtract twice the new Adam step.
      m_t = state_ops.assign(m,
                             m * beta1_t + (grad * (1 - beta1_t)),
                             use_locking=self._use_locking)
      v_t = state_ops.assign(v,
                             v * beta2_t + ((grad * grad) * (1 - beta2_t)),
                             use_locking=self._use_locking)
      var_update = state_ops.assign_sub(var,
                                        2 * lr * m_t / (math_ops.sqrt(v_t) + epsilon_t),
                                        use_locking=self._use_locking)
    return control_flow_ops.group(*[var_update, m_t, v_t])

  def _resource_apply_dense(self, grad, handle):
    return self._apply_dense(grad, handle)

  def _finish(self, update_ops, name_scope):
    # Update the power accumulators.
    with ops.control_dependencies(update_ops):
      beta1_power, beta2_power = self._get_beta_accumulators()
      with ops.colocate_with(beta1_power):
        update_beta1 = beta1_power.assign(beta1_power * self._beta1_t,
                                          use_locking=self._use_locking)
        update_beta2 = beta2_power.assign(beta2_power * self._beta2_t,
                                          use_locking=self._use_locking)
    return control_flow_ops.group(*update_ops + [update_beta1, update_beta2],
                                  name=name_scope)
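For context, a minimal sketch of how the optimizer might be plugged into a GAN training loop. It assumes TF 1.x graph mode and that the file above is saved as adamirror.py (the module name the test file imports); the toy generator, discriminator, data, and hyperparameters are purely illustrative and not part of the gist.

import tensorflow as tf

from adamirror import AdamirrorOptimizer


def linear(x, n_out, scope, reuse=False):
  """A single dense layer built from tf.get_variable so weights can be shared."""
  n_in = x.get_shape().as_list()[-1]
  with tf.variable_scope(scope, reuse=reuse):
    w = tf.get_variable("w", [n_in, n_out],
                        initializer=tf.random_normal_initializer(stddev=0.1))
    b = tf.get_variable("b", [n_out], initializer=tf.zeros_initializer())
    return tf.matmul(x, w) + b


def generator(z):
  h = tf.nn.relu(linear(z, 16, "g_h1"))
  return linear(h, 1, "g_out")


def discriminator(x, reuse=False):
  h = tf.nn.relu(linear(x, 16, "d_h1", reuse=reuse))
  return linear(h, 1, "d_out", reuse=reuse)


z = tf.random_normal([64, 4])
real = tf.random_normal([64, 1], mean=3.0)   # toy "data" distribution N(3, 1)
fake = generator(z)
d_real = discriminator(real)
d_fake = discriminator(fake, reuse=True)

bce = tf.nn.sigmoid_cross_entropy_with_logits
d_loss = (tf.reduce_mean(bce(labels=tf.ones_like(d_real), logits=d_real)) +
          tf.reduce_mean(bce(labels=tf.zeros_like(d_fake), logits=d_fake)))
g_loss = tf.reduce_mean(bce(labels=tf.ones_like(d_fake), logits=d_fake))

d_vars = [v for v in tf.trainable_variables() if v.name.startswith("d_")]
g_vars = [v for v in tf.trainable_variables() if v.name.startswith("g_")]

# One optimistic optimizer per player, with alternating single steps.
d_train = AdamirrorOptimizer(1e-4).minimize(d_loss, var_list=d_vars)
g_train = AdamirrorOptimizer(1e-4).minimize(g_loss, var_list=g_vars)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for _ in range(1000):
    sess.run(d_train)
    sess.run(g_train)

The same optimistic update is applied to both players here, which is the two-player saddle-point setting the optimism argument in the linked paper targets.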
"""Tests for Adamirror.""" | |
from __future__ import absolute_import | |
from __future__ import division | |
from __future__ import print_function | |
import numpy as np | |
from tensorflow.python.client import session | |
from tensorflow.python.eager import context | |
from tensorflow.python.framework import constant_op | |
from tensorflow.python.framework import dtypes | |
from tensorflow.python.framework import ops | |
from tensorflow.python.framework import test_util | |
from tensorflow.python.ops import resource_variable_ops | |
from tensorflow.python.ops import variables | |
from tensorflow.python.platform import test | |
from .adamirror import AdamirrorOptimizer | |
def adam_update_numpy(param, g_t, t, m, v, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
  # NumPy reference for the Adamirror update: add back the previous Adam step,
  # then subtract twice the new one, both scaled with the current bias-corrected
  # step size. This mirrors exactly what AdamirrorOptimizer._apply_dense applies.
  alpha_t = alpha * np.sqrt(1 - beta2**t) / (1 - beta1**t)
  m_t = beta1 * m + (1 - beta1) * g_t
  v_t = beta2 * v + (1 - beta2) * g_t * g_t
  param_t = (param + alpha_t * m / (np.sqrt(v) + epsilon)
             - 2 * alpha_t * m_t / (np.sqrt(v_t) + epsilon))
  return param_t, m_t, v_t

class AdamirrorOptimizerTest(test.TestCase):

  def doTestBasic(self, use_resource=False):
    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
      with self.test_session(graph=ops.Graph()):
        # Initialize variables for numpy implementation.
        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
        epsilon = 1e-7 if dtype == dtypes.half else 1e-8

        if use_resource:
          var0 = resource_variable_ops.ResourceVariable(var0_np, name="var0_%d" % i)
          var1 = resource_variable_ops.ResourceVariable(var1_np, name="var1_%d" % i)
        else:
          var0 = variables.Variable(var0_np)
          var1 = variables.Variable(var1_np)
        grads0 = constant_op.constant(grads0_np)
        grads1 = constant_op.constant(grads1_np)

        opt = AdamirrorOptimizer(epsilon=epsilon)
        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
        opt_variables = opt.variables()
        beta1_power, beta2_power = opt._get_beta_accumulators()
        self.assertTrue(beta1_power is not None)
        self.assertTrue(beta2_power is not None)
        self.assertIn(beta1_power, opt_variables)
        self.assertIn(beta2_power, opt_variables)

        with ops.Graph().as_default():
          # Shouldn't return non-slot variables from other graphs.
          self.assertEqual(0, len(opt.variables()))

        if not context.executing_eagerly():
          self.evaluate(variables.global_variables_initializer())
          # Fetch params to validate initial values.
          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
          self.assertAllClose([3.0, 4.0], self.evaluate(var1))

        beta1_power, beta2_power = opt._get_beta_accumulators()

        # Run 3 steps of Adam
        for t in range(1, 4):
          if not context.executing_eagerly():
            self.evaluate(update)
          elif t > 1:
            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

          self.assertAllCloseAccordingToType(0.9**(t + 1), self.evaluate(beta1_power))
          self.assertAllCloseAccordingToType(0.999**(t + 1), self.evaluate(beta2_power))

          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0, epsilon=epsilon)
          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1, epsilon=epsilon)

          # Validate updated params
          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
          if use_resource:
            self.assertEqual("var0_%d/Adamirror:0" % (i,),
                             opt.get_slot(var=var0, name="m").name)

  def testBasic(self):
    with self.test_session():
      self.doTestBasic(use_resource=False)

  @test_util.run_in_graph_and_eager_modes(reset_test=True)
  def testResourceBasic(self):
    self.doTestBasic(use_resource=True)

  def testTensorLearningRate(self):
    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
      with self.test_session():
        # Initialize variables for numpy implementation.
        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
        epsilon = 1e-7 if dtype == dtypes.half else 1e-8

        var0 = variables.Variable(var0_np)
        var1 = variables.Variable(var1_np)
        grads0 = constant_op.constant(grads0_np)
        grads1 = constant_op.constant(grads1_np)
        opt = AdamirrorOptimizer(constant_op.constant(0.001), epsilon=epsilon)
        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
        variables.global_variables_initializer().run()

        # Fetch params to validate initial values
        self.assertAllClose([1.0, 2.0], var0.eval())
        self.assertAllClose([3.0, 4.0], var1.eval())

        beta1_power, beta2_power = opt._get_beta_accumulators()

        # Run 3 steps of Adam
        for t in range(1, 4):
          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
          update.run()

          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0, epsilon=epsilon)
          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1, epsilon=epsilon)

          # Validate updated params
          self.assertAllCloseAccordingToType(var0_np, var0.eval())
          self.assertAllCloseAccordingToType(var1_np, var1.eval())

  def testSharing(self):
    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
      with self.test_session():
        # Initialize variables for numpy implementation.
        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
        epsilon = 1e-7 if dtype == dtypes.half else 1e-8

        var0 = variables.Variable(var0_np)
        var1 = variables.Variable(var1_np)
        grads0 = constant_op.constant(grads0_np)
        grads1 = constant_op.constant(grads1_np)
        opt = AdamirrorOptimizer(epsilon=epsilon)
        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
        variables.global_variables_initializer().run()

        beta1_power, beta2_power = opt._get_beta_accumulators()

        # Fetch params to validate initial values
        self.assertAllClose([1.0, 2.0], var0.eval())
        self.assertAllClose([3.0, 4.0], var1.eval())

        # Run 3 steps of intertwined Adam1 and Adam2.
        for t in range(1, 4):
          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
          if t % 2 == 0:
            update1.run()
          else:
            update2.run()

          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0, epsilon=epsilon)
          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1, epsilon=epsilon)

          # Validate updated params
          self.assertAllCloseAccordingToType(var0_np, var0.eval())
          self.assertAllCloseAccordingToType(var1_np, var1.eval())

  def testTwoSessions(self):
    optimizer = AdamirrorOptimizer()
    g = ops.Graph()
    with g.as_default():
      with session.Session():
        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
        grads0 = constant_op.constant(np.array([0.1, 0.1]))
        optimizer.apply_gradients([(grads0, var0)])

    gg = ops.Graph()
    with gg.as_default():
      with session.Session():
        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
        grads0 = constant_op.constant(np.array([0.1, 0.1]))

        # If the optimizer saves any state not keyed by graph the following line
        # fails.
        optimizer.apply_gradients([(grads0, var0)])

  def testSlotsUniqueEager(self):
    with context.eager_mode():
      v1 = resource_variable_ops.ResourceVariable(1.)
      v2 = resource_variable_ops.ResourceVariable(1.)
      opt = AdamirrorOptimizer(1.)
      opt.minimize(lambda: v1 + v2)
      # There should be two non-slot variables (beta1_power, beta2_power) plus
      # an "m" and a "v" slot for each of v1 and v2, i.e. six unique variables.
      self.assertEqual(6, len(set(opt.variables())))


if __name__ == "__main__":
  test.main()
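Note on running the tests: because of the relative import (from .adamirror import AdamirrorOptimizer), the test module has to sit next to adamirror.py inside a package and be executed as a module, e.g. python -m your_package.adamirror_test (the package and test-module names here are placeholders, not part of the gist).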