Model parallelism in Horovod
#!/usr/bin/env python
# Copyright 2018 Uber Technologies, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import tensorflow as tf
import horovod.tensorflow as hvd

layers = tf.contrib.layers
learn = tf.contrib.learn

tf.logging.set_verbosity(tf.logging.INFO)
def conv_model(feature, target, mode):
    """2-layer convolution model."""
    # Convert the target to a one-hot tensor of shape (batch_size, 10),
    # with an on-value of 1 for each one-hot vector of length 10.
    target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0)

    # Reshape feature to a 4-D tensor with the 2nd and 3rd dimensions being
    # image width and height, and the final dimension being the number of
    # color channels.
    feature = tf.reshape(feature, [-1, 28, 28, 1])

    # Horovod Model Parallelism: specify devices for particular layers.
    with tf.device('/gpu:0'):
        # First conv layer will compute 32 features for each 5x5 patch.
        with tf.variable_scope('conv_layer1'):
            h_conv1 = layers.conv2d(
                feature, 32, kernel_size=[5, 5], activation_fn=tf.nn.relu)
            h_pool1 = tf.nn.max_pool(
                h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                padding='SAME')

        # Second conv layer will compute 64 features for each 5x5 patch.
        with tf.variable_scope('conv_layer2'):
            h_conv2 = layers.conv2d(
                h_pool1, 64, kernel_size=[5, 5], activation_fn=tf.nn.relu)
            h_pool2 = tf.nn.max_pool(
                h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                padding='SAME')

        # Reshape tensor into a batch of vectors.
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])

    with tf.device('/gpu:1'):
        # Densely connected layer with 1024 neurons.
        h_fc1 = layers.dropout(
            layers.fully_connected(
                h_pool2_flat, 1024, activation_fn=tf.nn.relu),
            keep_prob=0.5,
            is_training=mode == tf.contrib.learn.ModeKeys.TRAIN)

        # Compute logits (1 per class) and compute loss.
        logits = layers.fully_connected(h_fc1, 10, activation_fn=None)
        loss = tf.losses.softmax_cross_entropy(target, logits)

    return tf.argmax(logits, 1), loss
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Download and load MNIST dataset.
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.contrib.framework.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable
        # states from rank 0 to all other processes. This is necessary to
        # ensure consistent initialization of all workers when training is
        # started with random weights or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]
    # Horovod Model Parallelism: pin the GPUs used by this process to its
    # local rank (two GPUs per process).
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = \
        '%d,%d' % (hvd.local_rank() * 2, hvd.local_rank() * 2 + 1)

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = mnist.train.next_batch(100)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})


if __name__ == "__main__":
    tf.app.run()
Currently trying to get model parallelism to work with Horovod and TensorFlow. However, when I run this, tf.device does not seem to take effect and all of the model layers stay on GPU 0. Adding an assertion after the relevant with tf.device block, such as:
assert x.device.endswith("/GPU:1")
will fail.
Still interested to see whether this works, or whether an alternative is available.
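One low-effort way to check placement is to let TensorFlow log its decisions rather than asserting on the .device string, which can differ in case and format from the '/gpu:1' spec used in the code. A minimal sketch, assuming the flags are added to the ConfigProto already built in main():

# Sketch (not part of the gist): make the session print each op's assigned
# device at creation time, and refuse to fall back silently to another device.
config.log_device_placement = True
config.allow_soft_placement = False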
Could you share an example with TF2 and Keras?
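A rough sketch of the same split with TF2 and horovod.tensorflow.keras might look like the code below. This is an untested assumption, not part of the gist: it keeps the two-GPUs-per-process layout, and whether Keras honors tf.device for every op is worth verifying with tf.debugging.set_log_device_placement(True).

# Rough TF2 / Keras sketch (assumption, not from the gist): conv block on the
# first visible GPU, dense head on the second, Horovod across processes.
import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()

# Assumption: two GPUs per process, selected by local rank as in the TF1 code.
gpus = tf.config.list_physical_devices('GPU')
tf.config.set_visible_devices(
    gpus[hvd.local_rank() * 2:hvd.local_rank() * 2 + 2], 'GPU')

inputs = tf.keras.Input(shape=(28, 28, 1))
with tf.device('/gpu:0'):
    x = tf.keras.layers.Conv2D(32, 5, activation='relu')(inputs)
    x = tf.keras.layers.MaxPooling2D()(x)
    x = tf.keras.layers.Conv2D(64, 5, activation='relu')(x)
    x = tf.keras.layers.MaxPooling2D()(x)
    x = tf.keras.layers.Flatten()(x)
with tf.device('/gpu:1'):
    x = tf.keras.layers.Dense(1024, activation='relu')(x)
    outputs = tf.keras.layers.Dense(10)(x)
model = tf.keras.Model(inputs, outputs)

# Horovod: scale the learning rate and wrap the optimizer, as in the TF1 code.
opt = hvd.DistributedOptimizer(tf.keras.optimizers.RMSprop(0.001 * hvd.size()))
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=opt)
# hvd.callbacks.BroadcastGlobalVariablesCallback(0) plays the role of the
# BroadcastGlobalVariablesHook when calling model.fit.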
Hello, I don't quite get it: how do you realize model parallelism here?
Is there a way to do this in PyTorch using Horovod?
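Horovod only averages gradients across processes, so inside each process a model can be split across GPUs with plain PyTorch device placement. Below is a minimal sketch under that assumption; SplitNet and its layer sizes are made up to mirror the MNIST model above, not taken from any official Horovod example.

# Minimal PyTorch + Horovod sketch (assumption, not an official example):
# the device split inside one process is plain PyTorch model parallelism;
# Horovod averages gradients across processes as usual.
import torch
import torch.nn as nn
import horovod.torch as hvd

hvd.init()
# Assumption: two GPUs per process, picked by local rank as in the TF example.
dev0 = torch.device('cuda', hvd.local_rank() * 2)
dev1 = torch.device('cuda', hvd.local_rank() * 2 + 1)

class SplitNet(nn.Module):
    """Toy model-parallel net: conv stack on dev0, dense head on dev1."""
    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 32, 5), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 5), nn.ReLU(), nn.MaxPool2d(2)).to(dev0)
        self.head = nn.Sequential(
            nn.Flatten(), nn.Linear(64 * 4 * 4, 1024), nn.ReLU(),
            nn.Linear(1024, 10)).to(dev1)

    def forward(self, x):
        x = self.features(x.to(dev0))
        return self.head(x.to(dev1))  # move activations between the two GPUs

model = SplitNet()
opt = torch.optim.SGD(model.parameters(), lr=0.01 * hvd.size())
opt = hvd.DistributedOptimizer(opt, named_parameters=model.named_parameters())
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(opt, root_rank=0)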