Skip to content

Instantly share code, notes, and snippets.

@alsrgv
Created January 27, 2018 06:20
Show Gist options
  • Save alsrgv/cbbb2e25f09c1df983098098511d16c4 to your computer and use it in GitHub Desktop.
Save alsrgv/cbbb2e25f09c1df983098098511d16c4 to your computer and use it in GitHub Desktop.
Model parallelism in Horovod
# Copyright 2018 Uber Technologies, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#!/usr/bin/env python
import tensorflow as tf
import horovod.tensorflow as hvd
layers = tf.contrib.layers
learn = tf.contrib.learn
tf.logging.set_verbosity(tf.logging.INFO)
def conv_model(feature, target, mode):
"""2-layer convolution model."""
# Convert the target to a one-hot tensor of shape (batch_size, 10) and
# with a on-value of 1 for each one-hot vector of length 10.
target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0)
# Reshape feature to 4d tensor with 2nd and 3rd dimensions being
# image width and height final dimension being the number of color channels.
feature = tf.reshape(feature, [-1, 28, 28, 1])
# Horovod Model Parallelism: specify devices for particular layers.
with tf.device('/gpu:0'):
# First conv layer will compute 32 features for each 5x5 patch
with tf.variable_scope('conv_layer1'):
h_conv1 = layers.conv2d(
feature, 32, kernel_size=[5, 5], activation_fn=tf.nn.relu)
h_pool1 = tf.nn.max_pool(
h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
# Second conv layer will compute 64 features for each 5x5 patch.
with tf.variable_scope('conv_layer2'):
h_conv2 = layers.conv2d(
h_pool1, 64, kernel_size=[5, 5], activation_fn=tf.nn.relu)
h_pool2 = tf.nn.max_pool(
h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
# reshape tensor into a batch of vectors
h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
with tf.device('/gpu:1'):
# Densely connected layer with 1024 neurons.
h_fc1 = layers.dropout(
layers.fully_connected(
h_pool2_flat, 1024, activation_fn=tf.nn.relu),
keep_prob=0.5,
is_training=mode == tf.contrib.learn.ModeKeys.TRAIN)
# Compute logits (1 per class) and compute loss.
logits = layers.fully_connected(h_fc1, 10, activation_fn=None)
loss = tf.losses.softmax_cross_entropy(target, logits)
return tf.argmax(logits, 1), loss
def main(_):
# Horovod: initialize Horovod.
hvd.init()
# Download and load MNIST dataset.
mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())
# Build model...
with tf.name_scope('input'):
image = tf.placeholder(tf.float32, [None, 784], name='image')
label = tf.placeholder(tf.float32, [None], name='label')
predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN)
# Horovod: adjust learning rate based on number of GPUs.
opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())
# Horovod: add Horovod Distributed Optimizer.
opt = hvd.DistributedOptimizer(opt)
global_step = tf.contrib.framework.get_or_create_global_step()
train_op = opt.minimize(loss, global_step=global_step)
hooks = [
# Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
# from rank 0 to all other processes. This is necessary to ensure consistent
# initialization of all workers when training is started with random weights
# or restored from a checkpoint.
hvd.BroadcastGlobalVariablesHook(0),
# Horovod: adjust number of steps based on number of GPUs.
tf.train.StopAtStepHook(last_step=20000 // hvd.size()),
tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
every_n_iter=10),
]
# Horovod Model Parallelism: pin GPUs to be used to process local rank (two GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = \
'%d,%d' % (hvd.local_rank() * 2, hvd.local_rank() * 2 + 1)
# Horovod: save checkpoints only on worker 0 to prevent other workers from
# corrupting them.
checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
# The MonitoredTrainingSession takes care of session initialization,
# restoring from a checkpoint, saving to a checkpoint, and closing when done
# or an error occurs.
with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
hooks=hooks,
config=config) as mon_sess:
while not mon_sess.should_stop():
# Run a training step synchronously.
image_, label_ = mnist.train.next_batch(100)
mon_sess.run(train_op, feed_dict={image: image_, label: label_})
if __name__ == "__main__":
tf.app.run()
@meetps
Copy link

meetps commented Jan 29, 2020

Is there a way to do this in PyTorch using Horovod?

@JohnTaylor2000
Copy link

Currently trying to get model parallelism to work with horovod and tensorflow. However when I run the tf.device command does not seem to work with all the model layers staying on GPU 0. Adding an assert command after the relevant with tf.device command such as:-

assert x.device.endswith("/GPU:1")

will fail.

@JohnTaylor2000
Copy link

Still interested to see if this works, or if an alternative is available.

@aprabh2
Copy link

aprabh2 commented Apr 6, 2022

could you share one example with tf2 and keras?

@etoilestar
Copy link

hello, I cannot get it how do you realize model parallelism?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment