Replacing MirroredStrategy with CollectiveAllReduceStrategy
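CollectiveAllReduceStrategy is the multi-worker counterpart of the single-machine MirroredStrategy: gradients are all-reduced with collective ops across the GPUs of every worker rather than only across local devices. The gist only changes which strategy gets returned; it does not show how the cluster itself is described. The sketch below assumes the conventional TF_CONFIG environment variable describes a two-worker job; the host names, ports, and worker index are placeholders, and whether this particular TF 1.x contrib strategy picks up its cluster from TF_CONFIG in the author's setup is an assumption, not something the gist confirms.

# Hypothetical cluster description, not part of the gist: each worker process
# would export its own TF_CONFIG before the strategy is constructed.
import json
import os

os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {"worker": ["worker0.example.com:2222",
                           "worker1.example.com:2222"]},
    "task": {"type": "worker", "index": 0},  # 1 on the second worker
})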
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Helper functions for running models in a distributed setting."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf


def get_distribution_strategy(num_gpus, all_reduce_alg=None):
  """Return a DistributionStrategy for running the model.

  Args:
    num_gpus: Number of GPUs to run this model.
    all_reduce_alg: Specify which algorithm to use when performing all-reduce.
      See tf.contrib.distribute.AllReduceCrossTowerOps for available
      algorithms. If None, DistributionStrategy will choose based on device
      topology.

  Returns:
    tf.contrib.distribute.DistributionStrategy object.
  """
  if num_gpus == 0:
    return tf.contrib.distribute.OneDeviceStrategy("device:CPU:0")
  elif num_gpus == 1:
    return tf.contrib.distribute.OneDeviceStrategy("device:GPU:0")
  else:
    if all_reduce_alg:
      return tf.contrib.distribute.MirroredStrategy(
          num_gpus=num_gpus,
          cross_tower_ops=tf.contrib.distribute.AllReduceCrossTowerOps(
              all_reduce_alg, num_packs=num_gpus))
    else:
      # Default multi-GPU path: use the multi-worker collective all-reduce
      # strategy instead of the single-machine MirroredStrategy it replaces.
      return tf.contrib.distribute.CollectiveAllReduceStrategy(
          num_gpus_per_worker=num_gpus)
      # return tf.contrib.distribute.MirroredStrategy(num_gpus=num_gpus)


def per_device_batch_size(batch_size, num_gpus):
  """For multi-GPU, batch size must be a multiple of the number of GPUs.

  Note that this should eventually be handled by DistributionStrategies
  directly. Multi-GPU support is currently experimental, however, so doing
  the work here until that feature is in place.

  Args:
    batch_size: Global batch size to be divided among devices. This should be
      equal to num_gpus times the single-GPU batch size for multi-GPU training.
    num_gpus: How many GPUs are used with DistributionStrategies.

  Returns:
    Batch size per device.

  Raises:
    ValueError: if batch_size is not divisible by the number of devices.
  """
  if num_gpus <= 1:
    return batch_size

  remainder = batch_size % num_gpus
  if remainder:
    err = ("When running with multiple GPUs, batch size "
           "must be a multiple of the number of available GPUs. Found {} "
           "GPUs with a batch size of {}; try --batch_size={} instead."
           ).format(num_gpus, batch_size, batch_size - remainder)
    raise ValueError(err)
  return int(batch_size / num_gpus)
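For context, a minimal sketch of how these helpers are typically consumed with the TF 1.x Estimator API: the strategy returned by get_distribution_strategy is handed to tf.estimator.RunConfig through its train_distribute argument, and the input pipeline is built with the per-device batch size. model_fn, input_fn, the flags object, the model_dir path, and max_steps are hypothetical placeholders, not part of the gist.

# Usage sketch (assumption, not from the gist): wire the strategy into an
# Estimator and size the input pipeline per device.
def run_training(model_fn, input_fn, flags):
  strategy = get_distribution_strategy(flags.num_gpus)
  run_config = tf.estimator.RunConfig(train_distribute=strategy)
  estimator = tf.estimator.Estimator(
      model_fn=model_fn, model_dir="/tmp/model_dir", config=run_config)
  # Keep the global batch equal to flags.batch_size by giving each device
  # an equal share.
  device_batch_size = per_device_batch_size(flags.batch_size, flags.num_gpus)
  estimator.train(input_fn=lambda: input_fn(device_batch_size),
                  max_steps=flags.max_steps)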