@bearpelican
Created September 13, 2018 22:40
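# Minimal script to reproduce a hang observed when combining fp16 (half precision)
# training with DistributedDataParallel over the NCCL backend; see the
# 'Deadlock may happen here if fp16 enabled' print in main() below.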
import argparse
import os
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.utils.data
import torch.utils.data.distributed
from torch.nn.parallel import DistributedDataParallel
from torchvision.models import resnet50

cudnn.benchmark = True

def get_parser():
    parser = argparse.ArgumentParser(description='PyTorch Minimal Distributed')
    parser.add_argument('--disable-fp16', action='store_true',
                        help='Disable fp16 (half precision) training and run in fp32')
    parser.add_argument('--dist-url', default='env://', type=str,
                        help='url used to set up distributed training')
    parser.add_argument('--dist-backend', default='nccl', type=str, help='distributed backend')
    parser.add_argument('--local_rank', default=0, type=int,
                        help='Used for multi-process training. Can either be manually set ' +
                             'or automatically set by using \'python -m multiproc\'.')
    return parser

args = get_parser().parse_args()

# With init_method='env://', the launcher is expected to set these environment variables.
def env_world_size(): return int(os.environ['WORLD_SIZE'])
def env_rank(): return int(os.environ['RANK'])

def sum_tensor(tensor):
    # All-reduce a copy of the tensor across all processes and return the summed result.
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    return rt

def BN_convert_float(module):
    # Keep BatchNorm layers in fp32 for numerical stability while the rest
    # of the network runs in fp16.
    if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
        module.float()
    for child in module.children():
        BN_convert_float(child)
    return module

def network_to_half(network):
    return BN_convert_float(network.half())

def main():
    print('Distributed initializing process group')
    torch.cuda.set_device(args.local_rank)
    # init_process_group returns None; DistributedDataParallel will use the
    # default process group created here.
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                            world_size=env_world_size(), rank=env_rank())
    assert env_world_size() == dist.get_world_size()
    print("Distributed: success (%d/%d)" % (args.local_rank, dist.get_world_size()))

    print('Loading model')
    model = resnet50().cuda()
    # Convert the model to half precision, keeping BatchNorm layers in fp32.
    # if not args.disable_fp16: model = model.half()
    if not args.disable_fp16: model = network_to_half(model)

    print('Loading distributed')
    model = DistributedDataParallel(model, device_ids=[args.local_rank],
                                    output_device=args.local_rank)
    print('Distributed model loaded')

    print('Deadlock may happen here if fp16 enabled')
    tensor = torch.tensor([1.0]).float().cuda()
    print('Creating tensor:', tensor.item())
    output = sum_tensor(tensor)
    print('Able to sync machines:', output.item())


if __name__ == '__main__':
    main()
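
# Usage sketch (an assumption, not part of the original gist): launching with the
# standard torch.distributed.launch helper, which sets WORLD_SIZE and RANK in the
# environment and passes --local_rank to each process. The script filename and the
# GPU count below are placeholders.
#
#   python -m torch.distributed.launch --nproc_per_node=<num_gpus> minimal_distributed.py
#
# Pass --disable-fp16 to run the same all-reduce test in fp32.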