@youkaichao
Created May 9, 2024 05:54
pytorch distributed nccl communicator creation
# run the code with `torchrun --nproc-per-node 4 test.py`
import os
os.environ['NCCL_DEBUG'] = 'TRACE'
import torch
import torch.distributed as dist
# NCCL communicators are created lazily: init_process_group sets up the
# default process group, but no NCCL communicator exists yet
dist.init_process_group(backend='nccl')
print("init done")
rank = dist.get_rank()
device = torch.device(f'cuda:{rank}')
torch.cuda.set_device(device)
data = torch.zeros((5, 5), device=device)
# the communicator is created on the fly when we request a collective operation
print("will create nccl communicator")
dist.all_reduce(data, op=dist.ReduceOp.SUM)
print("will create nccl communicator again")
new_group = dist.new_group(ranks=[0, 1])
# a collective operation on the new group creates another communicator;
# only ranks 0 and 1 belong to it, so the other ranks skip this call
if rank in (0, 1):
    dist.all_reduce(data, op=dist.ReduceOp.SUM, group=new_group)
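
A related sketch, assuming PyTorch 2.3 or newer where init_process_group accepts a device_id argument (check your installed version): passing the device up front lets NCCL form the default communicator eagerly during initialization instead of at the first collective, and lets sub-groups be split off the existing communicator where possible. The file name eager_init.py below is only illustrative.

# sketch assuming PyTorch >= 2.3; run with `torchrun --nproc-per-node 4 eager_init.py`
import os
os.environ['NCCL_DEBUG'] = 'TRACE'
import torch
import torch.distributed as dist

rank = int(os.environ['RANK'])  # set by torchrun
device = torch.device(f'cuda:{rank}')
torch.cuda.set_device(device)

# with device_id given, the NCCL communicator for the default group is
# created eagerly here rather than at the first collective
dist.init_process_group(backend='nccl', device_id=device)
print("init done, default communicator already created")

data = torch.zeros((5, 5), device=device)
# no communicator creation should happen here; it already exists
dist.all_reduce(data, op=dist.ReduceOp.SUM)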