pytorch distributed nccl communicator creation
# run the code with `torchrun --nproc-per-node 4 test.py`
import os
os.environ['NCCL_DEBUG'] = 'TRACE'
import torch
import torch.distributed as dist

# NCCL communicators are created lazily
dist.init_process_group(backend='nccl')
print("init done")

rank = dist.get_rank()
device = torch.device(f'cuda:{rank}')
data = torch.zeros((5, 5), device=device)

# the default communicator is created on the fly, when the first
# collective operation is requested
print("will create nccl communicator")
dist.all_reduce(data, op=dist.ReduceOp.SUM)

print("will create nccl communicator again")
new_group = dist.new_group(ranks=[0, 1])
# a collective operation on a new group creates another communicator
# (ranks that are not part of new_group skip this call with a warning)
dist.all_reduce(data, op=dist.ReduceOp.SUM, group=new_group)
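
As a follow-up note (not part of the original gist): newer PyTorch releases can also create the default NCCL communicator eagerly by passing a `device_id` to `init_process_group`. The snippet below is a minimal sketch assuming that argument is available in your PyTorch install (it appeared around 2.3); verify against your version before relying on it.

# minimal sketch, assuming a PyTorch version whose init_process_group
# accepts `device_id` (roughly 2.3+); run with `torchrun --nproc-per-node 4`
import os
import torch
import torch.distributed as dist

local_rank = int(os.environ["LOCAL_RANK"])  # set by torchrun
device = torch.device(f"cuda:{local_rank}")

# passing device_id asks PyTorch to set up the default NCCL communicator
# eagerly, instead of on the first collective as in the gist above
dist.init_process_group(backend="nccl", device_id=device)

data = torch.ones((5, 5), device=device)
dist.all_reduce(data, op=dist.ReduceOp.SUM)

# tear down the process group so communicators are destroyed cleanly
dist.destroy_process_group()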