@youkaichao
Created May 9, 2024 05:54
pytorch distributed nccl communicator creation
# run the code with `torchrun --nproc-per-node 4 test.py`
import os
os.environ['NCCL_DEBUG'] = 'TRACE'
import torch
import torch.distributed as dist
# NCCL communicators are created lazily: init_process_group sets up the
# default process group, but no NCCL communicator exists yet
dist.init_process_group(backend='nccl')
print("init done")
rank = dist.get_rank()
device = torch.device(f'cuda:{rank}')
torch.cuda.set_device(device)
data = torch.zeros((5, 5), device=device)
# the communicator is created on the fly when we request a collective operation
print("will create nccl communicator")
dist.all_reduce(data, op=dist.ReduceOp.SUM)
print("will create nccl communicator again")
new_group = dist.new_group(ranks=[0, 1])
# a collective operation on the new group creates another communicator;
# only ranks 0 and 1 belong to it, so the other ranks skip this call
if rank in (0, 1):
    dist.all_reduce(data, op=dist.ReduceOp.SUM, group=new_group)
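
A related sketch, assuming PyTorch 2.3 or newer where init_process_group accepts a device_id argument (check your installed version): passing the device up front lets NCCL form the default communicator eagerly during initialization instead of at the first collective, and lets sub-groups be split off the existing communicator where possible. The file name eager_init.py below is only illustrative.

# sketch assuming PyTorch >= 2.3; run with `torchrun --nproc-per-node 4 eager_init.py`
import os
os.environ['NCCL_DEBUG'] = 'TRACE'
import torch
import torch.distributed as dist

rank = int(os.environ['RANK'])  # set by torchrun
device = torch.device(f'cuda:{rank}')
torch.cuda.set_device(device)

# with device_id given, the NCCL communicator for the default group is
# created eagerly here rather than at the first collective
dist.init_process_group(backend='nccl', device_id=device)
print("init done, default communicator already created")

data = torch.zeros((5, 5), device=device)
# no communicator creation should happen here; it already exists
dist.all_reduce(data, op=dist.ReduceOp.SUM)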