@Bowser1704
Last active February 20, 2025 10:21
nccl_test
import os
import time

import torch
import torch.distributed as dist


def benchmark_all_reduce():
    # Initialize distributed environment from the launcher-provided variables
    rank = int(os.environ["RANK"])
    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])

    # Initialize NCCL backend (automatically enables RDMA when available)
    dist.init_process_group(backend="nccl")
    torch.cuda.set_device(local_rank)

    # Benchmark parameters
    data_sizes = [2**i for i in range(20, 31)]  # 2^20 to 2^30 float32 elements (4 MiB to 4 GiB per tensor)
    num_iters = 100     # Number of iterations per data size
    warmup_iters = 10   # Warmup iterations to exclude initialization overhead

    if rank == 0:
        print(f"{'Size (MB)':<12} {'Bandwidth (GB/s)':<15}")

    for size in data_sizes:
        # Create CUDA tensor (float32: 4 bytes per element)
        data = torch.rand(size, dtype=torch.float32, device="cuda")
        element_size = data.element_size()  # 4 bytes for float32
        total_bytes = size * element_size   # Total data size in bytes

        # Warmup phase (excluded from timing)
        for _ in range(warmup_iters):
            dist.all_reduce(data, op=dist.ReduceOp.SUM)
        torch.cuda.synchronize()  # Ensure warmup operations complete before timing

        # Timed measurement
        start_time = time.perf_counter()
        for _ in range(num_iters):
            dist.all_reduce(data, op=dist.ReduceOp.SUM)
            torch.cuda.synchronize()  # Synchronize after each iteration
        end_time = time.perf_counter()

        # Bandwidth calculation:
        # effective transferred data = 2 * (N - 1) / N * size_per_gpu * num_iters
        # (a ring all-reduce moves 2 * (N - 1) / N of the buffer per operation)
        total_data_transferred = 2 * (world_size - 1) / world_size * total_bytes * num_iters
        elapsed_time = end_time - start_time
        bandwidth = total_data_transferred / elapsed_time / (1024**3)  # Convert to GB/s

        if rank == 0:
            size_mb = total_bytes / (1024**2)  # Convert bytes to MB
            print(f"{size_mb:<12.2f} {bandwidth:<15.2f}")

    dist.destroy_process_group()


if __name__ == "__main__":
    benchmark_all_reduce()
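
Usage note: the script expects to be launched with torchrun (for example, torchrun --nproc_per_node=8 nccl_test.py on a single 8-GPU node, or with --nnodes and --rdzv_endpoint for multi-node runs); torchrun sets the RANK, LOCAL_RANK, and WORLD_SIZE environment variables the script reads. The GPU count and filename above are placeholders. Setting NCCL_DEBUG=INFO is a quick way to confirm whether NCCL actually selected an RDMA/InfiniBand transport.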
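
As a sanity check for the bus-bandwidth formula in the script, here is a minimal worked example with made-up numbers (all values below are illustrative assumptions, not measurements):

world_size = 8             # 8 GPUs participating in the all-reduce (assumed)
total_bytes = 2**28 * 4    # 2^28 float32 elements = 1 GiB per GPU
num_iters = 100
elapsed_time = 0.5         # made-up elapsed time in seconds for the 100 iterations

# A ring all-reduce moves 2 * (N - 1) / N of the buffer per operation
total_data_transferred = 2 * (world_size - 1) / world_size * total_bytes * num_iters
print(total_data_transferred / elapsed_time / (1024**3))  # ~350 GB/s bus bandwidth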