nccl_test
import os
import time

import torch
import torch.distributed as dist


def benchmark_all_reduce():
    # Initialize the distributed environment from the env vars set by the launcher
    rank = int(os.environ["RANK"])
    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])

    # Initialize the NCCL backend (automatically uses RDMA/NVLink when available)
    dist.init_process_group(backend="nccl")
    torch.cuda.set_device(local_rank)

    # Benchmark parameters
    data_sizes = [2**i for i in range(20, 31)]  # 2^20 to 2^30 float32 elements (4 MiB to 4 GiB buffers)
    num_iters = 100     # Timed iterations per data size
    warmup_iters = 10   # Warmup iterations to exclude initialization overhead

    if rank == 0:
        print(f"{'Size (MB)':<12} {'Bandwidth (GB/s)':<15}")

    for size in data_sizes:
        # Create a CUDA tensor (float32: 4 bytes per element)
        data = torch.rand(size, dtype=torch.float32, device="cuda")
        element_size = data.element_size()  # 4 bytes for float32
        total_bytes = size * element_size   # Buffer size in bytes

        # Warmup phase (not timed)
        for _ in range(warmup_iters):
            dist.all_reduce(data, op=dist.ReduceOp.SUM)
        torch.cuda.synchronize()  # Ensure warmup ops have completed

        # Timed measurement
        start_time = time.perf_counter()
        for _ in range(num_iters):
            dist.all_reduce(data, op=dist.ReduceOp.SUM)
        torch.cuda.synchronize()  # Wait for all queued all-reduce ops to finish
        end_time = time.perf_counter()

        # Bandwidth calculation:
        #   effective bytes moved = 2*(N-1)/N * buffer_size * num_iters
        # (a ring all-reduce sends and receives 2*(N-1)/N times the buffer per operation)
        total_data_transferred = 2 * (world_size - 1) / world_size * total_bytes * num_iters
        elapsed_time = end_time - start_time
        bandwidth = total_data_transferred / elapsed_time / (1024**3)  # Convert to GiB/s

        if rank == 0:
            size_mb = total_bytes / (1024**2)  # Convert bytes to MiB
            print(f"{size_mb:<12.2f} {bandwidth:<15.2f}")

    dist.destroy_process_group()


if __name__ == "__main__":
    benchmark_all_reduce()
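
Usage note (a sketch, not from the original gist): the script reads RANK, LOCAL_RANK, and WORLD_SIZE from the environment, which torchrun sets automatically. Assuming the file is saved as nccl_test.py (a hypothetical filename) on a single node with 8 GPUs, a launch would look like:

torchrun --nproc_per_node=8 nccl_test.py

The printed figure follows NCCL's "bus bandwidth" convention: since a ring all-reduce moves 2*(N-1)/N times the buffer size per rank, the result should be directly comparable to the busbw column reported by nccl-tests.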