nccl_test
import os
import time

import torch
import torch.distributed as dist


def benchmark_all_reduce():
    # Initialize the distributed environment from the env vars set by the launcher
    rank = int(os.environ["RANK"])
    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])

    # Initialize the NCCL backend (automatically uses RDMA/NVLink when available)
    dist.init_process_group(backend="nccl")
    torch.cuda.set_device(local_rank)

    # Benchmark parameters
    data_sizes = [2**i for i in range(20, 31)]  # 2^20 to 2^30 float32 elements (4 MiB to 4 GiB buffers)
    num_iters = 100     # Timed iterations per data size
    warmup_iters = 10   # Warmup iterations to exclude initialization overhead

    if rank == 0:
        print(f"{'Size (MB)':<12} {'Bandwidth (GB/s)':<15}")

    for size in data_sizes:
        # Create a CUDA tensor (float32: 4 bytes per element)
        data = torch.rand(size, dtype=torch.float32, device="cuda")
        element_size = data.element_size()  # 4 bytes for float32
        total_bytes = size * element_size   # Buffer size in bytes

        # Warmup phase (not timed)
        for _ in range(warmup_iters):
            dist.all_reduce(data, op=dist.ReduceOp.SUM)
        torch.cuda.synchronize()  # Ensure warmup ops have completed

        # Timed measurement
        start_time = time.perf_counter()
        for _ in range(num_iters):
            dist.all_reduce(data, op=dist.ReduceOp.SUM)
        torch.cuda.synchronize()  # Wait for all queued all-reduce ops to finish
        end_time = time.perf_counter()

        # Bandwidth calculation:
        #   effective bytes moved = 2*(N-1)/N * buffer_size * num_iters
        # (a ring all-reduce sends and receives 2*(N-1)/N times the buffer per operation)
        total_data_transferred = 2 * (world_size - 1) / world_size * total_bytes * num_iters
        elapsed_time = end_time - start_time
        bandwidth = total_data_transferred / elapsed_time / (1024**3)  # Convert to GiB/s

        if rank == 0:
            size_mb = total_bytes / (1024**2)  # Convert bytes to MiB
            print(f"{size_mb:<12.2f} {bandwidth:<15.2f}")

    dist.destroy_process_group()


if __name__ == "__main__":
    benchmark_all_reduce()
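
Usage note (a sketch, not from the original gist): the script reads RANK, LOCAL_RANK, and WORLD_SIZE from the environment, which torchrun sets automatically. Assuming the file is saved as nccl_test.py (a hypothetical filename) on a single node with 8 GPUs, a launch would look like:

torchrun --nproc_per_node=8 nccl_test.py

The printed figure follows NCCL's "bus bandwidth" convention: since a ring all-reduce moves 2*(N-1)/N times the buffer size per rank, the result should be directly comparable to the busbw column reported by nccl-tests.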