Last active
October 30, 2025 14:41
-
-
Save maxidl/844160a41d7a4d6d8bc99af24eb1d208 to your computer and use it in GitHub Desktop.
SLURM PyTorch NCCL Multi-Node Test Script: A SLURM batch script that tests PyTorch's NCCL functionality across multiple GPU nodes. The script sets up a distributed PyTorch environment using torchrun and runs a comprehensive test that verifies NCCL initialization, inter-process communication barriers, bandwidth, and proper cleanup. Includes diagnostic output for troubleshooting multi-node GPU communication issues in HPC environments.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#SBATCH --job-name=pytorch-nccl-test
#SBATCH --partition=
#SBATCH --account=
#SBATCH --qos=
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:H100:4
#SBATCH --time 0:05:00
#SBATCH --output=%x-%j.out
#SBATCH --exclude=

# SLURM PyTorch NCCL Multi-Node Test Script:
# An example SLURM batch script that tests PyTorch's NCCL functionality across multiple GPU nodes and
# performs a bandwidth test.
# The script sets up a distributed PyTorch environment using torchrun and runs a comprehensive
# test that verifies NCCL initialization, inter-process communication barriers, bandwidth, and proper cleanup.
# Includes diagnostic output for troubleshooting multi-node GPU communication issues in HPC environments.

# Abort on the first failing command.
set -e

GPUS_PER_NODE=4
# First hostname in the allocation hosts the torchrun rendezvous endpoint.
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
# Derive a quasi-unique port (20000-29999) from the job id so concurrent jobs
# on the same node don't collide.
MASTER_PORT=$((20000 + $SLURM_JOB_ID % 10000))

# Print job information (deduplicated: this block previously appeared twice;
# the union of both variable sets is kept).
echo "=== SLURM Job Information ==="
echo "SLURM_JOB_NAME: ${SLURM_JOB_NAME}"
echo "SLURM_JOB_ID: ${SLURM_JOB_ID}"
echo "SLURM_ARRAY_JOB_ID: ${SLURM_ARRAY_JOB_ID}"
echo "SLURM_ARRAY_TASK_ID: ${SLURM_ARRAY_TASK_ID}"
echo "SLURM_JOB_PARTITION: ${SLURM_JOB_PARTITION}"
echo "SLURM_JOB_ACCOUNT: ${SLURM_JOB_ACCOUNT}"
echo "SLURM_JOB_NUM_NODES: ${SLURM_JOB_NUM_NODES}"
echo "SLURM_JOB_NODELIST: ${SLURM_JOB_NODELIST}"
echo "SLURM_NODEID: ${SLURM_NODEID}"
echo "============================="

# torchrun launcher; the c10d rendezvous backend coordinates all nodes through
# MASTER_ADDR:MASTER_PORT.
LAUNCHER=(
    torchrun
    --nproc_per_node "$GPUS_PER_NODE"
    --nnodes "$SLURM_JOB_NUM_NODES"
    --rdzv_backend c10d
    --rdzv_id "$SLURM_JOB_ID"
    --rdzv_endpoint "${MASTER_ADDR}:${MASTER_PORT}"
)
# Generate the Python NCCL test driver that torchrun will execute on every rank.
# The heredoc delimiter is quoted ('EOT') so the Python source is written
# verbatim with no shell parameter expansion; the body contains no shell
# substitutions, so the generated file is unchanged, but this guards against
# accidental expansion if a '$' is ever added to the Python code.
export SCRIPT=pytorch-nccl-test.py
cat << 'EOT' > $SCRIPT
import torch.distributed as dist
import torch
import socket
import os
import fcntl
import time

# Flipped to 1 once the barrier succeeds; reported in the finally block.
SUCCESS = 0

def printflock(*msgs):
    """Print under an exclusive flock on this file so output lines from
    concurrent ranks on the same node do not interleave."""
    with open(__file__, "r") as fh:
        fcntl.flock(fh, fcntl.LOCK_EX)
        try:
            print(*msgs, flush=True)
        finally:
            fcntl.flock(fh, fcntl.LOCK_UN)

local_rank = int(os.environ["LOCAL_RANK"])
header = f"{socket.gethostname()}-{local_rank}"

# Only local rank 0 reports per-node library/toolkit versions.
if local_rank == 0:
    printflock(f"{header}: torch.__version__: {torch.__version__}")
    printflock(f"{header}: torch.version.cuda: {torch.version.cuda}")
    printflock(f"{header}: torch.cuda.is_available(): {torch.cuda.is_available()}")
    printflock(f"{header}: torch.cuda.nccl.version(): {torch.cuda.nccl.version()}")

printflock(f'{header}: running dist.init_process_group("nccl", device_id=torch.device(f"cuda:{local_rank}")) ...')
torch.cuda.set_device(local_rank)
dist.init_process_group("nccl", device_id=torch.device(f"cuda:{local_rank}"))
printflock(f'{header}: dist.init_process_group("nccl", device_id=torch.device(f"cuda:{local_rank}")) SUCCESS')

try:
    printflock(f"{header}: Trying dist.barrier()")
    dist.barrier()
    printflock(f"{header}: NCCL {torch.cuda.nccl.version()} OK")
    SUCCESS = 1
    # NCCL bandwidth test: configurable sizes and iterations. Failures here are
    # reported but do not fail the overall test (barrier already passed).
    try:
        world_size = dist.get_world_size()
        # Config via env: NCCL_BW_SIZES_MIB (comma list), or NCCL_BW_MIB (single); NCCL_BW_ITERS; NCCL_BW_DTYPE
        sizes_env = os.environ.get("NCCL_BW_SIZES_MIB")
        if sizes_env:
            sizes_mib = [int(s.strip()) for s in sizes_env.split(",") if s.strip()]
        else:
            single_env = os.environ.get("NCCL_BW_MIB")
            if single_env:
                sizes_mib = [int(single_env)]
            else:
                sizes_mib = [64, 128, 256, 512, 1024, 2048]
        iters = int(os.environ.get("NCCL_BW_ITERS", "20"))
        dtype_name = os.environ.get("NCCL_BW_DTYPE", "bf16").lower()
        dtype = torch.bfloat16 if dtype_name == "bf16" else (torch.float16 if dtype_name == "fp16" else torch.float32)

        def bench_allreduce(bytes_per_tensor: int) -> tuple[float, float]:
            """Benchmark all_reduce on a tensor of ~bytes_per_tensor bytes.

            Returns (avg_seconds_per_iter, effective_bandwidth_GB_per_s).
            (Fixed: the annotation previously said -> float but a 2-tuple
            is returned.)
            """
            elem_size = torch.tensor([], dtype=dtype).element_size()
            num_elements = bytes_per_tensor // elem_size
            tensor = torch.ones(num_elements, dtype=dtype, device=f"cuda:{local_rank}")
            # warmup
            for _ in range(3):
                dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
            torch.cuda.synchronize()
            start = time.perf_counter()
            for _ in range(iters):
                dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
            torch.cuda.synchronize()
            elapsed_s = (time.perf_counter() - start) / iters
            # Ring all-reduce moves 2*(N-1)/N bytes per rank per payload byte.
            effective_bytes = 2 * (world_size - 1) / world_size * bytes_per_tensor
            bandwidth_gbps = effective_bytes / elapsed_s / 1e9
            return elapsed_s, bandwidth_gbps

        for size_mib in sizes_mib:
            bytes_per_tensor = size_mib * 1024 * 1024
            elapsed_s, bandwidth_gbps = bench_allreduce(bytes_per_tensor)
            printflock(f"{header}: AllReduce {size_mib} MiB avg {elapsed_s*1e3:.3f} ms, eff BW {bandwidth_gbps:.2f} GB/s (iters={iters}, dtype={dtype_name})")
            # Gather every rank's bandwidth so rank 0 can report the spread.
            bw_tensor = torch.tensor([bandwidth_gbps], device=f"cuda:{local_rank}", dtype=torch.float32)
            gathered = [torch.zeros_like(bw_tensor) for _ in range(world_size)]
            dist.all_gather(gathered, bw_tensor)
            bws = torch.stack(gathered).flatten().cpu().tolist()
            if dist.get_rank() == 0:
                mn = min(bws); mx = max(bws); mean = sum(bws) / len(bws)
                printflock(f"AGG NCCL BW GB/s [{size_mib} MiB] -> min: {mn:.2f}, mean: {mean:.2f}, max: {mx:.2f}")
    except Exception as e:
        printflock(f"{header}: Bandwidth test failed: {e}")
except Exception as e:
    printflock(f"{header}: NCCL {torch.cuda.nccl.version()} ERROR: {e}")
    raise
finally:
    # Properly destroy the process group to avoid resource leaks
    if dist.is_initialized():
        printflock(f"{header}: Destroying process group...")
        dist.destroy_process_group()
        printflock(f"{header}: Process group destroyed successfully")
    time.sleep(1)
    printflock(f"{header}: NCCL TEST SUCCESS: {bool(SUCCESS)}")
EOT
export NCCL_DEBUG=WARN
# Optional NCCL/torch troubleshooting knobs, kept for quick toggling:
# export NCCL_DEBUG_SUBSYS=INIT,GRAPH,ENV
# export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3
# export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
# export GLOO_SOCKET_IFNAME=ib0
# export NCCL_SOCKET_IFNAME=ib0
# export NCCL_NET_GDR_LEVEL=0
# export NCCL_NET_GDR_READ=0
# export NCCL_P2P_DISABLE=1
# export TORCH_NCCL_BLOCKING_WAIT=0
# export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=600
# export TORCH_COMPILE_DISABLE=1
# export ACCELERATE_USE_CUDA_GRAPHS=0
# export TORCH_CUDAGRAPHS=0
# export TORCHDYNAMO_DISABLE=1
# export NCCL_IB_DISABLE=1
# export NCCL_NET=IB
# export OMP_NUM_THREADS=2
# export NCCL_SHM_DISABLE=1
# # export CUDA_DEVICE_MAX_CONNECTIONS=1
# export TORCH_DISTRIBUTED_DEBUG="DETAIL"
# export CUDA_LAUNCH_BLOCKING=1

# Per-node hardware/driver diagnostics, one srun per probe.
echo "============================="
echo "Software versions:"
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): nvidia-smi: $(nvidia-smi)"'
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): nvidia driver version: $(nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits)"'
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): nvcc version: $(nvcc --version)"'
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): ibstat: $(ibstat)"'
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): ibdev2netdev: $(ibdev2netdev)"'
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): ofed_info -s: $(ofed_info -s)"'
srun --jobid $SLURM_JOBID bash -c "mods=\$(lsmod | grep -E 'nvidia_peermem|nv_peer_mem' || true); echo \"\$(hostname): lsmod grep -E nvidia_peermem|nv_peer_mem: \$mods\""
echo "============================="
echo "NCCL env vars:"
echo "NCCL_DEBUG: $NCCL_DEBUG"
echo "NCCL_SOCKET_IFNAME: $NCCL_SOCKET_IFNAME"
echo "NCCL_NET_GDR_LEVEL: $NCCL_NET_GDR_LEVEL"
echo "NCCL_P2P_DISABLE: $NCCL_P2P_DISABLE"
echo "NCCL_IB_DISABLE: $NCCL_IB_DISABLE"
echo "============================="
echo "Running NCCL test:"
# NOTE(review): $SLURM_NODEID expands in the batch shell (node 0), not per
# task; with the c10d rendezvous backend torchrun assigns node ranks at
# rendezvous time, so the flag value is informational only — confirm.
# (The echo previously used $SLURM_PROCID while the run used $SLURM_NODEID;
# unified to $SLURM_NODEID so the printed command matches the executed one.)
echo "NCCL launcher:" "${LAUNCHER[@]}" --node_rank "$SLURM_NODEID" "$SCRIPT"
SRUN_NCCL=(
    srun -u --jobid "$SLURM_JOBID" --time 10:00 --kill-on-bad-exit=1 --wait=60
)
# Under `set -e`, a plain `cmd; rc=$?` check is dead code: a failing srun
# aborts the whole script before rc is ever examined. Capturing the status in
# the right-hand side of `||` suppresses errexit for this command only, so the
# failure message actually prints.
nccl_rc=0
"${SRUN_NCCL[@]}" "${LAUNCHER[@]}" --node_rank "$SLURM_NODEID" "$SCRIPT" || nccl_rc=$?
if [ "$nccl_rc" -ne 0 ]; then
    echo "NCCL test failed with rc=$nccl_rc"
    exit "$nccl_rc"
fi
# Expected bandwidth example: With a single 200G IB NIC per node and 4 GPUs per node, the per-rank all-reduce plateau is roughly 2 × NIC_GBps / local_ranks ≈ 2 × 25 / 4 ≈ 12.5 GB/s.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment