Skip to content

Instantly share code, notes, and snippets.

@maxidl
Last active October 30, 2025 14:41
Show Gist options
  • Select an option

  • Save maxidl/844160a41d7a4d6d8bc99af24eb1d208 to your computer and use it in GitHub Desktop.

Select an option

Save maxidl/844160a41d7a4d6d8bc99af24eb1d208 to your computer and use it in GitHub Desktop.
SLURM PyTorch NCCL Multi-Node Test Script: A SLURM batch script that tests PyTorch's NCCL functionality across multiple GPU nodes. The script sets up a distributed PyTorch environment using torchrun and runs a comprehensive test that verifies NCCL initialization, inter-process communication barriers, and proper cleanup. Includes diagnostic output for troubleshooting multi-node GPU communication issues in HPC environments.
#!/bin/bash
#SBATCH --job-name=pytorch-nccl-test
#SBATCH --partition=
#SBATCH --account=
#SBATCH --qos=
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:H100:4
#SBATCH --time=0:05:00
#SBATCH --output=%x-%j.out
#SBATCH --exclude=
# NOTE: --partition/--account/--qos (and optionally --exclude) are intentionally
# left blank; fill them in for your cluster before submitting.
#
# SLURM PyTorch NCCL Multi-Node Test Script:
# An example SLURM batch script that tests PyTorch's NCCL functionality across multiple GPU nodes and
# performs a bandwidth test.
# The script sets up a distributed PyTorch environment using torchrun and runs a comprehensive
# test that verifies NCCL initialization, inter-process communication barriers, bandwidth, and proper cleanup.
# Includes diagnostic output for troubleshooting multi-node GPU communication issues in HPC environments.
# Print job information once (the original printed two overlapping banners;
# this is the union of both).
echo "=== SLURM Job Information ==="
echo "SLURM_JOB_NAME: ${SLURM_JOB_NAME}"
echo "SLURM_JOB_ID: ${SLURM_JOB_ID}"
echo "SLURM_ARRAY_JOB_ID: ${SLURM_ARRAY_JOB_ID}"
echo "SLURM_ARRAY_TASK_ID: ${SLURM_ARRAY_TASK_ID}"
echo "SLURM_JOB_PARTITION: ${SLURM_JOB_PARTITION}"
echo "SLURM_JOB_ACCOUNT: ${SLURM_JOB_ACCOUNT}"
echo "SLURM_JOB_NUM_NODES: ${SLURM_JOB_NUM_NODES}"
echo "SLURM_JOB_NODELIST: ${SLURM_JOB_NODELIST}"
echo "SLURM_NODEID: ${SLURM_NODEID}"
echo "============================="

# Abort on the first failing command from here on.
set -e

GPUS_PER_NODE=4
# Rendezvous endpoint: first hostname in the allocation; the port is derived
# from the job id so concurrent jobs sharing a node pick different ports
# (range 20000-29999).
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
MASTER_PORT=$((20000 + $SLURM_JOB_ID % 10000))
# torchrun launcher, shared by the echo diagnostic and the actual srun launch.
# Uses the c10d rendezvous backend at MASTER_ADDR:MASTER_PORT with the job id
# as the rendezvous id.
LAUNCHER=(
    torchrun
    --nproc_per_node "$GPUS_PER_NODE"
    --nnodes "$SLURM_JOB_NUM_NODES"
    --rdzv_backend c10d
    --rdzv_id "$SLURM_JOB_ID"
    --rdzv_endpoint "${MASTER_ADDR}:${MASTER_PORT}"
)

# Path of the generated Python test script; exported so child shells see it.
export SCRIPT=pytorch-nccl-test.py
# Generate the Python NCCL test script. The heredoc delimiter is quoted ('EOT')
# so the Python text is written verbatim with no shell expansion inside (the
# script contains no shell variables, so output is unchanged — this just
# hardens against accidental $-expansion).
# NOTE(review): the source paste had its Python indentation stripped; the block
# structure below is a reconstruction — in particular, the version prints are
# assumed to sit under the local_rank == 0 guard. Confirm against the original.
cat << 'EOT' > "$SCRIPT"
import fcntl
import os
import socket
import time

import torch
import torch.distributed as dist

SUCCESS = 0


def printflock(*msgs):
    """Print atomically across local ranks by serializing on a file lock."""
    with open(__file__, "r") as fh:
        fcntl.flock(fh, fcntl.LOCK_EX)
        try:
            print(*msgs, flush=True)
        finally:
            fcntl.flock(fh, fcntl.LOCK_UN)


local_rank = int(os.environ["LOCAL_RANK"])
header = f"{socket.gethostname()}-{local_rank}"

# Environment diagnostics, printed once per node (local rank 0 only).
if local_rank == 0:
    printflock(f"{header}: torch.__version__: {torch.__version__}")
    printflock(f"{header}: torch.version.cuda: {torch.version.cuda}")
    printflock(f"{header}: torch.cuda.is_available(): {torch.cuda.is_available()}")
    printflock(f"{header}: torch.cuda.nccl.version(): {torch.cuda.nccl.version()}")

printflock(f'{header}: running dist.init_process_group("nccl", device_id=torch.device(f"cuda:{local_rank}")) ...')
torch.cuda.set_device(local_rank)
dist.init_process_group("nccl", device_id=torch.device(f"cuda:{local_rank}"))
printflock(f'{header}: dist.init_process_group("nccl", device_id=torch.device(f"cuda:{local_rank}")) SUCCESS')

try:
    printflock(f"{header}: Trying dist.barrier()")
    dist.barrier()
    printflock(f"{header}: NCCL {torch.cuda.nccl.version()} OK")
    SUCCESS = 1

    # NCCL bandwidth test: configurable sizes and iterations.
    try:
        world_size = dist.get_world_size()
        # Config via env: NCCL_BW_SIZES_MIB (comma list), or NCCL_BW_MIB (single);
        # NCCL_BW_ITERS; NCCL_BW_DTYPE (bf16 | fp16 | anything else -> fp32).
        sizes_env = os.environ.get("NCCL_BW_SIZES_MIB")
        if sizes_env:
            sizes_mib = [int(s.strip()) for s in sizes_env.split(",") if s.strip()]
        else:
            single_env = os.environ.get("NCCL_BW_MIB")
            if single_env:
                sizes_mib = [int(single_env)]
            else:
                sizes_mib = [64, 128, 256, 512, 1024, 2048]
        iters = int(os.environ.get("NCCL_BW_ITERS", "20"))
        dtype_name = os.environ.get("NCCL_BW_DTYPE", "bf16").lower()
        dtype = torch.bfloat16 if dtype_name == "bf16" else (torch.float16 if dtype_name == "fp16" else torch.float32)

        def bench_allreduce(bytes_per_tensor: int) -> tuple[float, float]:
            """Return (avg seconds per all-reduce, effective bus bandwidth GB/s)."""
            elem_size = torch.tensor([], dtype=dtype).element_size()
            num_elements = bytes_per_tensor // elem_size
            tensor = torch.ones(num_elements, dtype=dtype, device=f"cuda:{local_rank}")
            # Warmup iterations exclude one-time NCCL setup costs from timing.
            for _ in range(3):
                dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
            torch.cuda.synchronize()
            start = time.perf_counter()
            for _ in range(iters):
                dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
            torch.cuda.synchronize()
            elapsed_s = (time.perf_counter() - start) / iters
            # Ring all-reduce moves 2*(N-1)/N of the payload per rank.
            effective_bytes = 2 * (world_size - 1) / world_size * bytes_per_tensor
            bandwidth_gbps = effective_bytes / elapsed_s / 1e9
            return elapsed_s, bandwidth_gbps

        for size_mib in sizes_mib:
            bytes_per_tensor = size_mib * 1024 * 1024
            elapsed_s, bandwidth_gbps = bench_allreduce(bytes_per_tensor)
            printflock(f"{header}: AllReduce {size_mib} MiB avg {elapsed_s*1e3:.3f} ms, eff BW {bandwidth_gbps:.2f} GB/s (iters={iters}, dtype={dtype_name})")
            # Gather per-rank bandwidths so rank 0 can print a cluster summary.
            bw_tensor = torch.tensor([bandwidth_gbps], device=f"cuda:{local_rank}", dtype=torch.float32)
            gathered = [torch.zeros_like(bw_tensor) for _ in range(world_size)]
            dist.all_gather(gathered, bw_tensor)
            bws = torch.stack(gathered).flatten().cpu().tolist()
            if dist.get_rank() == 0:
                mn = min(bws)
                mx = max(bws)
                mean = sum(bws) / len(bws)
                printflock(f"AGG NCCL BW GB/s [{size_mib} MiB] -> min: {mn:.2f}, mean: {mean:.2f}, max: {mx:.2f}")
    except Exception as e:
        printflock(f"{header}: Bandwidth test failed: {e}")
except Exception as e:
    printflock(f"{header}: NCCL {torch.cuda.nccl.version()} ERROR: {e}")
    raise
finally:
    # Properly destroy the process group to avoid resource leaks
    if dist.is_initialized():
        printflock(f"{header}: Destroying process group...")
        dist.destroy_process_group()
        printflock(f"{header}: Process group destroyed successfully")
    time.sleep(1)
    printflock(f"{header}: NCCL TEST SUCCESS: {bool(SUCCESS)}")
EOT
# NCCL / torch debug configuration. Only NCCL_DEBUG is active; the commented-out
# exports below are a menu of knobs commonly toggled when troubleshooting
# multi-node NCCL issues (interface selection, GDR, IB/SHM/P2P disables,
# watchdog timeouts, compile/graph disables). Uncomment as needed.
export NCCL_DEBUG=WARN
# export NCCL_DEBUG_SUBSYS=INIT,GRAPH,ENV
# export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3
# export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
# export GLOO_SOCKET_IFNAME=ib0
# export NCCL_SOCKET_IFNAME=ib0
# export NCCL_NET_GDR_LEVEL=0
# export NCCL_NET_GDR_READ=0
# export NCCL_P2P_DISABLE=1
# export TORCH_NCCL_BLOCKING_WAIT=0
# export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=600
# export TORCH_COMPILE_DISABLE=1
# export ACCELERATE_USE_CUDA_GRAPHS=0
# export TORCH_CUDAGRAPHS=0
# export TORCHDYNAMO_DISABLE=1
# export NCCL_IB_DISABLE=1
# export NCCL_NET=IB
# export OMP_NUM_THREADS=2
# export NCCL_SHM_DISABLE=1
# # export CUDA_DEVICE_MAX_CONNECTIONS=1
# export TORCH_DISTRIBUTED_DEBUG="DETAIL"
# export CUDA_LAUNCH_BLOCKING=1
echo "============================="
echo "Software versions:"
# Per-node diagnostics: GPU/driver/CUDA toolchain and InfiniBand stack.
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): nvidia-smi: $(nvidia-smi)"'
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): nvidia driver version: $(nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits)"'
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): nvcc version: $(nvcc --version)"'
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): ibstat: $(ibstat)"'
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): ibdev2netdev: $(ibdev2netdev)"'
srun --jobid $SLURM_JOBID bash -c 'echo "$(hostname): ofed_info -s: $(ofed_info -s)"'
srun --jobid $SLURM_JOBID bash -c "mods=\$(lsmod | grep -E 'nvidia_peermem|nv_peer_mem' || true); echo \"\$(hostname): lsmod grep -E nvidia_peermem|nv_peer_mem: \$mods\""
echo "============================="
echo "NCCL env vars:"
echo "NCCL_DEBUG: $NCCL_DEBUG"
echo "NCCL_SOCKET_IFNAME: $NCCL_SOCKET_IFNAME"
echo "NCCL_NET_GDR_LEVEL: $NCCL_NET_GDR_LEVEL"
echo "NCCL_P2P_DISABLE: $NCCL_P2P_DISABLE"
echo "NCCL_IB_DISABLE: $NCCL_IB_DISABLE"
echo "============================="
echo "Running NCCL test:"
# Echo the exact launcher invocation. Use SLURM_NODEID here so the diagnostic
# matches the real launch below (the original echoed SLURM_PROCID instead).
echo "NCCL launcher:" "${LAUNCHER[@]}" --node_rank "$SLURM_NODEID" "$SCRIPT"
# srun wrapper for the test step: unbuffered output, per-step time limit,
# kill the whole step if any task exits non-zero.
SRUN_NCCL=(
    srun -u --jobid "$SLURM_JOBID" --time 10:00 --kill-on-bad-exit=1 --wait=60
)
# set -e (enabled earlier) would abort the script before `$?` could be read,
# making the rc check dead code; capture the rc via `|| nccl_rc=$?` instead,
# which suppresses set -e for this command.
nccl_rc=0
"${SRUN_NCCL[@]}" "${LAUNCHER[@]}" --node_rank "$SLURM_NODEID" "$SCRIPT" || nccl_rc=$?
if [ "$nccl_rc" -ne 0 ]; then
    echo "NCCL test failed with rc=$nccl_rc"
    exit "$nccl_rc"
fi
# Expected bandwidth example: With a single 200G IB NIC per node and 4 GPUs per node, the per-rank all-reduce plateau is roughly 2 × NIC_GBps / local_ranks ≈ 2 × 25 / 4 ≈ 12.5 GB/s.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment