Skip to content

Instantly share code, notes, and snippets.

@Quentin-Anthony
Last active October 16, 2022 17:14
Show Gist options
  • Save Quentin-Anthony/766ffe6b75ffe62f1d456f9a9465973e to your computer and use it in GitHub Desktop.
Save Quentin-Anthony/766ffe6b75ffe62f1d456f9a9465973e to your computer and use it in GitHub Desktop.
EFA BW test for Stability cluster (adapted from Azure script)
#!/bin/bash
# Slurm batch header. sbatch reads the "#SBATCH" lines below as job options;
# to bash they are ordinary comments. Keep them ahead of the first executable
# command, because sbatch stops parsing directives at that point, and do not
# append trailing comments to the directive lines themselves.
#   --partition / --job-name    : submit to partition "gpu" as job "gputest"
#   --nodes / --ntasks-per-node : one node, eight tasks on that node
#   --cpus-per-gpu / --gres     : six CPUs per GPU, eight GPUs requested as GRES
#   --nodelist                  : pin the job to host gpu-st-p4d-24xlarge-42
#   --output / --open-mode      : append job output to <job-name>_<job-id>.out
#   --exclusive                 : do not share the node with other jobs
#   --comment                   : free-form job comment ("neox")
#SBATCH --partition=gpu
#SBATCH --job-name=gputest
#SBATCH --nodes 1
#SBATCH --ntasks-per-node 8
#SBATCH --cpus-per-gpu=6
#SBATCH --gres=gpu:8
#SBATCH --nodelist gpu-st-p4d-24xlarge-42
#SBATCH --output=%x_%j.out
#SBATCH --open-mode=append
#SBATCH --exclusive
#SBATCH --comment neox
# ---------------------------------------------------------------------------
# Runtime environment for the EFA / NCCL all-reduce bandwidth test.
# ---------------------------------------------------------------------------

# Library and binary search paths for the Amazon-provided Open MPI and EFA
# installs. NOTE: this replaces (does not extend) any inherited
# LD_LIBRARY_PATH; `module load` runs afterwards and may add to it.
export LD_LIBRARY_PATH=/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib64
module load cuda/11.6
export PATH=/opt/amazon/efa/bin:$PATH

# libfabric (FI_*), NCCL and Open MPI settings exported to the benchmark.
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
export NCCL_DEBUG=info
export OMPI_MCA_mtl_base_verbose=1
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64

# Pass/fail threshold in GB/s: check_nccl_allreduce_ib_loopback fails the job
# when the measured average bus bandwidth is below this value.
export EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW=5

# Diagnostics for the job log: which shared libraries the benchmark binary
# resolves to, and the effective search paths.
all_reduce_perf_bin=$(command -v all_reduce_perf)
if [[ -n "$all_reduce_perf_bin" ]]; then
  ldd "$all_reduce_perf_bin"
else
  # Without this guard ldd would be invoked with no file argument.
  echo "all_reduce_perf not found in PATH" 1>&2
fi
printf '%s\n' "$LD_LIBRARY_PATH"
printf '%s\n' "$PATH"

# Option strings consumed (deliberately unquoted, so they word-split) by
# collect_nccl_allreduce_ib_loopback_data. They must stay plain strings.
#   MPI_ARGS     : 8 ranks, 8 per node, bound to NUMA domains, root allowed.
#   ENVIRON_VARS : environment forwarded to the ranks with mpirun -x; the
#                  NCCL_SHM_DISABLE / NCCL_P2P_DISABLE settings presumably
#                  force traffic onto the network path (confirm).
#   NCCL_ARGS    : all_reduce_perf options: sizes 500M..1G stepping by a
#                  factor of 2, 1 GPU per rank, 50 iterations, 10 warm-ups.
MPI_ARGS="-np 8 --map-by ppr:8:node -bind-to numa --allow-run-as-root"
ENVIRON_VARS="-x LD_LIBRARY_PATH -x NCCL_SHM_DISABLE=1 -x NCCL_P2P_DISABLE=1 -x NCCL_NET_GDR_LEVEL=SYS"
NCCL_ARGS="-b 500M -f 2 -g 1 -e 1G -n 50 -w 10"
#######################################
# Print an error message to stderr and terminate the script.
# Arguments:
#   $1   - optional exit status (1-255). Callers in this file use the form
#          `die 1 "message"`; the status is consumed here so it no longer
#          shows up as a stray leading "1" in the message. It is only
#          treated as a status when more arguments follow it.
#   rest - words of the message, joined with spaces.
# Outputs: the message on stderr.
# Returns: never returns; exits with the given status, or 1 by default.
#######################################
function die() {
  local status=1
  if (( $# > 1 )) && [[ $1 =~ ^[0-9]{1,3}$ ]]; then
    # 10# forces base 10 so values such as "08" are not parsed as octal.
    status=$(( 10#$1 ))
    shift
    # 0 would signal success and values above 255 wrap around; use 1 instead.
    (( status >= 1 && status <= 255 )) || status=1
  fi
  printf '%s\n' "$*" 1>&2
  exit "$status"
}
#######################################
# Write a message to stderr.
# Arguments: words of the message, joined with spaces.
# Outputs:   the message followed by a newline, on stderr.
#######################################
function log() {
  # printf rather than echo: echo would swallow a message such as "-n" or
  # "-e" as an option instead of printing it.
  printf '%s\n' "$*" 1>&2
}
#######################################
# Write a debug message to stderr.
# Arguments: words of the message, joined with spaces.
# Outputs:   the message followed by a newline, on stderr.
#######################################
function dbg() {
  # printf rather than echo: echo would swallow a message such as "-n" or
  # "-e" as an option instead of printing it.
  printf '%s\n' "$*" 1>&2
}
#######################################
# Run the all_reduce_perf benchmark under mpirun and capture its stdout.
# Globals:
#   MPI_ARGS, ENVIRON_VARS, NCCL_ARGS (read) - option strings; expanded
#       unquoted on purpose so each one splits into separate arguments.
#   nccl_allreduce_ib_loopback_out (written)       - captured stdout.
#   nccl_allreduce_ib_loopback_out_rc (written)    - mpirun exit status.
#   nccl_allreduce_ib_loopback_out_lines (written) - array holding the
#       non-empty lines of the captured stdout, in order.
# Returns:
#   0 on success. If mpirun exits non-zero, the captured output is logged
#   and the script is terminated through die.
#######################################
function collect_nccl_allreduce_ib_loopback_data() {
  local line

  # shellcheck disable=SC2086 # word splitting of the *_ARGS strings is intended
  nccl_allreduce_ib_loopback_out=$(mpirun $MPI_ARGS $ENVIRON_VARS all_reduce_perf $NCCL_ARGS)
  nccl_allreduce_ib_loopback_out_rc=$?
  if (( nccl_allreduce_ib_loopback_out_rc != 0 )); then
    # Show what the benchmark printed so the failure can be diagnosed.
    log "$nccl_allreduce_ib_loopback_out"
    die 1 "${FUNCNAME[0]}: nccl_allreduce (IB loopback) returned error code $nccl_allreduce_ib_loopback_out_rc"
  fi

  # Split into lines without touching the global IFS and without exposing
  # the text to pathname expansion. Empty lines are dropped.
  nccl_allreduce_ib_loopback_out_lines=()
  while IFS= read -r line; do
    if [[ -n "$line" ]]; then
      nccl_allreduce_ib_loopback_out_lines+=( "$line" )
    fi
  done <<< "$nccl_allreduce_ib_loopback_out"
}
#######################################
# Run the NCCL all-reduce benchmark and fail the job when the reported
# average bus bandwidth is below the expected threshold.
# Globals:
#   EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW (read)      - threshold in GB/s.
#   nccl_allreduce_ib_loopback_out_lines (read)   - benchmark output lines,
#       filled in by collect_nccl_allreduce_ib_loopback_data.
#   nccl_allreduce_ib_loopback_out (read)         - raw benchmark output.
#   avg_bus_bw (written)                          - measured bandwidth.
# Returns:
#   0 when the measured bandwidth is at or above the threshold. Otherwise
#   the benchmark output is logged and the script terminates through die;
#   the same happens when no bandwidth value can be parsed.
#######################################
function check_nccl_allreduce_ib_loopback() {
  local line
  local -a fields
  local number_re='^[0-9]+([.][0-9]+)?([eE][-+]?[0-9]+)?$'

  collect_nccl_allreduce_ib_loopback_data

  avg_bus_bw=""
  for line in "${nccl_allreduce_ib_loopback_out_lines[@]}"; do
    if [[ "$line" == *bandwidth* ]]; then
      # Only the first line mentioning "bandwidth" is used. Its sixth
      # whitespace-separated field is taken as the value, e.g.
      # "# Avg bus bandwidth : 230.12" -> 230.12.
      IFS=$' \t\n' read -r -a fields <<< "$line"
      avg_bus_bw=${fields[5]}
      dbg "Measured Avg NCCL allreduce ib loopback bus BW $avg_bus_bw GB/s"
      break
    fi
  done
  dbg "Measured Avg NCCL allreduce IB loopback bus BW=$avg_bus_bw, Expected NCCL allreduce IB loopback BW=$EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW"

  # A missing or non-numeric value means the output could not be parsed;
  # report that explicitly rather than as a bandwidth shortfall.
  if [[ ! "$avg_bus_bw" =~ $number_re ]]; then
    log "$nccl_allreduce_ib_loopback_out"
    die 1 "${FUNCNAME[0]}: could not parse the average bus bandwidth from the all_reduce_perf output (got '$avg_bus_bw')"
  fi

  # Compare numerically. `[[ a < b ]]` compares strings, under which
  # "230.12" sorts before "5" and a healthy node would be failed.
  if awk -v measured="$avg_bus_bw" -v expected="$EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW" \
    'BEGIN { exit !((measured + 0) < (expected + 0)) }'; then
    log "$nccl_allreduce_ib_loopback_out"
    die 1 "${FUNCNAME[0]}: NCCL allreduce IB loopback, BUS BW (expected > $EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW GB/s, but measured $avg_bus_bw GB/s)"
  fi
}
# Entry point: run the bandwidth check. On failure the check calls die, which
# terminates the script with a non-zero exit status.
check_nccl_allreduce_ib_loopback
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment