Last active
October 16, 2022 17:14
-
-
Save Quentin-Anthony/766ffe6b75ffe62f1d456f9a9465973e to your computer and use it in GitHub Desktop.
EFA BW test for Stability cluster (adapted from Azure script)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#SBATCH --partition=gpu
#SBATCH --job-name=gputest
#SBATCH --nodes 1
#SBATCH --ntasks-per-node 8
#SBATCH --cpus-per-gpu=6
#SBATCH --gres=gpu:8
#SBATCH --nodelist gpu-st-p4d-24xlarge-42
#SBATCH --output=%x_%j.out
#SBATCH --open-mode=append
#SBATCH --exclusive
#SBATCH --comment neox

# Single-node NCCL all_reduce_perf bandwidth check, submitted as a Slurm job.
# The section below prepares the environment and the argument strings that
# the benchmark functions further down read as globals.

# Library and tool search paths for the Amazon OpenMPI / EFA installs.
export LD_LIBRARY_PATH=/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib64
module load cuda/11.6
export PATH=/opt/amazon/efa/bin:$PATH

# EFA (FI_*), NCCL and Open MPI settings exported to the benchmark processes.
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
export NCCL_DEBUG=info
export OMPI_MCA_mtl_base_verbose=1
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64

# Minimum acceptable average bus bandwidth (GB/s); the check fails below this.
export EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW=5

# Record which shared libraries the benchmark binary resolves to, plus the
# effective search paths, so the job log shows what was actually used.
if nccl_test_bin=$(command -v all_reduce_perf); then
  ldd "$nccl_test_bin"
else
  printf '%s\n' "all_reduce_perf not found in PATH" 1>&2
fi
printf '%s\n' "$LD_LIBRARY_PATH"
printf '%s\n' "$PATH"

# Whitespace-separated argument strings; they are split into words by the
# function that launches mpirun, so they must stay plain strings (not arrays).
MPI_ARGS="-np 8 --map-by ppr:8:node -bind-to numa --allow-run-as-root"
ENVIRON_VARS="-x LD_LIBRARY_PATH -x NCCL_SHM_DISABLE=1 -x NCCL_P2P_DISABLE=1 -x NCCL_NET_GDR_LEVEL=SYS"
NCCL_ARGS="-b 500M -f 2 -g 1 -e 1G -n 50 -w 10"
#######################################
# Print an error message to stderr and terminate the script.
# Arguments:
#   $1 - optional exit status (0-255). It is treated as a status only when it
#        consists solely of digits AND at least one more argument follows, so
#        both `die "message"` and `die 1 "message"` work.
#   remaining arguments - message words, joined with single spaces.
# Outputs:
#   The message on stderr (without the status argument).
# Returns:
#   Never returns; exits with the given status, or 1 when none is given.
#######################################
function die() {
  local status=1
  if (( $# > 1 )) && [[ "$1" =~ ^[0-9]{1,3}$ ]] && (( 10#$1 <= 255 )); then
    # 10# forces base 10 so a value such as "08" is not parsed as octal.
    status=$(( 10#$1 ))
    shift
  fi
  # printf instead of echo: a message such as "-n" must be printed verbatim.
  printf '%s\n' "$*" 1>&2
  exit "$status"
}
#######################################
# Write a log message to stderr.
# Arguments:
#   All arguments are joined with single spaces into one message line.
# Outputs:
#   The message on stderr; nothing on stdout.
#######################################
function log() {
  # printf instead of echo: echo would treat a message such as "-n" or "-e"
  # as an option and print nothing.
  printf '%s\n' "$*" 1>&2
}
#######################################
# Write a debug message to stderr.
# Arguments:
#   All arguments are joined with single spaces into one message line.
# Outputs:
#   The message on stderr; nothing on stdout.
#######################################
function dbg() {
  # printf instead of echo: echo would treat a message such as "-n" or "-e"
  # as an option and print nothing.
  printf '%s\n' "$*" 1>&2
}
#######################################
# Run all_reduce_perf under mpirun and capture what it prints.
# Globals:
#   MPI_ARGS, ENVIRON_VARS, NCCL_ARGS (read) - whitespace-separated argument
#     strings passed to mpirun / all_reduce_perf.
#   nccl_allreduce_ib_loopback_out (written) - raw stdout of the run.
#   nccl_allreduce_ib_loopback_out_rc (written) - exit status of mpirun.
#   nccl_allreduce_ib_loopback_out_lines (written) - the stdout as an array,
#     one element per non-empty line.
# Arguments:
#   None.
# Outputs:
#   On failure, the captured output and an error message on stderr.
# Returns:
#   0 on success. If mpirun fails, calls die, which exits the script.
#######################################
function collect_nccl_allreduce_ib_loopback_data() {
  local -a mpi_args environ_vars nccl_args
  local out_line

  # Split the argument strings into words without pathname expansion and
  # without depending on (or modifying) the caller's IFS.
  IFS=$' \t\n' read -r -a mpi_args <<<"$MPI_ARGS"
  IFS=$' \t\n' read -r -a environ_vars <<<"$ENVIRON_VARS"
  IFS=$' \t\n' read -r -a nccl_args <<<"$NCCL_ARGS"

  nccl_allreduce_ib_loopback_out=$(mpirun "${mpi_args[@]}" "${environ_vars[@]}" \
    all_reduce_perf "${nccl_args[@]}")
  nccl_allreduce_ib_loopback_out_rc=$?
  if (( nccl_allreduce_ib_loopback_out_rc != 0 )); then
    # Show what the benchmark printed before it failed.
    log "$nccl_allreduce_ib_loopback_out"
    die "${FUNCNAME[0]}: nccl_allreduce (IB loopback) returned error code $nccl_allreduce_ib_loopback_out_rc"
  fi

  # One array element per non-empty output line. Reading line by line keeps
  # characters such as '*' literal instead of expanding them to file names.
  nccl_allreduce_ib_loopback_out_lines=()
  while IFS= read -r out_line; do
    if [[ -n "$out_line" ]]; then
      nccl_allreduce_ib_loopback_out_lines+=("$out_line")
    fi
  done <<<"$nccl_allreduce_ib_loopback_out"
  return 0
}
#######################################
# Run the all-reduce benchmark and fail if the average bus bandwidth it
# reports is below the expected minimum.
# Globals:
#   EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW (read) - minimum acceptable bandwidth.
#   nccl_allreduce_ib_loopback_out, nccl_allreduce_ib_loopback_out_lines
#     (read) - filled in by collect_nccl_allreduce_ib_loopback_data.
#   avg_bus_bw (written) - sixth whitespace-separated field of the first
#     output line containing "bandwidth"; empty if no such line exists.
# Arguments:
#   None.
# Outputs:
#   Debug/progress messages on stderr; the full benchmark output on stderr
#   when the check fails.
# Returns:
#   0 if the measured value is >= the expected value. Otherwise calls die,
#   which exits the script.
#######################################
function check_nccl_allreduce_ib_loopback() {
  local out_line
  local -a fields
  local number_re='^[+-]?([0-9]+([.][0-9]*)?|[.][0-9]+)([eE][+-]?[0-9]+)?$'

  collect_nccl_allreduce_ib_loopback_data

  # Reset so a value left over from an earlier call can never be reused.
  avg_bus_bw=""
  for out_line in "${nccl_allreduce_ib_loopback_out_lines[@]}"; do
    if [[ "$out_line" == *bandwidth* ]]; then
      # Split on whitespace without pathname expansion; the value is field 6.
      IFS=$' \t\n' read -r -a fields <<<"$out_line"
      avg_bus_bw=${fields[5]:-}
      dbg "Measured Avg NCCL allreduce ib loopback bus BW $avg_bus_bw GB/s"
      break
    fi
  done
  dbg "Measured Avg NCCL allreduce IB loopback bus BW=$avg_bus_bw, Expected NCCL allreduce IB loopback BW=$EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW"

  if [[ ! "$avg_bus_bw" =~ $number_re ]]; then
    log "$nccl_allreduce_ib_loopback_out"
    die "${FUNCNAME[0]}: NCCL allreduce IB loopback, could not parse an average bus bandwidth from the benchmark output (got '$avg_bus_bw')"
  fi

  # The values are decimals, so compare them numerically with awk.
  # [[ a < b ]] compares strings, which ranks "10.5" below "5".
  if awk -v measured="$avg_bus_bw" -v expected="$EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW" \
    'BEGIN { exit !((measured + 0) < (expected + 0)) }'; then
    log "$nccl_allreduce_ib_loopback_out"
    die "${FUNCNAME[0]}: NCCL allreduce IB loopback, BUS BW (expected > $EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW GB/s, but measured $avg_bus_bw GB/s)"
  fi
  return 0
}
# Entry point: run the benchmark and enforce the bandwidth threshold.
check_nccl_allreduce_ib_loopback
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment