# Create a conda environment named "mpi" containing MPICH, the GCC 11
# toolchain, the CUDA 12.3 toolkit and NCCL, which are needed to build
# nccl-tests.
conda create -n mpi -c conda-forge -c nvidia \
  mpich gcc=11 gxx=11 make automake ipython \
  cuda-toolkit cuda-version=12.3 nccl cuda-cudart-static --yes

# Activate the new environment so that CONDA_PREFIX (used by the build below)
# points at it. Sourcing conda.sh makes `conda activate` usable in a
# non-interactive shell.
# shellcheck disable=SC1091
source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate mpi

# Fetch the NCCL benchmark suite and build it from inside the checkout; the
# Makefile lives in nccl-tests/, and the binaries land in nccl-tests/build/.
# The following commands that reference ./build/... therefore run from here.
git clone https://github.com/NVIDIA/nccl-tests.git
cd nccl-tests || exit 1

# Build with MPI support. The nccl-tests Makefile appends /include and /lib to
# NCCL_HOME itself, so NCCL_HOME must be the install prefix, not its include/
# subdirectory.
CUDA_HOME="$CONDA_PREFIX" NCCL_HOME="$CONDA_PREFIX" MPI_HOME="$CONDA_PREFIX" MPI=1 make
# Run the all-reduce benchmark across the hosts listed in ./hosts, first with
# 2 ranks and then with 48 ranks. Flags passed to all_reduce_perf:
# -b 8 / -e 8G (start and end sizes), -f 2 (size multiplier), -g 1 (GPUs per
# rank).
for rank_count in 2 48; do
  mpirun --hostfile hosts -np "$rank_count" ./build/all_reduce_perf -b 8 -e 8G -f 2 -g 1
done
# Report, once per node, whether the MPI installation was built with CUDA
# support. The per-node work is wrapped in `bash -c '...'` so that the
# hostname is expanded on each node and ompi_info and grep actually execute
# there (a bare `echo $HOSTNAME ompi_info ...` would merely print the command
# text, and the pipe would run only on the launching host).
# NOTE(review): ompi_info and `--map-by ppr:1:node` are Open MPI features,
# while the conda environment created for this workflow installs mpich;
# confirm this is run against an Open MPI mpirun.
mpirun --hostfile hosts --map-by ppr:1:node -n 2 \
  bash -c 'printf "%s\n" "$HOSTNAME"; ompi_info --parsable -l 9 --all | grep mpi_built_with_cuda_support:value'