create an env:
python3.8 -m venv .env
source .env/bin/activate
pip install -U pip
pip3 install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu113
fix the paths in simple.sh and simple_torchrun.sh (the venv and the python scripts are assumed to live under /fsx/rom1504/open_clip below)
then submit with sbatch simple.sh or sbatch simple_torchrun.sh; slurm writes the log to debug_torch_init_<jobid>.out in the directory you submitted from
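
optionally, before queueing a 48-node job, check on one GPU node that the wheel you just installed is the CUDA 11.3 build and can see GPUs. a small sketch (plain torch introspection, nothing cluster-specific):

# sanity check of the installed wheel, run inside the venv on a GPU node
import torch

print("torch:", torch.__version__)            # expect a +cu113 build
print("cuda build:", torch.version.cuda)      # expect 11.3
print("cuda available:", torch.cuda.is_available())
print("gpu count:", torch.cuda.device_count())
print("bundled nccl:", torch.cuda.nccl.version())
# note: simple.sh LD_PRELOADs /opt/nccl/build/lib/libnccl.so, so the NCCL actually used
# at job time can differ from the one bundled with the wheel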
simple.sh:

#!/bin/bash
#SBATCH --partition=compute-od-gpu
#SBATCH --job-name=debug_torch_init
#SBATCH --nodes 48
#SBATCH --ntasks-per-node 8
#SBATCH --cpus-per-gpu=6
#SBATCH --gres=gpu:8
#SBATCH --output=%x_%j.out
#SBATCH --comment "Key=Monitoring,Value=ON"
#SBATCH --exclusive

module load intelmpi
source /opt/intel/mpi/latest/env/vars.sh
export LD_LIBRARY_PATH=/opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:$LD_LIBRARY_PATH
export NCCL_PROTO=simple
export PATH=/opt/amazon/efa/bin:$PATH
export LD_PRELOAD="/opt/nccl/build/lib/libnccl.so"
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
#export NCCL_ALGO=ring
export NCCL_DEBUG=info
#export NCCL_DEBUG_SUBSYS=INIT,ENV,GRAPH,COLL
export PYTHONFAULTHANDLER=1
export CUDA_LAUNCH_BLOCKING=0
export OMPI_MCA_mtl_base_verbose=1
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64
export NCCL_TREE_THRESHOLD=0
export TORCH_CPP_LOG_LEVEL=INFO
export TORCH_DISTRIBUTED_DEBUG=INFO
#export NCCL_P2P_DISABLE=1
#export NCCL_IBEXT_DISABLE=1
#export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond"

# sent to sub script
export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
echo $HOSTNAMES
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=12802
export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
echo go $COUNT_NODE
echo $HOSTNAMES

source /fsx/rom1504/open_clip/.env/bin/activate
#export I_MPI_PMI_LIBRARY=/opt/slurm/lib/libpmi.so
srun --cpu_bind=v --accel-bind=gn python /fsx/rom1504/open_clip/simple_init.py
simple_init.py:

import torch
import argparse
import os


def world_info_from_env():
    # pick up local rank / global rank / world size from whichever launcher set them
    # (SLURM, Intel MPI / PMI, Open MPI, or torchrun-style RANK/WORLD_SIZE variables)
    local_rank = 0
    for v in ('SLURM_LOCALID', 'MPI_LOCALRANKID', 'OMPI_COMM_WORLD_LOCAL_RANK', 'LOCAL_RANK'):
        if v in os.environ:
            local_rank = int(os.environ[v])
            break
    global_rank = 0
    for v in ('SLURM_PROCID', 'PMI_RANK', 'OMPI_COMM_WORLD_RANK', 'RANK'):
        if v in os.environ:
            global_rank = int(os.environ[v])
            break
    world_size = 1
    for v in ('SLURM_NTASKS', 'PMI_SIZE', 'OMPI_COMM_WORLD_SIZE', 'WORLD_SIZE'):
        if v in os.environ:
            world_size = int(os.environ[v])
            break
    return local_rank, global_rank, world_size


local_rank, global_rank, world_size = world_info_from_env()

# env:// rendezvous: MASTER_ADDR and MASTER_PORT are exported by simple.sh
torch.distributed.init_process_group(
    backend="nccl",
    init_method="env://",
    world_size=world_size,
    rank=global_rank,
)
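
simple_init.py only checks that all 384 ranks (48 nodes x 8 tasks) can rendezvous and build the process group. if that works and you also want to exercise the NCCL/EFA transport, a variant like the following could be launched the same way (a sketch only, not one of the files above; it assumes the same srun launch and the MASTER_ADDR/MASTER_PORT exported by simple.sh):

# sketch: same env:// + NCCL init as simple_init.py, plus one all_reduce as a transport check
import os
import torch
import torch.distributed as dist

local_rank = int(os.environ.get("SLURM_LOCALID", 0))
rank = int(os.environ.get("SLURM_PROCID", 0))
world_size = int(os.environ.get("SLURM_NTASKS", 1))

torch.cuda.set_device(local_rank)                 # one GPU per task, chosen by local rank
dist.init_process_group(backend="nccl", init_method="env://",
                        rank=rank, world_size=world_size)

t = torch.ones(1, device="cuda")
dist.all_reduce(t)                                # default op is SUM across all ranks
if rank == 0:
    # with --nodes 48 and --ntasks-per-node 8 this should print 384.0
    print("all_reduce result:", t.item())
dist.barrier()
dist.destroy_process_group()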
simple_init_torchrun.py:

import torch
import os

print("SLURM_PROCID", os.environ["SLURM_PROCID"])

# torchrun already exports RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT,
# so env:// needs no explicit rank or world_size here
torch.distributed.init_process_group(
    backend="nccl",
    init_method="env://",
)
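
for the torchrun path, the launcher itself exports RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT to every worker, which is why simple_init_torchrun.py can call init_process_group with nothing but env://. if you also want each worker pinned to its GPU, a possible variant (again just a sketch, not one of the files above):

# sketch of a torchrun-launched variant that also binds each worker to a GPU
import os
import torch
import torch.distributed as dist

local_rank = int(os.environ["LOCAL_RANK"])        # set by torchrun for each worker
torch.cuda.set_device(local_rank)

dist.init_process_group(backend="nccl", init_method="env://")
print("rank", dist.get_rank(), "of", dist.get_world_size(), "on local gpu", local_rank)

dist.barrier()
dist.destroy_process_group()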
simple_torchrun.sh:

#!/bin/bash
#SBATCH --partition=compute-od-gpu
#SBATCH --job-name=debug_torch_init
#SBATCH --nodes 48
#SBATCH --ntasks-per-node 8
#SBATCH --cpus-per-gpu=6
#SBATCH --gres=gpu:8
#SBATCH --output=%x_%j.out
#SBATCH --comment "Key=Monitoring,Value=ON"
#SBATCH --exclusive

module load intelmpi
source /opt/intel/mpi/latest/env/vars.sh
export LD_LIBRARY_PATH=/opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:$LD_LIBRARY_PATH
export NCCL_PROTO=simple
export PATH=/opt/amazon/efa/bin:$PATH
export LD_PRELOAD="/opt/nccl/build/lib/libnccl.so"
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
#export NCCL_ALGO=ring
export NCCL_DEBUG=info
#export NCCL_DEBUG_SUBSYS=INIT,ENV,GRAPH,COLL
export PYTHONFAULTHANDLER=1
export CUDA_LAUNCH_BLOCKING=0
export OMPI_MCA_mtl_base_verbose=1
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64
export NCCL_TREE_THRESHOLD=0
export TORCH_CPP_LOG_LEVEL=INFO
export TORCH_DISTRIBUTED_DEBUG=INFO
#export NCCL_P2P_DISABLE=1
#export NCCL_IBEXT_DISABLE=1
#export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond"

# sent to sub script
export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
echo $HOSTNAMES
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=12802
export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
echo go $COUNT_NODE
echo $HOSTNAMES

source /fsx/rom1504/open_clip/.env/bin/activate
#export I_MPI_PMI_LIBRARY=/opt/slurm/lib/libpmi.so
srun --cpu_bind=v --accel-bind=gn python /fsx/rom1504/open_clip/simple_init.py

export LAUNCHER="torchrun --nproc_per_node 8 \
    --nnodes $SLURM_NNODES \
    --rdzv_endpoint $MASTER_ADDR \
    --rdzv_backend c10d \
    --rdzv_id $SLURM_JOBID \
    --max_restarts 0 \
    --tee 3"
echo $LAUNCHER

export TRAIN_CMD="/fsx/rom1504/open_clip/simple_init_torchrun.py"

# NOTE we are delaying expansion of SLURM_PROCID here, so that node ranks are in env
srun bash -c "$LAUNCHER --node_rank \$SLURM_PROCID $TRAIN_CMD"