#!/bin/bash
# prior.sh — per-node launch script; invoked once per node by srun from the sbatch script below.

# Map the SLURM environment into local variables.
LOCAL_ID=$SLURM_LOCALID
GLOBAL_RANK=$SLURM_PROCID
NODE_ID=$SLURM_NODEID
NUM_PROCS=$SLURM_NTASKS
CPUS=$SLURM_CPUS_PER_GPU
NUM_NODES=$SLURM_NNODES

# Use the first node in the allocation as the rendezvous (main process) address;
# `scontrol show hostnames` expands the compressed node list to one hostname per line.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
# Print the launch configuration once, from global rank 0 only.
if [ "$GLOBAL_RANK" -eq 0 ]; then
    echo -e "MASTER ADDR: $MASTER_ADDR\tLOCAL ID: $LOCAL_ID\tGLOBAL RANK: $GLOBAL_RANK\tNODE ID: $NODE_ID\tNUM PROCS: $NUM_PROCS\tCPUS PER TASK: $CPUS"
fi
# Log the raw node list for debugging.
echo "$SLURM_JOB_NODELIST"
# start virtual environment
source /fsx/nousr/DALLE2-pytorch/.env/bin/activate
# empty torch cache before starting
python3 -c "import torch; torch.cuda.empty_cache()"
# Launch the training run: one process per GPU across all nodes.
# NUM_PROCS is one task per node, and each node exposes 8 GPUs (gres=gpu:8),
# so the total process count is nodes * 8.
accelerate launch \
    --multi_gpu \
    --num_processes=$(( NUM_PROCS * 8 )) \
    --num_cpu_threads_per_process=$CPUS \
    --num_machines=$NUM_NODES \
    --machine_rank=$NODE_ID \
    --gpu_ids="0,1,2,3,4,5,6,7" \
    --mixed_precision="no" \
    --main_process_ip=$MASTER_ADDR \
    --main_process_port=3068 \
    /fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py \
    --config_file /fsx/nousr/DALLE2-pytorch/configs/h_14_prior.json
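
# Optional smoke test (an assumption for illustration, not part of the original gist):
# the same entry point and config can be exercised on a single GPU without SLURM to
# catch config or environment errors before queuing the full 2-node job, e.g.:
#
#   source /fsx/nousr/DALLE2-pytorch/.env/bin/activate
#   accelerate launch \
#       --num_processes=1 \
#       --mixed_precision="no" \
#       /fsx/nousr/DALLE2-pytorch/train_diffusion_prior.py \
#       --config_file /fsx/nousr/DALLE2-pytorch/configs/h_14_prior.json
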
#!/bin/bash
# sbatch submission script: requests 2 nodes with 8 GPUs each and runs prior.sh
# (above) once per node.
#SBATCH --partition=gpu
#SBATCH --job-name=h-14-prior
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-gpu=6
#SBATCH --gres=gpu:8
#SBATCH --output=%x_%j.out
#SBATCH --comment=dalle2
#SBATCH --exclusive
# Run prior.sh on every node in the allocation (one task per node); Accelerate inside
# prior.sh then spawns the eight per-GPU worker processes on each node.
srun --comment dalle2 prior.sh
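
# Usage sketch (assumptions, not part of the original gist): the batch file name below
# is hypothetical, and prior.sh must be executable since srun calls it directly.
#
#   chmod +x prior.sh
#   sbatch h_14_prior.sbatch
#   squeue -u "$USER"                  # confirm the 2-node job is pending/running
#   tail -f h-14-prior_<jobid>.out     # --output=%x_%j.out expands to <job-name>_<jobid>.out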