Created
July 1, 2025 18:24
-
-
Save naufalso/8e8101fb3542ac106d0954e0f3117715 to your computer and use it in GitHub Desktop.
Accelerate Submit SLURM
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# SLURM batch script: run an Accelerate example on ONE node with 4 GPUs.
#
# Usage:   sbatch <this-file>
# Env:     ACCELERATE_DIR  root of the accelerate checkout (default: /accelerate)
# Needs:   activateEnvironment.sh in the submission directory; presumably it
#          puts `accelerate` on PATH -- confirm against your cluster setup.
#
# NOTE: keep the scheduler directives below ahead of the first executable
# command, otherwise the scheduler stops reading them.
#SBATCH --job-name=multigpu
#SBATCH -D .
#SBATCH --output=O-%x.%j
#SBATCH --error=E-%x.%j
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1          # number of MP tasks
#SBATCH --gres=gpu:4                 # number of GPUs per node
#SBATCH --cpus-per-task=160          # number of cores per task
#SBATCH --time=01:59:00              # maximum execution time (HH:MM:SS)

######################
### Set environment ###
######################
source activateEnvironment.sh
# Must match the GPU count requested with --gres above.
export GPUS_PER_NODE=4
######################

export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}"
export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py"
# Kept as a single whitespace-separated string; it is word-split on purpose
# when passed to the launcher below.
export SCRIPT_ARGS=" \
    --mixed_precision fp16 \
    --output_dir ${ACCELERATE_DIR}/examples/output \
    --with_tracking \
    "

# One launcher process on this node spawns GPUS_PER_NODE worker processes.
# shellcheck disable=SC2086  # SCRIPT_ARGS must be split into separate words
accelerate launch --num_processes "$GPUS_PER_NODE" "$SCRIPT" $SCRIPT_ARGS
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# SLURM batch script: run an Accelerate example across 4 nodes x 4 GPUs.
#
# Usage:   sbatch <this-file>
# Env:     ACCELERATE_DIR  root of the accelerate checkout (default: /accelerate)
# Needs:   activateEnvironment.sh in the submission directory; presumably it
#          puts `accelerate` on PATH -- confirm against your cluster setup.
#
# NOTE: keep the scheduler directives below ahead of the first executable
# command, otherwise the scheduler stops reading them.
#SBATCH --job-name=multinode
#SBATCH -D .
#SBATCH --output=O-%x.%j
#SBATCH --error=E-%x.%j
#SBATCH --nodes=4                    # number of nodes
#SBATCH --ntasks-per-node=1          # number of MP tasks
#SBATCH --gres=gpu:4                 # number of GPUs per node
#SBATCH --cpus-per-task=160          # number of cores per task
#SBATCH --time=01:59:00              # maximum execution time (HH:MM:SS)

######################
### Set environment ###
######################
source activateEnvironment.sh
# Must match the GPU count requested with --gres above.
export GPUS_PER_NODE=4
######################

######################
#### Set network #####
######################
# First host of the allocation acts as the rendezvous endpoint. Despite the
# variable name, this is whatever `scontrol show hostnames` prints first.
head_node_ip=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
######################

# Total process count = nodes * GPUs per node.
export LAUNCHER="accelerate launch \
    --num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \
    --num_machines $SLURM_NNODES \
    --rdzv_backend c10d \
    --main_process_ip $head_node_ip \
    --main_process_port 29500 \
    "
export ACCELERATE_DIR="${ACCELERATE_DIR:-/accelerate}"
export SCRIPT="${ACCELERATE_DIR}/examples/complete_nlp_example.py"
export SCRIPT_ARGS=" \
    --mixed_precision fp16 \
    --output_dir ${ACCELERATE_DIR}/examples/output \
    "

# This step is necessary because accelerate launch does not handle multiline arguments properly
# The command is assembled from SCRIPT / SCRIPT_ARGS defined above (the
# previous $PYTHON_FILE / $ARGS were never set, so no script was launched).
export CMD="$LAUNCHER $SCRIPT $SCRIPT_ARGS"
# With --ntasks-per-node=1, srun starts one launcher per node.
# shellcheck disable=SC2086  # CMD must be split into separate words
srun $CMD
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment