Created
July 21, 2023 23:01
-
-
Save Neeratyoy/f1bc8566f438f3ec210e8d0327c9dd54 to your computer and use it in GitHub Desktop.
The goal of this script is to serve as an example of ensuring that all sub-tasks within a job are allocated different GPUs.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
################################################################
# The goal of this script:
#   Run a job where 4 workers are triggered AT THE SAME TIME
#   and each worker performs a model training using one GPU.
#   Run an array job of such jobs.
#
# NOTE: the solutions to SLURM issues are dependent on the
#   setup and vary from cluster to cluster.
################################################################
#SBATCH --time 2-00:00
#SBATCH --job-name max24hrs
#SBATCH --partition ...
#SBATCH --array 0-11%4
#SBATCH --error ...
#SBATCH --output ...
#SBATCH --gres=gpu:4
#SBATCH -c 8
#SBATCH --mem-per-cpu 12000

# Fan out 4 concurrent job steps. Each srun step requests 1 of the job's
# 4 GPUs and 2 of its 8 CPUs; --exclusive asks SLURM to give each step
# resources not shared with the other concurrently running steps, which
# is what ensures every sub-task lands on a different GPU.
# NOTE(review): on newer SLURM versions --exact (and a per-step --mem)
# may also be needed for steps to run truly in parallel — confirm on
# your cluster.
for i in {1..4}; do
  # FIX: the original ran `python -m script_to_run_.py`, but `-m` expects
  # a module name WITHOUT the .py extension; invoke the script directly.
  # The trailing & backgrounds the step so all 4 launch at the same time.
  srun --ntasks 1 --cpus-per-task 2 --gres=gpu:1 --exclusive \
    python script_to_run.py --experiment_args "$SLURM_ARRAY_TASK_ID" &
done

# Block until every background step finishes; without this the batch
# script exits immediately and SLURM kills the still-running steps.
wait
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment