Created
May 15, 2023 23:24
-
-
Save smutch/5ae94f5df5806d44edf4fb833bbae5b0 to your computer and use it in GitHub Desktop.
sh: MPS SLURM wrapper script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Author: Simon Mutch <[email protected]>
# Date: 2018-08-14
#
# MPS SLURM wrapper script: runs the command given as arguments with the
# CUDA MPS environment variables set for the calling rank.

set -e # exit on error
#######################################
# Print the log directory used for the MPS daemon of one device.
# Globals:   JOBFS (read) - base directory; must be set and non-empty
# Arguments: $1 - device index
# Outputs:   "$JOBFS/log_mps-<index>" on stdout
# Returns:   0 on success, 1 (with a message on stderr) if JOBFS is empty
#######################################
get_log_dir() {
  # Without this guard an empty JOBFS would silently yield "/log_mps-N".
  if [[ -z "${JOBFS:-}" ]]; then
    echo "get_log_dir: JOBFS is not set" >&2
    return 1
  fi
  printf '%s\n' "$JOBFS/log_mps-$1"
}
#######################################
# Print the pipe directory used for the MPS daemon of one device.
# Globals:   JOBFS (read) - base directory; must be set and non-empty
# Arguments: $1 - device index
# Outputs:   "$JOBFS/mps-<index>" on stdout
# Returns:   0 on success, 1 (with a message on stderr) if JOBFS is empty
#######################################
get_pipe_dir() {
  # Without this guard an empty JOBFS would silently yield "/mps-N".
  if [[ -z "${JOBFS:-}" ]]; then
    echo "get_pipe_dir: JOBFS is not set" >&2
    return 1
  fi
  printf '%s\n' "$JOBFS/mps-$1"
}
#######################################
# Count the entries of a comma-separated device list.
# Counting entries (rather than characters) keeps the result correct when
# an id has more than one digit, e.g. "8,9,10,11" is 4 devices.
# Arguments: $1 - comma-separated list of device ids (may be empty)
# Outputs:   the number of entries on stdout (0 for an empty list)
#######################################
count_devices() {
  local gpu_list=${1:-}
  local -a ids=()
  if [[ -n "$gpu_list" ]]; then
    IFS=',' read -r -a ids <<< "$gpu_list"
  fi
  printf '%s\n' "${#ids[@]}"
}

# number of devices on each node
NDEVICES=$(count_devices "${SLURM_STEP_GPUS:-}")
#######################################
# Start one MPS control daemon per device, each with its own log and pipe
# directory and with only that device visible to it.
# Globals:   NDEVICES (read) - number of daemons to start
# Outputs:   one status line per started daemon on stdout
# Returns:   0 on success, 1 as soon as any step fails
#######################################
start_mps_daemons() {
  local i log_dir pipe_dir
  for (( i = 0; i < NDEVICES; i++ )); do
    log_dir=$(get_log_dir "$i") || return 1
    pipe_dir=$(get_pipe_dir "$i") || return 1
    # -p: do not fail if a directory is still present from an earlier run.
    mkdir -p -- "$log_dir" "$pipe_dir" || return 1
    CUDA_VISIBLE_DEVICES=$i \
      CUDA_MPS_LOG_DIRECTORY=$log_dir \
      CUDA_MPS_PIPE_DIRECTORY=$pipe_dir \
      nvidia-cuda-mps-control -d || return 1
    echo "$(hostname): Started mps-control for device $i in $pipe_dir"
  done
}

# use the first rank of each node to spawn an mps controller
# NOTE(review): other ranks do not wait for this to finish before they
# continue - confirm the launched program tolerates that.
if [[ "${SLURM_LOCALID:-}" == 0 ]]; then
  start_mps_daemons
fi
# Set the environment variables for each rank.
# Note that this is where the decision of which rank uses which GPU is being made...

#######################################
# Print the leading CPU count of a SLURM_JOB_CPUS_PER_NODE-style value,
# i.e. everything before the first "(" or ",".
# Examples: "32(x2)" -> 32, "32" -> 32, "32(x2),16" -> 32
# Arguments: $1 - the value to parse
# Outputs:   the leading count on stdout
#######################################
first_node_cpus() {
  printf '%s\n' "${1%%[(,]*}"
}

#######################################
# Map a node-local rank to a device index so that consecutive ranks share
# a device: (rank * ndevices / local_size) % ndevices.
# Arguments: $1 - node-local rank (integer >= 0)
#            $2 - number of devices (integer > 0)
#            $3 - local size used to partition the ranks (integer > 0)
# Outputs:   the device index on stdout
# Returns:   0 on success, 1 (with a message on stderr) on invalid input,
#            instead of failing with a division by zero
#######################################
device_for_rank() {
  local rank=${1:-} ndev=${2:-} size=${3:-}
  local nonneg='^(0|[1-9][0-9]*)$'
  local pos='^[1-9][0-9]*$'
  if ! [[ "$rank" =~ $nonneg && "$ndev" =~ $pos && "$size" =~ $pos ]]; then
    printf 'device_for_rank: invalid rank/ndevices/local_size: "%s" "%s" "%s"\n' \
      "$rank" "$ndev" "$size" >&2
    return 1
  fi
  printf '%s\n' "$(( (rank * ndev / size) % ndev ))"
}

# NOTE(review): local_size is a CPU count, not a task count; if fewer ranks
# than CPUs run on a node the ranks will not spread evenly - confirm intent.
local_size=$(first_node_cpus "${SLURM_JOB_CPUS_PER_NODE:-}")
# # TEMP --------
# local_size=32
# NDEVICES=1
# SLURM_LOCALID=$OMPI_COMM_WORLD_NODE_RANK
# # ---------------
my_device=$(device_for_rank "${SLURM_LOCALID:-}" "${NDEVICES:-}" "$local_size") # consecutive ranks share
# --- OR ---
# my_device=$(( $SLURM_LOCALID % $NDEVICES )) # alternate
# Device index is always 0 here; the actual GPU is selected through the
# pipe directory of the chosen MPS daemon.
export CUDA_VISIBLE_DEVICES=0
export CUDA_MPS_PIPE_DIRECTORY=$JOBFS/mps-$my_device
#######################################
# Once we've exited our program, stop the controllers, copy the logs into
# the current directory and remove the per-device directories.
# Only acts when SLURM_LOCALID is 0. Safe to call more than once (e.g. from
# both a signal trap and the EXIT trap): only the first call does any work.
# A failing step is reported on stderr and does not stop the remaining
# steps, so one missing log file cannot leave directories behind.
# Globals:   SLURM_LOCALID (read), NDEVICES (read),
#            _mps_cleanup_done (written) - marks that cleanup already ran
# Outputs:   one status line per device on stdout, warnings on stderr
# Returns:   0
#######################################
cleanup() {
  if [[ "${SLURM_LOCALID:-}" != 0 ]]; then
    return 0
  fi
  if [[ -n "${_mps_cleanup_done:-}" ]]; then
    return 0
  fi
  _mps_cleanup_done=1

  local i log_dir pipe_dir log_name host
  host=$(hostname) || true
  for (( i = 0; i < NDEVICES; i++ )); do
    log_dir=$(get_log_dir "$i") || continue
    pipe_dir=$(get_pipe_dir "$i") || continue

    if ! echo "quit" | CUDA_MPS_PIPE_DIRECTORY=$pipe_dir nvidia-cuda-mps-control; then
      echo "$host: could not stop mps-control for device $i at $pipe_dir" >&2
    fi

    # A log file may be absent; copy only what exists.
    for log_name in control server; do
      if [[ -f "$log_dir/$log_name.log" ]]; then
        cp -- "$log_dir/$log_name.log" "./mps-$log_name-${host}_$i.log" \
          || echo "$host: could not copy $log_dir/$log_name.log" >&2
      fi
    done

    rm -rf -- "$log_dir" "$pipe_dir" \
      || echo "$host: could not remove $log_dir and/or $pipe_dir" >&2
    echo "$host: Stopped mps-control for device $i at $pipe_dir"
  done
  return 0
}
# Run cleanup exactly once, on every exit path:
#  - EXIT covers normal termination and aborts caused by 'set -e';
#  - INT/TERM are turned into an exit (128 + signal number) so the EXIT trap
#    fires and the script does not carry on after the handler returns;
#  - KILL is not listed because it cannot be trapped.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
# Run the actual executable!
# Note the absence of 'srun' or 'mpiexec' etc. here...
"$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment