Skip to content

Instantly share code, notes, and snippets.

@MikeDacre
Created February 26, 2016 20:12
Show Gist options
  • Save MikeDacre/c2875bcff4ccae8771a3 to your computer and use it in GitHub Desktop.
Save MikeDacre/c2875bcff4ccae8771a3 to your computer and use it in GitHub Desktop.
SLURM DMTCP Test
#!/bin/bash
#SBATCH --partition=hbfraser
#SBATCH --time=00:02:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --signal=36@30
#----------------------------- Set up DMTCP environment for a job ------------#
###############################################################################
# Start DMTCP coordinator on the launching node. Free TCP port is automatically
# allocated. This function creates a dmtcp_command.$JOBID script, which serves
# as a wrapper around dmtcp_command. The script tunes dmtcp_command for the
# exact dmtcp_coordinator (its hostname and port). Instead of typing
# "dmtcp_command -h <coordinator hostname> -p <coordinator port> <command>",
# you just type "dmtcp_command.$JOBID <command>" and talk to the coordinator
# for JOBID job.
###############################################################################
# Load the dmtcp environment module so the dmtcp_* commands used below are
# expected to be on PATH.
module load dmtcp
# Per-job file name. start_coordinator first passes it to dmtcp_coordinator as
# the port file and then overwrites it with the dmtcp_command wrapper script.
fname="dmtcp_command.${SLURM_JOBID}"
#######################################
# Start a DMTCP coordinator daemon on this node and publish its address.
# Globals:
#   fname             (read)    file given to dmtcp_coordinator as its port
#                               file; afterwards overwritten with an
#                               executable dmtcp_command wrapper script
#   DMTCP_COORD_HOST  (written) exported, set to this node's hostname
#   DMTCP_COORD_PORT  (written) exported, set to the port read from $fname
# Arguments:
#   "$@" - extra options forwarded verbatim to dmtcp_coordinator
# Returns:
#   0 on success. Exits the whole script with status 1 when
#   dmtcp_coordinator cannot be found or no port shows up in time.
#######################################
start_coordinator()
{
  ############################################################
  # For debugging when launching a custom coordinator, uncomment
  # the following lines and provide the proper host and port for
  # the coordinator.
  ############################################################
  # export DMTCP_COORD_HOST=<coordinator hostname>
  # export DMTCP_COORD_PORT=<coordinator port>
  # return

  # Upper bound (seconds) on how long to wait for the port file.
  local -r max_wait=60
  local host port
  local waited=0

  if ! command -v dmtcp_coordinator >/dev/null 2>&1; then
    echo "No dmtcp_coordinator found. Check your DMTCP installation and PATH settings." >&2
    # Non-zero so the scheduler does not record a failed setup as success.
    exit 1
  fi
  host=$(hostname)

  # "-p 0" together with --port-file: the port actually used is read back
  # from $fname below.
  dmtcp_coordinator --daemon --exit-on-last -p 0 --port-file "$fname" "$@" \
    >/dev/null 2>&1

  # Poll until the port file exists and is non-empty, sleeping between
  # attempts and giving up after max_wait seconds instead of spinning forever.
  port=
  until [[ -s "$fname" ]] && port=$(<"$fname") && [[ -n "$port" ]]; do
    if (( waited >= max_wait )); then
      echo "dmtcp_coordinator did not write a port to $fname within ${max_wait}s." >&2
      exit 1
    fi
    sleep 1
    waited=$(( waited + 1 ))
  done

  # Create dmtcp_command wrapper for easy communication with coordinator.
  # %q shell-quotes the values so a PATH containing spaces or other special
  # characters survives being written into the generated script.
  {
    printf '#!/bin/bash\n\n'
    printf 'export PATH=%q\n' "$PATH"
    printf 'export DMTCP_COORD_HOST=%q\n' "$host"
    printf 'export DMTCP_COORD_PORT=%q\n' "$port"
    # Single quotes on purpose: "$@" must be expanded by the wrapper itself.
    printf 'dmtcp_command "$@"\n'
  } > "$fname"
  chmod +x "$fname"

  # Set up local environment for DMTCP
  export DMTCP_COORD_HOST=$host
  export DMTCP_COORD_PORT=$port
}
#######################################
# Ask the coordinator for a final checkpoint, shut it down, and remove the
# per-job dmtcp_command wrapper.
# Globals:
#   fname (read) - path of the wrapper script written by start_coordinator
# Outputs:
#   "Cleaning!" / "Cleaned!" progress lines on stdout
# Returns:
#   Never returns; always exits the script with status 42.
#######################################
cleanup() {
  local wrapper=$fname
  # A command name without a slash is searched for in PATH, not in the
  # current directory, so a bare file name must be anchored with "./".
  if [[ "$wrapper" != */* ]]; then
    wrapper="./$wrapper"
  fi

  echo "Cleaning!"
  # -bc: per the dmtcp_command documentation, a blocking checkpoint.
  "$wrapper" -bc
  sleep 1
  # -q: per the dmtcp_command documentation, tells the coordinator to quit.
  "$wrapper" -q
  rm -- "$wrapper"
  echo "Cleaned!"
  exit 42
}
###################################################################################
# Print out the SLURM job information. Remove this if you don't need it.
###################################################################################
# Job diagnostics. Expansions are quoted so values are printed verbatim
# instead of being word-split or glob-expanded.
echo "SLURM_JOBID=${SLURM_JOBID}"
echo "SLURM_JOB_NODELIST=${SLURM_JOB_NODELIST}"
echo "SLURM_NNODES=${SLURM_NNODES}"
echo "SLURMTMPDIR=${SLURMTMPDIR}"
echo "working directory = ${SLURM_SUBMIT_DIR}"
# changedir to workdir
# Abort rather than run the job in the wrong directory: ":?" stops the script
# when the variable is unset or empty, "|| exit 1" when the cd itself fails.
cd "${SLURM_SUBMIT_DIR:?SLURM_SUBMIT_DIR is not set}" || exit 1
#------------------------------------- Launch application ---------------------#
module load python
################################################################################
# 1. Start DMTCP coordinator
################################################################################
start_coordinator # -i 120
# Launch test.py under dmtcp_launch through srun. This stays the last command
# so the script's exit status is srun's exit status.
srun dmtcp_launch \
  --with-plugin /home/dacre/dmtcp/dmtcp-2.4.4/contrib/slurm-ft/slurm-ft.o \
  --checkpoint-open-files \
  python test.py
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment