Created
February 26, 2016 20:12
-
-
Save MikeDacre/c2875bcff4ccae8771a3 to your computer and use it in GitHub Desktop.
SLURM DMTCP Test
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#SBATCH --partition=hbfraser
#SBATCH --time=00:02:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --signal=36@30
# The --signal directive above asks SLURM to deliver signal 36 roughly
# 30 seconds before the job reaches its time limit.
#----------------------------- Set up DMTCP environment for a job ------------#
###############################################################################
# start_coordinator (defined below) starts a DMTCP coordinator on the
# launching node. A free TCP port is allocated automatically. The function
# creates a dmtcp_command.$JOBID script, which serves as a wrapper around
# dmtcp_command, preconfigured for this job's dmtcp_coordinator (its hostname
# and port). Instead of typing
#   "dmtcp_command -h <coordinator hostname> -p <coordinator port> <command>",
# you just type "dmtcp_command.$JOBID <command>" to talk to the coordinator
# of job JOBID.
###############################################################################
module load dmtcp
# Relative path of the coordinator port file; start_coordinator later
# overwrites the same file with the dmtcp_command wrapper script.
fname=dmtcp_command.$SLURM_JOBID
###############################################################################
# Start a DMTCP coordinator on this node and publish how to reach it.
#
# Launches dmtcp_coordinator as a daemon on an automatically chosen port
# (-p 0), waits until it has written that port to "$fname", then replaces
# "$fname" with an executable wrapper script that runs dmtcp_command against
# this coordinator.
#
# Globals:
#   fname  (read)     path of the port file / wrapper script
#   h, p   (written)  coordinator hostname and port (kept global, as before)
#   DMTCP_COORD_HOST, DMTCP_COORD_PORT (exported)
# Arguments:
#   "$@"   extra options passed through to dmtcp_coordinator (e.g. -i 120)
# Exits:
#   1 if dmtcp_coordinator is not available or never reports a port.
###############################################################################
start_coordinator()
{
    ############################################################
    # For debugging when launching a custom coordinator, uncomment
    # the following lines and provide the proper host and port for
    # the coordinator.
    ############################################################
    # export DMTCP_COORD_HOST=$h
    # export DMTCP_COORD_PORT=$p
    # return

    # 300 polls x 0.1 s: wait at most ~30 s for the port file.
    local polls_left=300

    if ! command -v dmtcp_coordinator >/dev/null 2>&1; then
        echo "No dmtcp_coordinator found. Check your DMTCP installation and PATH settings." >&2
        exit 1
    fi

    h=$(hostname)

    # A leftover file from an earlier run (e.g. the old wrapper script) must
    # not be mistaken for the port written by the new coordinator.
    rm -f -- "$fname"

    dmtcp_coordinator --daemon --exit-on-last -p 0 --port-file "$fname" "$@" 1>/dev/null 2>&1

    # Poll until the coordinator has written a numeric port to the port file.
    p=
    while true; do
        if [ -f "$fname" ]; then
            p=$(cat -- "$fname")
            # try to communicate ? dmtcp_command -p $p l
            if [[ "$p" =~ ^[0-9]+$ ]]; then
                break
            fi
        fi
        polls_left=$((polls_left - 1))
        if [ "$polls_left" -le 0 ]; then
            echo "dmtcp_coordinator did not report a port in $fname. Giving up." >&2
            exit 1
        fi
        sleep 0.1   # do not burn a core while waiting
    done

    # Create dmtcp_command wrapper for easy communication with coordinator.
    # %q shell-quotes the values so a PATH containing spaces stays intact.
    {
        printf '#!/bin/bash\n\n'
        printf 'export PATH=%q\n' "$PATH"
        printf 'export DMTCP_COORD_HOST=%q\n' "$h"
        printf 'export DMTCP_COORD_PORT=%q\n' "$p"
        printf 'dmtcp_command "$@"\n'
    } > "$fname"
    chmod +x -- "$fname"

    # Set up local environment for DMTCP
    export DMTCP_COORD_HOST=$h
    export DMTCP_COORD_PORT=$p
}
###############################################################################
# Checkpoint the job, shut the coordinator down, and remove the wrapper.
#
# Runs the wrapper script "$fname" with -bc, pauses one second, runs it again
# with -q, deletes the wrapper and terminates the script.
#
# Globals:
#   fname (read)  path of the dmtcp_command wrapper script
# Exits:
#   always, with status 42
###############################################################################
cleanup() {
    # A bare file name would be searched for in PATH rather than in the
    # current directory, so give it an explicit ./ prefix.
    local wrapper=$fname
    case "$wrapper" in
        */*) ;;
        *) wrapper=./$wrapper ;;
    esac

    echo "Cleaning!"
    "$wrapper" -bc
    sleep 1
    "$wrapper" -q
    rm -- "$wrapper"
    echo "Cleaned!"
    exit 42
}
###################################################################################
# Print out the SLURM job information. Remove this if you don't need it.
###################################################################################
echo "SLURM_JOBID=$SLURM_JOBID"
echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
echo "SLURM_NNODES=$SLURM_NNODES"
echo "SLURMTMPDIR=$SLURMTMPDIR"
echo "working directory = $SLURM_SUBMIT_DIR"

# Change to the submission directory. Stop here if that fails, so the
# coordinator files and the application are not started in the wrong place.
cd "$SLURM_SUBMIT_DIR" || {
    echo "Cannot change to working directory '$SLURM_SUBMIT_DIR'" >&2
    exit 1
}

#------------------------------------- Launch application ---------------------#
module load python

################################################################################
# 1. Start DMTCP coordinator
################################################################################
start_coordinator # -i 120

################################################################################
# 2. Launch the application under DMTCP control
################################################################################
# NOTE(review): cleanup is defined earlier in this script but is not invoked
# anywhere in the visible code - confirm whether a trap or explicit call was
# intended.
srun dmtcp_launch --with-plugin /home/dacre/dmtcp/dmtcp-2.4.4/contrib/slurm-ft/slurm-ft.o --checkpoint-open-files python test.py
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment