Skip to content

Instantly share code, notes, and snippets.

@s-sajid-ali
Created December 22, 2022 17:07
Show Gist options
  • Save s-sajid-ali/f54221641e7df1843b470770dea2599d to your computer and use it in GitHub Desktop.
Save s-sajid-ali/f54221641e7df1843b470770dea2599d to your computer and use it in GitHub Desktop.
Submit script for the ICARUS workflow on ALCF Theta that results in a crash.
#!/bin/bash
#COBALT -A HEP_on_HPC
#COBALT -n 4
#COBALT -t 01:00:00
#COBALT --mode script

# Job-wide configuration for the ICARUS workflow submit script: daemon rank
# layout, file locations, and the environment variables exported to every
# process launched later in this script.

# Daemon layout. Ranks-per-node and threads-per-rank are derived from the node
# count so that only NUM_DAEMON_NODES needs editing.
# NOTE(review): 16 (ranks per node) and 64 (presumably hardware threads per
# node) are hard-coded here -- confirm against the target machine.
export NUM_DAEMON_NODES=1
export NUM_DAEMON_TOTAL_RANKS=$((NUM_DAEMON_NODES * 16))
export NUM_DAEMON_HYPERTHREADS=1
export NUM_DAEMON_RANKS_PER_NODE=$((NUM_DAEMON_TOTAL_RANKS / NUM_DAEMON_NODES))
export NUM_DAEMON_THREADS_PER_RANK=$((NUM_DAEMON_HYPERTHREADS * 64 / NUM_DAEMON_RANKS_PER_NODE))

# Names and locations shared by the rest of the script.
PDOMAIN=hepnos-sajid
SSGFILE=hepnos.ssg
CONNECTIONFILE="$(pwd)/connection.json"
ICARUSWF_BUILD=/projects/HEP_on_HPC/sajid/icarus_hepnos/icaruswf/build

# Remove leftovers from a previous run: the script later waits for ${SSGFILE}
# to appear, so a stale copy would end that wait too early.
# -f keeps a first run (nothing to delete yet) from printing errors.
rm -f -- "${SSGFILE}" "${CONNECTIONFILE}"

export MPICH_GNI_NDREG_ENTRIES=1024
export MPICH_MAX_THREAD_SAFETY=multiple

# Prepend the workflow build to the plugin and FHiCL search paths.
export CET_PLUGIN_PATH="${ICARUSWF_BUILD}/src/modules:${CET_PLUGIN_PATH}"
export FHICL_FILE_PATH="${ICARUSWF_BUILD}/fcl:${FHICL_FILE_PATH}"
# Prepare the software environment: swap modules, make sure the protection
# domain exists, and activate the spack environment.
echo "%%% start $(date)"
module unload darshan
module swap PrgEnv-intel PrgEnv-gnu
module load gcc/9.3.0
module swap cray-mpich/7.7.14 cray-mpich-abi/7.7.14
echo "%%% after_module_loads $(date)"

echo "Setting up protection domain"
# Create the protection domain only when apstat does not already list it.
# grep output is left visible on purpose so the job log shows the match.
if ! apstat -P | grep -F -- "${PDOMAIN}"; then
  apmgr pdomain -c -u "${PDOMAIN}"
fi

# activate spack environment
spack_setup=/projects/HEP_on_HPC/icaruscode/spack/share/spack/setup-env.sh
spack_env=icaruscode-9-37-02-03-vecmt-hepnos-0_7_1
if [[ -r "${spack_setup}" ]]; then
  # shellcheck source=/dev/null
  . "${spack_setup}"
  spack env activate "${spack_env}"
  spack_status=$?
else
  spack_status=1
fi
# Stop here when activation fails: the script later polls for the daemon's SSG
# file, and a daemon that cannot start would leave the job idling until its
# wall-time limit.
# NOTE(review): presumably this environment provides bedrock and the hepnos
# tools used later -- confirm.
if ((spack_status != 0)); then
  echo "ERROR: could not activate spack environment ${spack_env}" >&2
  apmgr pdomain -r -u "${PDOMAIN}"
  exit 1
fi

# options to collect diagnostic profiles
export MARGO_ENABLE_DIAGNOSTICS=0
export MARGO_ENABLE_PROFILING=0
# Launch the bedrock daemon in the background and wait until it has written
# ${SSGFILE}. All daemon output goes to server-log.
echo "%%% before_start_daemon $(date)"
aprun -n "${NUM_DAEMON_TOTAL_RANKS}" \
  -N "${NUM_DAEMON_RANKS_PER_NODE}" \
  -d "${NUM_DAEMON_THREADS_PER_RANK}" \
  -j "${NUM_DAEMON_HYPERTHREADS}" \
  -cc none \
  -p "${PDOMAIN}" \
  bedrock ofi+gni -c hepnos.json -v info &> server-log &
daemon_pid=$!
sleep 30
echo "%%% after_start_daemon $(date)"

# Poll for the SSG file, but give up when the launcher has already exited or
# when SSG_WAIT_TIMEOUT seconds (default 600) have passed. An unbounded wait
# would keep the allocation busy until the wall-time limit if the daemon
# crashed during start-up.
ssg_wait_limit=${SSG_WAIT_TIMEOUT:-600}
ssg_waited=0
while [[ ! -f "${SSGFILE}" ]]; do
  ssg_failure=
  if ! kill -0 "${daemon_pid}" 2> /dev/null; then
    ssg_failure="daemon launcher exited before ${SSGFILE} appeared"
  elif ((ssg_waited >= ssg_wait_limit)); then
    ssg_failure="timed out after ${ssg_waited}s waiting for ${SSGFILE}"
    # Stop the launcher so the protection domain is no longer in use.
    kill "${daemon_pid}" 2> /dev/null
    wait "${daemon_pid}" 2> /dev/null
  fi
  if [[ -n "${ssg_failure}" ]]; then
    echo "ERROR: ${ssg_failure}; see server-log" >&2
    apmgr pdomain -r -u "${PDOMAIN}"
    exit 1
  fi
  sleep 10
  ssg_waited=$((ssg_waited + 10))
  echo "waiting for ssgfile"
done
# Ask the running daemon for its databases and store the answer in
# ${CONNECTIONFILE}.
echo "%%% before_start_list_dbs $(date)"
aprun -n 1 -N 1 \
  -p "${PDOMAIN}" \
  hepnos-list-databases ofi+gni -s "${SSGFILE}" > "${CONNECTIONFILE}"
list_dbs_status=$?
sleep 2
echo "%%% after_start_list_dbs $(date)"

# aprun runs in the foreground and the redirection creates the file before the
# command starts, so polling for the file's existence can never wait for
# anything. Check the exit status and that something was written instead.
if ((list_dbs_status != 0)) || [[ ! -s "${CONNECTIONFILE}" ]]; then
  echo "ERROR: hepnos-list-databases failed (status ${list_dbs_status}); no usable ${CONNECTIONFILE}" >&2
  # Stop background jobs (the daemon launcher) before releasing the domain.
  # Word splitting is intended here: jobs -p prints one numeric PID per line.
  for bg_pid in $(jobs -p); do
    kill "${bg_pid}" 2> /dev/null
  done
  wait
  apmgr pdomain -r -u "${PDOMAIN}"
  exit 1
fi

sed -i '$ d' "${CONNECTIONFILE}" # we have to because aprun adds a line
# set relevant env var
# ICARUSWF_BUILD is assigned without `export` earlier in this script; exporting
# it here makes it visible to the processes launched from now on.
export ICARUSWF_BUILD=/projects/HEP_on_HPC/sajid/icarus_hepnos/icaruswf/build

# Create Queues
# Use the absolute ${CONNECTIONFILE} rather than a relative "connection.json"
# so this step does not depend on the current working directory.
aprun -n 1 -N 1 -p "${PDOMAIN}" \
  "${ICARUSWF_BUILD}/src/modules/cheesyQueue_maker" ofi+gni "${CONNECTIONFILE}" \
  DetSim HitFinding &> queue-creation-log
# Input locations and the directory all per-run output is written under.
export DATA_DIR=/projects/HEP_on_HPC/icarus_data/icaruscode-v09_37_01_03p02/icaruscode-09_37_01_03p02-samples
export CONFIG_DIR=/projects/HEP_on_HPC/icarus_data/icaruscode-v09_37_01_03p02/icaruscode-09_37_01_03p02-configs
BASEDIR=$(pwd)
export BASEDIR

# Create all the directories!
# One threads_<T>/run_<R> directory per (thread count, run) combination.
# Absolute paths are used so nothing depends on a chain of `cd` calls that
# could fail part-way and create directories in the wrong place. The working
# directory is never changed, so it is still ${BASEDIR} afterwards.
for THREADS in 4; do
  for RUN in 1; do
    mkdir -p -- "${BASEDIR}/threads_${THREADS}/run_${RUN}"
  done
done
# run the benchmark
# Each (run, thread count) combination executes in its own
# threads_<T>/run_<R> directory; client output goes to wrapper_out there.

# Client layout. These values do not depend on the loop variables, so they are
# set once instead of on every iteration.
export NUM_CLIENT_NODES=3
export NUM_CLIENT_TOTAL_RANKS=$((NUM_CLIENT_NODES * 2))
export NUM_CLIENT_HYPERTHREADS=1
export NUM_CLIENT_RANKS_PER_NODE=$((NUM_CLIENT_TOTAL_RANKS / NUM_CLIENT_NODES))
export PMI_NO_FORK=1

# Fall back to the current directory if BASEDIR has not been set.
bench_root=${BASEDIR:-${PWD}}

for RUN in 1; do
  # run icaruswf-bench with varying number of client threads
  for THREADS in 4; do
    export NUM_CLIENT_THREADS_PER_RANK=${THREADS}

    run_dir="${bench_root}/threads_${THREADS}/run_${RUN}"
    # Never launch from the wrong directory: skip this combination when its
    # directory cannot be created or entered.
    if ! mkdir -p -- "${run_dir}" || ! cd "${run_dir}"; then
      echo "ERROR: cannot enter ${run_dir}; skipping run ${RUN} with ${THREADS} threads" >&2
      continue
    fi

    # A local copy of the connection file is placed in the run directory
    # before launching; without it the run is skipped.
    if ! cp -- "${CONNECTIONFILE}" .; then
      echo "ERROR: cannot copy connection file into ${run_dir}; skipping run ${RUN} with ${THREADS} threads" >&2
      cd "${bench_root}" || break 2
      continue
    fi

    echo "%%% before icaruswf-sigproc-hitfind with $THREADS threads, run number $RUN at $(date)"
    aprun -n "${NUM_CLIENT_TOTAL_RANKS}" \
      -N "${NUM_CLIENT_RANKS_PER_NODE}" \
      -d "${NUM_CLIENT_THREADS_PER_RANK}" \
      -j "${NUM_CLIENT_HYPERTHREADS}" \
      -cc none \
      -p "${PDOMAIN}" \
      "${ICARUSWF_BUILD}/src/modules/mpi_wrapper" 1 &> wrapper_out
    echo "%%% after icaruswf-sigproc-hitfind with $THREADS threads, run number $RUN at $(date)"

    # Leave the loops (rather than exit) if the base directory is gone, so the
    # shutdown steps after the loops still run.
    cd "${bench_root}" || break 2
  done
done
# Ask the daemon to shut down, then release the protection domain.
echo "%%% before_start_shutdown $(date)"
if ! aprun -n 1 -N 1 -p "${PDOMAIN}" \
  hepnos-shutdown ofi+gni "${CONNECTIONFILE}"; then
  # Report the failure but continue, so the domain is still released below.
  echo "WARNING: hepnos-shutdown reported a failure; the daemon may still be running" >&2
fi
echo "%%% after_end_shutdown $(date)"

echo "Destroying protection domain"
apmgr pdomain -r -u "${PDOMAIN}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment