Created
December 22, 2022 17:07
-
-
Save s-sajid-ali/f54221641e7df1843b470770dea2599d to your computer and use it in GitHub Desktop.
Submit script for the ICARUS workflow on ALCF Theta that results in a crash.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#COBALT -A HEP_on_HPC
#COBALT -n 4
#COBALT -t 01:00:00
#COBALT --mode script
#
# Job configuration: daemon geometry, file locations, and environment
# variables shared by every later step of this submit script.

# Daemon geometry: ranks per node and threads per rank are derived from the
# node count and the hyperthread setting.
export NUM_DAEMON_NODES=1
export NUM_DAEMON_TOTAL_RANKS=$((NUM_DAEMON_NODES * 16))
export NUM_DAEMON_HYPERTHREADS=1
export NUM_DAEMON_RANKS_PER_NODE=$((NUM_DAEMON_TOTAL_RANKS / NUM_DAEMON_NODES))
# NOTE(review): the factor 64 presumably is the core count per node -- confirm.
export NUM_DAEMON_THREADS_PER_RANK=$((NUM_DAEMON_HYPERTHREADS * 64 / NUM_DAEMON_RANKS_PER_NODE))

# Names and paths used by the daemon and the clients.
PDOMAIN=hepnos-sajid
SSGFILE=hepnos.ssg
CONNECTIONFILE="$(pwd)/connection.json"
ICARUSWF_BUILD=/projects/HEP_on_HPC/sajid/icarus_hepnos/icaruswf/build

# Remove leftovers from a previous run; -f keeps a first run (no files yet)
# from printing spurious "No such file" errors.
rm -f -- "${SSGFILE}" "${CONNECTIONFILE}"

export MPICH_GNI_NDREG_ENTRIES=1024
export MPICH_MAX_THREAD_SAFETY=multiple

# Put the icaruswf build ahead of whatever the environment already provides.
export CET_PLUGIN_PATH="${ICARUSWF_BUILD}/src/modules:${CET_PLUGIN_PATH}"
export FHICL_FILE_PATH="${ICARUSWF_BUILD}/fcl:${FHICL_FILE_PATH}"
# Environment setup: swap compiler/MPI modules, make sure the protection
# domain exists, and activate the spack environment that provides the tools.
echo "%%% start $(date)"
module unload darshan
module swap PrgEnv-intel PrgEnv-gnu
module load gcc/9.3.0
module swap cray-mpich/7.7.14 cray-mpich-abi/7.7.14
echo "%%% after_module_loads $(date)"

echo "Setting up protection domain"
# Create the protection domain only when apstat does not already list it.
apstat -P | grep -F -- "${PDOMAIN}" || apmgr pdomain -c -u "${PDOMAIN}"

# activate spack environment
. /projects/HEP_on_HPC/icaruscode/spack/share/spack/setup-env.sh
if ! spack env activate icaruscode-9-37-02-03-vecmt-hepnos-0_7_1; then
  # Without the environment the later steps cannot find their executables,
  # so stop now instead of waiting for files that will never be written.
  echo "ERROR: could not activate the spack environment" >&2
  apmgr pdomain -r -u "${PDOMAIN}"
  exit 1
fi

# options to collect diagnostic profiles
export MARGO_ENABLE_DIAGNOSTICS=0
export MARGO_ENABLE_PROFILING=0
# Start the bedrock daemon in the background and wait until it has written
# the SSG group file that the following steps read.
echo "%%% before_start_daemon $(date)"
aprun -n "${NUM_DAEMON_TOTAL_RANKS}" \
  -N "${NUM_DAEMON_RANKS_PER_NODE}" \
  -d "${NUM_DAEMON_THREADS_PER_RANK}" \
  -j "${NUM_DAEMON_HYPERTHREADS}" \
  -cc none \
  -p "${PDOMAIN}" \
  bedrock ofi+gni -c hepnos.json -v info &> server-log &
daemon_pid=$!
sleep 30
echo "%%% after_start_daemon $(date)"

while [[ ! -f "${SSGFILE}" ]]; do
  # If the background aprun has already exited, the file will never appear;
  # abort instead of polling until the walltime limit.
  if ! kill -0 "${daemon_pid}" 2> /dev/null; then
    echo "ERROR: daemon exited before creating ${SSGFILE}; see server-log" >&2
    apmgr pdomain -r -u "${PDOMAIN}"
    exit 1
  fi
  sleep 10 && echo "waiting for ssgfile"
done
# Ask the running daemon for its databases, store the answer as the client
# connection file, then create the DetSim and HitFinding queues.
echo "%%% before_start_list_dbs $(date)"
aprun -n 1 -N 1 \
  -p "${PDOMAIN}" \
  hepnos-list-databases ofi+gni -s "${SSGFILE}" > "${CONNECTIONFILE}"
sleep 2
echo "%%% after_start_list_dbs $(date)"
while [[ ! -f "${CONNECTIONFILE}" ]]; do sleep 10; done
sed -i '$ d' "${CONNECTIONFILE}" # we have to because aprun adds a line
if [[ ! -s "${CONNECTIONFILE}" ]]; then
  # An empty file means the listing step produced no usable output.
  echo "WARNING: ${CONNECTIONFILE} is empty; later steps will likely fail" >&2
fi

# set relevant env var
export ICARUSWF_BUILD=/projects/HEP_on_HPC/sajid/icarus_hepnos/icaruswf/build

# Create Queues
# Uses ${CONNECTIONFILE} like every other step rather than a relative name.
aprun -n 1 -N 1 -p "${PDOMAIN}" \
  "${ICARUSWF_BUILD}/src/modules/cheesyQueue_maker" ofi+gni "${CONNECTIONFILE}" \
  DetSim HitFinding &> queue-creation-log
# Input locations and the per-configuration output directory tree
# (threads_<T>/run_<R> below the submission directory).
export DATA_DIR=/projects/HEP_on_HPC/icarus_data/icaruscode-v09_37_01_03p02/icaruscode-09_37_01_03p02-samples
export CONFIG_DIR=/projects/HEP_on_HPC/icarus_data/icaruscode-v09_37_01_03p02/icaruscode-09_37_01_03p02-configs

# Assign first, export second, so a failing pwd is not masked by export.
BASEDIR=$(pwd)
export BASEDIR

# Create all the directories!
# Full paths are handed to mkdir -p, so the working directory never changes
# and stays at ${BASEDIR} afterwards.
for THREADS in 4; do
  for RUN in 1; do
    mkdir -p -- "${BASEDIR}/threads_${THREADS}/run_${RUN}"
  done
done
# run the benchmark
# Client geometry is the same for every iteration, so it is set once here;
# only the thread count per rank changes inside the loop.
export NUM_CLIENT_NODES=3
export NUM_CLIENT_TOTAL_RANKS=$((NUM_CLIENT_NODES * 2))
export NUM_CLIENT_HYPERTHREADS=1
export NUM_CLIENT_RANKS_PER_NODE=$((NUM_CLIENT_TOTAL_RANKS / NUM_CLIENT_NODES))
export PMI_NO_FORK=1

for RUN in 1; do
  # run icaruswf-bench with varying number of client threads
  for THREADS in 4; do
    run_dir="${BASEDIR}/threads_${THREADS}/run_${RUN}"
    # Skip this configuration when its directory is unavailable, so output
    # never lands in the wrong place and the daemon shutdown is still reached.
    if ! cd -- "${run_dir}"; then
      echo "ERROR: cannot enter ${run_dir}; skipping this configuration" >&2
      continue
    fi
    cp -- "${CONNECTIONFILE}" .
    export NUM_CLIENT_THREADS_PER_RANK=${THREADS}

    echo "%%% before icaruswf-sigproc-hitfind with $THREADS threads, run number $RUN at $(date)"
    aprun -n "${NUM_CLIENT_TOTAL_RANKS}" \
      -N "${NUM_CLIENT_RANKS_PER_NODE}" \
      -d "${NUM_CLIENT_THREADS_PER_RANK}" \
      -j "${NUM_CLIENT_HYPERTHREADS}" \
      -cc none \
      -p "${PDOMAIN}" \
      "${ICARUSWF_BUILD}/src/modules/mpi_wrapper" 1 &> wrapper_out
    echo "%%% after icaruswf-sigproc-hitfind with $THREADS threads, run number $RUN at $(date)"

    cd -- "${BASEDIR}" || echo "WARNING: could not return to ${BASEDIR}" >&2
  done
done
# Tear-down: ask the daemon to shut down through the connection file, then
# release the protection domain.
echo "%%% before_start_shutdown $(date)"
aprun -n 1 \
  -N 1 \
  -p "${PDOMAIN}" \
  hepnos-shutdown ofi+gni "${CONNECTIONFILE}"
echo "%%% after_end_shutdown $(date)"

echo "Destroying protection domain"
apmgr pdomain -r -u "${PDOMAIN}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment