Created
June 22, 2021 16:47
-
-
Save wyphan/02da258f98ca1a030e1dca2c220dee87 to your computer and use it in GitHub Desktop.
Summit scripts for Nsight Systems and Nsight Compute
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#BSUB -P MAT201
#BSUB -W 2:00
#BSUB -nnodes 16
#BSUB -alloc_flags "smt4"
#BSUB -J crpa-nio-pm
#BSUB -N [email protected]
#
# LSF batch script: runs ${exe} under jsrun with 6 resource sets per node and
# profiles the selected MPI ranks with Nsight Compute (nv-nsight-cu-cli).
# Afterwards it optionally runs the serial post-processor ${pp}, packs the
# outputs into ${txzfile} and copies the archive to ${destdir}.
#
# Environment read: HOME, PROJWORK, LSB_JOBID, LS_SUBCWD (provided by the
# user environment / LSF), OMPI_COMM_WORLD_RANK (read inside the wrapper).

export exe="elk-nv-prof-acc"
export pp="pp_u4-nv"
export exedir="${HOME}/exciting-plus-gpu/bin"
export cudacheckexe="deviceQuery"
export cudacheckdir="${PROJWORK}/mat201/cuda-samples/NVIDIA_CUDA-10.2_Samples/bin/ppc64le/linux/release"
export jobtitle="crpa-nio-pm"
export txzfile="SUMMIT-PGI-OpenACCsparse-ncu-crpa-NiO-PM-GGA-nk8-ngsh10-ne20.tar.xz"
export logfile="crpa-acc.log"
# Whitespace-separated list of names and glob patterns to archive; it is
# expanded (split + globbed) on purpose when the archive is built.
files="${logfile} env.log modules.log cudaDeviceQuery.log *.in *.OUT q/ *.hdf5 cRPA*.dat *.nsight-cuprof-report"
export destdir="$HOME/exciting+/NiO/PM-GGA-noU"

# Load modules
module unload darshan-runtime
module load job-step-viewer
module load nvhpc/21.3
module load cuda/10.2.89
module load netlib-lapack
module load essl
module load fftw
module load hdf5
module load nvlibs/21.3
module load magma
module load nsight-compute
module list > modules.log 2>&1

# Prepare the job
echo "$(date) Job ${LSB_JOBID} launched from $(hostname)"
# Everything below writes into the current directory, so stop if we cannot
# get into the submission directory.
cd "${LS_SUBCWD}" || { echo "Cannot cd to '${LS_SUBCWD}'" >&2; exit 1; }
echo "Workdir is $(pwd)"
# Without the executable the whole allocation would be wasted: abort early.
cp "${exedir}/${exe}" ./ || { echo "Cannot copy ${exedir}/${exe}" >&2; exit 1; }
# The post-processor is optional (only used if u4_0000.hdf5 is produced).
cp "${exedir}/${pp}" ./ || echo "Warning: cannot copy ${exedir}/${pp}" >&2
env > env.log

# Check CUDA
cp "${cudacheckdir}/${cudacheckexe}" ./
jsrun -n 1 -c 1 -a 1 -g 1 "./${cudacheckexe}" > cudaDeviceQuery.log

# Write elk-wrapper.sh
# The here-document delimiter is unquoted: ${exe} is expanded now, while
# every \${...} is written literally and expanded when the wrapper runs.
# A trailing backslash-newline is consumed by the here-document, so the
# profiler command ends up on a single line in the generated file.
cat << EOF > elk-wrapper.sh
#!/bin/bash
export exe="${exe}"
export pfx="crpa-NiO-PM"

# Ranks to profile
ranks=( 0 )

# Only profile selected ranks; all other ranks run the executable directly.
# [*] joins the array into one space-separated string for the substring test.
if [[ " \${ranks[*]} " =~ " \${OMPI_COMM_WORLD_RANK} " ]]; then
  nv-nsight-cu-cli \
    --target-processes all --print-summary=per-gpu \
    --metrics "regex:smsp__sass_thread_inst_executed_op_d" \
    --kernel-id "::regex:*fillbatch*:1" \
    -f -o "\${pfx}_\${OMPI_COMM_WORLD_RANK}" ./\${exe}
else
  ./\${exe}
fi
EOF
chmod +x ./elk-wrapper.sh

# Make sure this matches the bsub alloc_flags!
# Number of OpenMP threads per physical core
export smtlv=4

# Number of GPU per resource set
export gpures=1

echo "$(date) Launching ${exe} with 6 resource sets per node (3 per socket)"
echo "Each resource set contains 1 rank, ${smtlv} threads, ${gpures} GPU"
jsrun --smpiargs "-disable_gpu_hooks" -r 6 -K 3 -c 7 -a 1 -g "${gpures}" \
  -E OMP_NUM_THREADS="${smtlv}" -E OMP_STACKSIZE=2G \
  ./elk-wrapper.sh
echo "$(date) Done"

if [[ -e u4_0000.hdf5 ]]; then
  # Post-process with pp_u4 (serial code)
  # -n 1 requests exactly one resource set (as for deviceQuery above);
  # -r 1 would request one resource set per host and so start one copy of
  # the serial code on every node of the allocation.
  echo "$(date) Post-processing started"
  jsrun -n 1 -c 1 -a 1 -g 0 "./${pp}"
  echo "$(date) Post-processing done"
fi

# Compress outputs and send to home folder
cp "${jobtitle}.${LSB_JOBID}" "${logfile}"
export XZ_DEFAULTS="-T 0"

# Expand the list and keep only what exists: an unmatched pattern stays a
# literal word, which tar would otherwise report as a missing file.
archive_members=()
# shellcheck disable=SC2086 # word splitting and globbing are intended here
for candidate in ${files}; do
  if [[ -e "${candidate}" ]]; then
    archive_members+=( "${candidate}" )
  else
    echo "Not archived (no match): ${candidate}" >&2
  fi
done

if (( ${#archive_members[@]} == 0 )); then
  echo "No output files found, nothing to archive" >&2
elif tar cJf "${txzfile}" "${archive_members[@]}"; then
  mkdir -p "${destdir}" && cp "${txzfile}" "${destdir}/"
else
  echo "tar failed, ${txzfile} not copied to ${destdir}" >&2
fi
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#BSUB -P MAT201
#BSUB -W 2:00
#BSUB -nnodes 48
#BSUB -alloc_flags "smt4 gpumps"
#BSUB -J crpa-la2cuo4-tetra-pm
#BSUB -N [email protected]
#
# LSF batch script: runs ${exe} under jsrun with 6 resource sets per node and
# profiles the selected MPI ranks with Nsight Systems (nsys).
# Afterwards it optionally runs the serial post-processor ${pp}, packs the
# outputs into ${txzfile} and copies the archive to ${destdir}.
#
# Environment read: HOME, PROJWORK, LSB_JOBID, LS_SUBCWD,
# OLCF_NSIGHT_SYSTEMS_ROOT (expected to be set once nsight-systems is
# loaded), OMPI_COMM_WORLD_RANK (read inside the wrapper).

export exe="elk-pgi-prof-acc"
export pp="pp_u4-pgi"
export exedir="${HOME}/exciting-plus-gpu/bin"
export cudacheckexe="deviceQuery"
export cudacheckdir="${PROJWORK}/mat201/cuda-samples/NVIDIA_CUDA-10.2_Samples/bin/ppc64le/linux/release"
export jobtitle="crpa-la2cuo4-tetra-pm"
export txzfile="SUMMIT-PGI-OpenACCsparse-nsys-crpa-La2CuO4-tetra-PM-GGA-nk6-gordonbell.tar.xz"
export logfile="crpa-acc.log"
# Whitespace-separated list of names and glob patterns to archive; it is
# expanded (split + globbed) on purpose when the archive is built.
files="${logfile} env.log modules.log cudaDeviceQuery.log *.in *.OUT q/ *.hdf5 cRPA*.dat *.qdrep"
export destdir="$HOME/exciting+/La2CuO4-tetra/PM-LSDA-noU"

# Load modules
module load job-step-viewer
module load pgi/20.4
module load cuda/10.2.89
module load netlib-lapack
module load essl
module load fftw
module load hdf5
module load pgilibs/20.4
module load magma
module load nsight-systems
module list > modules.log 2>&1

# Prepare the job
echo "$(date) Job ${LSB_JOBID} launched from $(hostname)"
# Everything below writes into the current directory, so stop if we cannot
# get into the submission directory.
cd "${LS_SUBCWD}" || { echo "Cannot cd to '${LS_SUBCWD}'" >&2; exit 1; }
echo "Workdir is $(pwd)"
# Without the executable the whole allocation would be wasted: abort early.
cp "${exedir}/${exe}" ./ || { echo "Cannot copy ${exedir}/${exe}" >&2; exit 1; }
# The post-processor is optional (only used if u4_0000.hdf5 is produced).
cp "${exedir}/${pp}" ./ || echo "Warning: cannot copy ${exedir}/${pp}" >&2
env > env.log

# Check CUDA
cp "${cudacheckdir}/${cudacheckexe}" ./
jsrun -n 1 -c 1 -a 1 -g 1 "./${cudacheckexe}" > cudaDeviceQuery.log

# Write elk-wrapper.sh
# The here-document delimiter is unquoted: ${exe} and
# ${OLCF_NSIGHT_SYSTEMS_ROOT} are expanded now, while every \${...} is
# written literally and expanded when the wrapper runs.
cat << EOF > elk-wrapper.sh
#!/bin/bash
export exe="${exe}"
export pfx="crpa-La2CuO4-PM"

# Use installed NSight Systems
export NSYSDIR="${OLCF_NSIGHT_SYSTEMS_ROOT}/target-linux-ppc64le"
export nsys=\${NSYSDIR}/nsys

# Ranks to profile
ranks=( 0 )

# Only profile selected ranks; all other ranks run the executable directly.
# [*] joins the array into one space-separated string for the substring test.
if [[ " \${ranks[*]} " =~ " \${OMPI_COMM_WORLD_RANK} " ]]; then
  \${nsys} profile --stats=true --sample=none -t openacc,nvtx -f true -o "\${pfx}_%q{OMPI_COMM_WORLD_RANK}" ./\${exe}
else
  ./\${exe}
fi
EOF
chmod +x ./elk-wrapper.sh

# Make sure this matches the bsub alloc_flags!
# Number of OpenMP threads per physical core
export smtlv=4

# Number of GPU per resource set
export gpures=1

echo "$(date) Launching ${exe} with 6 resource sets per node (3 per socket)"
echo "Each resource set contains 1 rank, ${smtlv} threads, ${gpures} GPU"
# Alternative launch line kept for reference:
#jsrun --smpiargs="-gpu" -r 6 -K 3 -c 7 -a 1 -g ${gpures} \
jsrun -r 6 -K 3 -c 7 -a 1 -g "${gpures}" \
  -E OMP_NUM_THREADS="${smtlv}" -E OMP_STACKSIZE=2G \
  ./elk-wrapper.sh
echo "$(date) Done"

if [[ -e u4_0000.hdf5 ]]; then
  # Post-process with pp_u4 (serial code)
  # -n 1 requests exactly one resource set (as for deviceQuery above);
  # -r 1 would request one resource set per host and so start one copy of
  # the serial code on every node of the allocation.
  echo "$(date) Post-processing started"
  jsrun -n 1 -c 1 -a 1 -g 0 "./${pp}"
  echo "$(date) Post-processing done"
fi

# Compress outputs and send to home folder
cp "${jobtitle}.${LSB_JOBID}" "${logfile}"
export XZ_DEFAULTS="-T 0"

# Expand the list and keep only what exists: an unmatched pattern stays a
# literal word, which tar would otherwise report as a missing file.
archive_members=()
# shellcheck disable=SC2086 # word splitting and globbing are intended here
for candidate in ${files}; do
  if [[ -e "${candidate}" ]]; then
    archive_members+=( "${candidate}" )
  else
    echo "Not archived (no match): ${candidate}" >&2
  fi
done

if (( ${#archive_members[@]} == 0 )); then
  echo "No output files found, nothing to archive" >&2
elif tar cJf "${txzfile}" "${archive_members[@]}"; then
  mkdir -p "${destdir}" && cp "${txzfile}" "${destdir}/"
else
  echo "tar failed, ${txzfile} not copied to ${destdir}" >&2
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment