Skip to content

Instantly share code, notes, and snippets.

@fbouynot
Last active June 28, 2024 15:59
Show Gist options
  • Save fbouynot/2715180632dd6efe97339f51747a196c to your computer and use it in GitHub Desktop.
Save fbouynot/2715180632dd6efe97339f51747a196c to your computer and use it in GitHub Desktop.
#!/usr/bin/env bash
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version. Please see LICENSE.txt at the top level of
# the source code distribution for details.
# Strict mode: stop on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Private scratch directory holding the per-GPU nvidia-smi reports and status
# files written further down; the EXIT trap removes it when the script ends.
TMP_DIR=$(mktemp -d)
trap 'rm -rf -- "$TMP_DIR"' EXIT
#######################################
# Print one field value from a saved `nvidia-smi -q` report.
# The value is the text after the last colon on the LAST line that contains
# the label (the report lists each counter more than once; like the former
# `tail -n 1`, the final occurrence wins). Taking the text after the colon,
# rather than a fixed column, keeps the parse independent of how many words
# the label has.
# Arguments: $1 - label text to search for
#            $2 - path of the report file
# Outputs:   the trimmed value on stdout; an empty line when no line matches
#######################################
_gpu_report_value() {
  awk -v label="${1}" '
    index($0, label) && index($0, ":") {
      value = $0
      sub(/^.*:[ \t]*/, "", value)
      sub(/[ \t\r]+$/, "", value)
    }
    END { print value }
  ' "${2}"
}

#######################################
# Normalise a counter read from the report.
# Arguments: $1 - raw value
# Outputs:   $1 when it is a non-negative decimal integer, otherwise 0
#            (covers empty/absent fields and placeholders such as "N/A")
#######################################
_gpu_count_or_zero() {
  if [[ "${1}" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "${1}"
  else
    printf '0\n'
  fi
}

#######################################
# Query the ECC and row-remapper state of one GPU and print a status code.
# Globals:   TMP_DIR (read) - directory receiving the nvidia-smi report
# Arguments: $1 - GPU index passed to `nvidia-smi -i`
# Outputs:   exactly one line on stdout:
#              2 - DRAM uncorrectable errors > 0 AND a remapping failure
#                  occurred, or SRAM uncorrectable errors >= 5
#              1 - SRAM correctable errors >= 100
#              0 - none of the above
# Returns:   0 when a status was printed; 1 (nothing on stdout, diagnostic on
#            stderr) when the nvidia-smi query itself fails
#######################################
check_gpu() {
  local -r report="${TMP_DIR}/nvidiasmi-${1}.txt"
  local -r sram_uncorrectable_limit=5
  local -r sram_correctable_limit=100
  local sramc sramu dramu rr

  if ! nvidia-smi -q -i "${1}" -d ECC,ROW_REMAPPER -f "${report}"; then
    printf 'check_gpu: nvidia-smi query failed for GPU %s\n' "${1}" >&2
    return 1
  fi

  # Absent or non-numeric counters count as 0 so that a report lacking a
  # field cannot abort the check under `set -e`/`pipefail`, nor trigger
  # "integer expression expected" noise in the comparisons below.
  sramc=$(_gpu_count_or_zero "$(_gpu_report_value 'SRAM Correctable' "${report}")")
  sramu=$(_gpu_count_or_zero "$(_gpu_report_value 'SRAM Uncorrectable' "${report}")")
  dramu=$(_gpu_count_or_zero "$(_gpu_report_value 'DRAM Uncorrectable' "${report}")")
  rr=$(_gpu_report_value 'Remapping Failure Occurred' "${report}")

  # 10# forces base 10 so a value with leading zeros is never read as octal.

  # DRAM uncorrectable error together with a row-remapping failure
  if ((10#${dramu} > 0)) && [[ "${rr}" == 'Yes' ]]; then
    echo 2
    return 0
  fi

  # SRAM uncorrectable errors at or above the limit
  if ((10#${sramu} >= sram_uncorrectable_limit)); then
    echo 2
    return 0
  fi

  # SRAM correctable errors at or above the limit
  if ((10#${sramc} >= sram_correctable_limit)); then
    echo 1
    return 0
  fi

  echo 0
  return 0
}
#######################################
# Check every GPU in parallel and print the highest status code found.
# Globals:   TMP_DIR (read) - directory receiving one retcode-<index> file
#            per GPU
# Arguments: none used
# Outputs:   the numerically largest status printed by check_gpu, on stdout;
#            diagnostics on stderr
# Returns:   does not return: exits 0 after printing the status, or exits 1
#            without printing a status when the GPUs cannot be listed, none
#            is found, or any per-GPU check fails
#######################################
main() {
  local gpucount i failed=0
  local -a pids=()

  # One output line per GPU, so the line count is the number of GPUs.
  gpucount=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) || {
    printf 'main: unable to list GPUs with nvidia-smi\n' >&2
    exit 1
  }
  if ((gpucount == 0)); then
    printf 'main: nvidia-smi reported no GPU\n' >&2
    exit 1
  fi

  # Launch one background check per GPU, each writing its status to its own
  # file, and remember the job PIDs so their exit statuses can be collected.
  for ((i = 0; i < gpucount; i++)); do
    check_gpu "${i}" > "${TMP_DIR}/retcode-${i}" &
    pids+=("$!")
  done

  # A bare `wait` would discard job failures; wait on each PID instead so a
  # check that died cannot be silently reported as healthy.
  for i in "${!pids[@]}"; do
    if ! wait "${pids[i]}"; then
      printf 'main: health check failed for GPU %s\n' "${i}" >&2
      failed=1
    fi
  done
  if ((failed != 0)); then
    exit 1
  fi

  # Highest status across all GPUs.
  sort -nr -- "${TMP_DIR}"/retcode-* | head -n 1
  exit 0
}
# Script entry point: pass every command-line argument through to main.
main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment