Skip to content

Instantly share code, notes, and snippets.

@LouisFaure
Created May 23, 2025 16:16
Show Gist options
  • Save LouisFaure/db4e108c026a579f16d2ab5c85f30656 to your computer and use it in GitHub Desktop.
Save LouisFaure/db4e108c026a579f16d2ab5c85f30656 to your computer and use it in GitHub Desktop.
Check the available resources on a SLURM node
#!/bin/bash
# This script parses the output of 'scontrol show node <node_name>'
# to determine the available CPU, memory, and GPUs for a new task.
#
# Usage: <script> <node_name>
# Exits with status 1 when no node name is given or when scontrol fails.

# A node name is required as the first argument.
if [[ -z "${1:-}" ]]; then
  # Usage text for a bad invocation is a diagnostic, so it goes to stderr.
  printf 'Usage: %s <node_name>\n' "$0" >&2
  printf 'Example: %s iscg009\n' "$0" >&2
  exit 1
fi

NODE_NAME="$1"
echo "Fetching scontrol data for node: $NODE_NAME..."

# Run scontrol, capturing stdout and stderr together so its own error text can
# be shown to the user. Testing the assignment directly ties the check to
# scontrol's exit status instead of a separate "$?" lookup.
if ! SCONTROL_OUTPUT=$(scontrol show node "$NODE_NAME" 2>&1); then
  {
    echo "Error running 'scontrol show node $NODE_NAME'. Please ensure the node name is correct and you have appropriate permissions."
    # printf expands "\n"; a plain bash echo would print the two characters literally.
    printf 'Scontrol output:\n%s\n' "$SCONTROL_OUTPUT"
  } >&2
  exit 1
fi
# --- Parse CPU, memory and GPU information ---
# Sample lines this section reads from 'scontrol show node':
#   CfgTRES=cpu=56,mem=1031308M,billing=56,gres/gpu=8,gres/gpu:h100=8
#   AllocTRES=cpu=33,mem=972G,gres/gpu=6,gres/gpu:h100=6
#   Gres=gpu:h100:8(S:0-1)

# Print the first match of a PCRE pattern inside a text (prints nothing if none).
# Arguments: $1 - text to search
#            $2 - grep -P pattern; use \K to drop the leading key from the match
_first_match() {
  grep -oP -- "$2" <<<"$1" | head -n 1
}

# Print the total GPU count described by a Gres value, or nothing when the
# value lists no GPU count. Accepts "gpu:<count>" and "gpu:<type>:<count>",
# with or without a trailing "(S:...)" part, and adds up several
# comma-separated gpu entries.
# Arguments: $1 - the text following "Gres=", e.g. "gpu:h100:8(S:0-1)"
_gres_gpu_total() {
  local count total=0 seen=0
  # The optional "(?::type)" group keeps digits inside a type name such as
  # "h100" from being mistaken for the count.
  while IFS= read -r count; do
    total=$((total + 10#$count))
    seen=1
  done < <(grep -oP '(?:^|,)gpu(?::[^:,(]+)?:\K[0-9]+' <<<"$1")
  if ((seen)); then
    echo "$total"
  fi
}

# Fill the parsed-resource variables from scontrol text.
# Arguments: $1 - output of 'scontrol show node <node>'
# Sets:      CPU_EFFECTIVE, CPU_ALLOCATED, MEM_CFGTRES_RAW, MEM_ALLOCTRES_RAW,
#            TOTAL_GPUS_CFG, ALLOCATED_GPUS (the GPU values default to 0)
parse_node_resources() {
  local text="$1"

  # CPUs: prefer CPUEfctv; if that field is not printed (NOTE(review): believed
  # to be the case on older Slurm releases), fall back to CPUTot.
  CPU_EFFECTIVE=$(_first_match "$text" 'CPUEfctv=\K[0-9]+')
  if [[ -z "$CPU_EFFECTIVE" ]]; then
    CPU_EFFECTIVE=$(_first_match "$text" 'CPUTot=\K[0-9]+')
  fi
  CPU_ALLOCATED=$(_first_match "$text" 'CPUAlloc=\K[0-9]+')

  # Memory: "(?:\S*,)?" only lets the match start at a TRES key boundary inside
  # the same whitespace-free token. The number is kept together with any
  # single-letter unit suffix (K/M/G/T/P).
  MEM_CFGTRES_RAW=$(_first_match "$text" 'CfgTRES=(?:\S*,)?mem=\K[0-9]+[KMGTP]?')
  MEM_ALLOCTRES_RAW=$(_first_match "$text" 'AllocTRES=(?:\S*,)?mem=\K[0-9]+[KMGTP]?')

  # GPUs: use the generic "gres/gpu=<n>" TRES entry for both totals so they are
  # consistent with the other TRES values ("gres/gpu:<type>=<n>" is ignored).
  TOTAL_GPUS_CFG=$(_first_match "$text" 'CfgTRES=(?:\S*,)?gres/gpu=\K[0-9]+')
  ALLOCATED_GPUS=$(_first_match "$text" 'AllocTRES=(?:\S*,)?gres/gpu=\K[0-9]+')

  # Fallback: with no gres/gpu entry in CfgTRES, derive the total from "Gres=".
  if [[ -z "$TOTAL_GPUS_CFG" ]]; then
    TOTAL_GPUS_CFG=$(_gres_gpu_total "$(_first_match "$text" 'Gres=\K\S*')")
  fi

  # Default to 0 if no GPU information is found.
  TOTAL_GPUS_CFG=${TOTAL_GPUS_CFG:-0}
  ALLOCATED_GPUS=${ALLOCATED_GPUS:-0}
}

parse_node_resources "$SCONTROL_OUTPUT"
# Convert a memory string to whole megabytes.
# Arguments: $1 - digits with an optional unit suffix K, M, G, T or P
#                 (e.g. "972G", "1031308M"); a bare number is taken as MB.
# Outputs:   the size in MB on stdout; on a parse failure prints "0" on stdout
#            and an error message on stderr.
# Returns:   0 on success, 1 if the string could not be parsed.
convert_mem_to_mb() {
  local mem_str="$1"
  local value unit

  # The pattern is anchored so the unit must be the suffix of the whole number;
  # unanchored matching would read a value such as "1T" as plain "1" (MB).
  if [[ "$mem_str" =~ ^([0-9]+)([KMGTP]?)$ ]]; then
    # Force base 10 so leading zeros are not interpreted as octal.
    value=$((10#${BASH_REMATCH[1]}))
    unit="${BASH_REMATCH[2]:-M}" # Assume MB if no unit
  else
    echo "Error: Could not parse memory string '$mem_str'" >&2
    echo "0"
    return 1
  fi

  case "$unit" in
    K) echo "$((value / 1024))" ;; # integer division: rounds down to whole MB
    M) echo "$value" ;;
    G) echo "$((value * 1024))" ;;
    T) echo "$((value * 1024 * 1024))" ;;
    P) echo "$((value * 1024 * 1024 * 1024))" ;;
  esac
}
# --- Convert memory to MB and calculate free resources ---

# Derive the free CPU, memory and GPU figures from the parsed values.
# Reads: CPU_EFFECTIVE, CPU_ALLOCATED, MEM_CFGTRES_RAW, MEM_ALLOCTRES_RAW,
#        TOTAL_GPUS_CFG, ALLOCATED_GPUS
# Sets:  MEM_CFGTRES_MB, MEM_ALLOCTRES_MB, FREE_CPU, FREE_MEMORY_MB,
#        FREE_GPUS, FREE_MEMORY_GB
compute_free_resources() {
  MEM_CFGTRES_MB=$(convert_mem_to_mb "$MEM_CFGTRES_RAW")
  # When no allocated-memory value was parsed (e.g. nothing is allocated on the
  # node), pass 0 instead of an empty string so the converter does not print a
  # spurious parse error for what is simply "no memory in use".
  MEM_ALLOCTRES_MB=$(convert_mem_to_mb "${MEM_ALLOCTRES_RAW:-0}")

  # Each free value stays 0 unless both of its inputs are known.
  FREE_CPU=0
  if [[ -n "$CPU_EFFECTIVE" && -n "$CPU_ALLOCATED" ]]; then
    FREE_CPU=$((CPU_EFFECTIVE - CPU_ALLOCATED))
  fi

  FREE_MEMORY_MB=0
  if [[ -n "$MEM_CFGTRES_MB" && -n "$MEM_ALLOCTRES_MB" ]]; then
    FREE_MEMORY_MB=$((MEM_CFGTRES_MB - MEM_ALLOCTRES_MB))
  fi

  FREE_GPUS=0
  if [[ -n "$TOTAL_GPUS_CFG" && -n "$ALLOCATED_GPUS" ]]; then
    FREE_GPUS=$((TOTAL_GPUS_CFG - ALLOCATED_GPUS))
  fi

  # Convert free memory to GB for readability. The value is handed to awk with
  # -v rather than being pasted into the awk program text.
  FREE_MEMORY_GB=$(awk -v mb="$FREE_MEMORY_MB" 'BEGIN { printf "%.2f", mb / 1024 }')
}

compute_free_resources
# --- Output Results ---

# Format a size given in MB as gigabytes with two decimals and a "G" suffix.
# The value is passed to awk with -v instead of being pasted into the awk
# program text, so an empty or odd value cannot break the awk program.
# Arguments: $1 - size in MB
_mb_as_gb() {
  awk -v mb="$1" 'BEGIN { printf "%.2fG", mb / 1024 }'
}

# Print the availability report for the node to stdout.
# Reads: NODE_NAME, CPU_EFFECTIVE, CPU_ALLOCATED, FREE_CPU, MEM_CFGTRES_MB,
#        MEM_ALLOCTRES_MB, FREE_MEMORY_GB, FREE_MEMORY_MB, TOTAL_GPUS_CFG,
#        ALLOCATED_GPUS, FREE_GPUS
print_resource_report() {
  local rule="--------------------------------------------------"

  printf '%s\n' "$rule"
  printf '%s\n' "Resource Availability for Node: $NODE_NAME"
  printf '%s\n' "$rule"
  printf '%s\n' "Total Effective CPUs: $CPU_EFFECTIVE"
  printf '%s\n' "Allocated CPUs: $CPU_ALLOCATED"
  printf '%s\n' "Free CPUs: $FREE_CPU"
  printf '\n'
  printf '%s\n' "Total Configured Memory: $(_mb_as_gb "$MEM_CFGTRES_MB") ($MEM_CFGTRES_MB MB)"
  printf '%s\n' "Allocated Memory: $(_mb_as_gb "$MEM_ALLOCTRES_MB") ($MEM_ALLOCTRES_MB MB)"
  printf '%s\n' "Free Memory: ${FREE_MEMORY_GB}G ($FREE_MEMORY_MB MB)"
  printf '\n'
  printf '%s\n' "Total Configured GPUs: $TOTAL_GPUS_CFG"
  printf '%s\n' "Allocated GPUs: $ALLOCATED_GPUS"
  printf '%s\n' "Free GPUs: $FREE_GPUS"
  printf '%s\n' "$rule"
}

print_resource_report

# Example usage for checking specific resource needs:
# if (( FREE_CPU >= 1 && FREE_MEMORY_MB >= 4096 && FREE_GPUS >= 1 )); then
#   echo "Node has at least 1 CPU, 4GB memory, and 1 GPU available."
# else
#   echo "Node does NOT meet the minimum requirements of 1 CPU, 4GB memory, and 1 GPU."
# fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment