Skip to content

Instantly share code, notes, and snippets.

@LouisFaure
Created May 30, 2025 16:47
Show Gist options
  • Save LouisFaure/bd6001026d2e207f6b874d09264dc9ea to your computer and use it in GitHub Desktop.
Save LouisFaure/bd6001026d2e207f6b874d09264dc9ea to your computer and use it in GitHub Desktop.
Check all node resources
#!/bin/bash
# --- Configuration ---
# Partition whose nodes are reported. An optional first command-line argument
# overrides the default, e.g. `./script.sh other_partition`.
PARTITION_NAME="${1:-componc_gpu}"
# --- Function to convert memory string (e.g., "972G", "1031308M") to MB ---
# Arguments: $1 - memory string: digits optionally followed by a unit suffix
#                 K, M, G, T or P. A bare number is taken to be MB.
# Outputs:   the amount in whole MB on stdout (K values are rounded down);
#            "0" when $1 contains no digits (e.g. it is empty).
convert_mem_to_mb() {
  local mem_str="$1"
  local value
  local unit
  if [[ "$mem_str" =~ ([0-9]+)([KMGTP]) ]]; then
    value="${BASH_REMATCH[1]}"
    unit="${BASH_REMATCH[2]}"
  elif [[ "$mem_str" =~ ([0-9]+) ]]; then # Assume MB if no unit
    value="${BASH_REMATCH[1]}"
    unit="M"
  else
    # Handle cases where the memory string is empty or has no digits
    echo "0"
    return
  fi
  # Force base 10 so zero-padded input such as "08" is not rejected as octal.
  value=$((10#$value))
  case "$unit" in
    K) echo "$((value / 1024))" ;;
    G) echo "$((value * 1024))" ;;
    T) echo "$((value * 1024 * 1024))" ;;
    P) echo "$((value * 1024 * 1024 * 1024))" ;;
    *) echo "$value" ;; # M (MB)
  esac
}
# --- Helper: print the first capture group of a regex match ---
# Arguments: $1 - text to search
#            $2 - POSIX extended regex with at least one capture group
# Outputs:   the text of capture group 1 on stdout; nothing if there is no match.
_first_match() {
  if [[ "$1" =~ $2 ]]; then
    printf '%s' "${BASH_REMATCH[1]}"
  fi
}

# --- Helper: sum the GPU counts found in a Gres string ---
# Arguments: $1 - Gres value, e.g. "gpu:4", "gpu:a100:4(S:0-1)" or
#                 "gpu:tesla:2,gpu:kepler:2"
# Outputs:   total number of GPUs on stdout (0 when none are listed).
_gres_gpu_total() {
  local gres="$1"
  local paren_re='\([^)]*\)'
  local gpu_re='^gpu(:[^:]+)?:([0-9]+)$'
  local matched entry
  local -a entries=()
  local total=0
  # Remove parenthesised parts such as "(S:0-1)" or "(null)" so that they
  # cannot interfere with the comma split or the count match below.
  while [[ "$gres" =~ $paren_re ]]; do
    matched="${BASH_REMATCH[0]}"
    gres="${gres/"$matched"/}"
  done
  IFS=',' read -r -a entries <<< "$gres"
  for entry in "${entries[@]}"; do
    # The count is the last colon-separated field; an optional type name
    # (e.g. "a100") may sit between "gpu" and the count.
    if [[ "$entry" =~ $gpu_re ]]; then
      total=$((total + 10#${BASH_REMATCH[2]}))
    fi
  done
  echo "$total"
}

# --- Function to fetch and process data for a single node ---
# Arguments: $1 - node name passed to `scontrol show node`.
# Outputs:   one space-separated line on stdout with the columns
#              Node State CPU_T CPU_A CPU_F Mem_T Mem_A Mem_F GPU_T GPU_A GPU_F
#            Memory columns are in GB with two decimals. State and CPU fields
#            that cannot be parsed are printed as N/A so the number of
#            columns never changes.
# Returns:   0, also when scontrol fails (an ERROR row is printed instead).
get_node_data() {
  local node_name="$1"
  local node_info
  if ! node_info=$(scontrol show node "$node_name" 2>&1); then
    # Return a default set of values for error
    # NodeName State CPU_T CPU_A CPU_F Mem_T Mem_A Mem_F GPU_T GPU_A GPU_F
    echo "$node_name ERROR N/A N/A N/A N/A N/A N/A N/A N/A N/A"
    return 0
  fi

  # --- Parse CPU information ---
  # NOTE(review): CPUEfctv may be missing from the output of some Slurm
  # releases; CPUTot is used as a fallback in that case -- confirm.
  local cpu_total cpu_alloc
  cpu_total=$(_first_match "$node_info" 'CPUEfctv=([0-9]+)')
  if [[ -z "$cpu_total" ]]; then
    cpu_total=$(_first_match "$node_info" 'CPUTot=([0-9]+)')
  fi
  cpu_alloc=$(_first_match "$node_info" 'CPUAlloc=([0-9]+)')

  # --- Isolate the configured / allocated TRES lists ---
  # A leading comma is prepended so every key can be matched as ",key=".
  local cfg_tres alloc_tres
  cfg_tres=",$(_first_match "$node_info" 'CfgTRES=([^[:space:]]*)')"
  alloc_tres=",$(_first_match "$node_info" 'AllocTRES=([^[:space:]]*)')"

  # --- Parse Memory information ---
  local mem_total_mb mem_alloc_mb
  mem_total_mb=$(convert_mem_to_mb "$(_first_match "$cfg_tres" ',mem=([0-9]+[KMGTP]?)')")
  mem_alloc_mb=$(convert_mem_to_mb "$(_first_match "$alloc_tres" ',mem=([0-9]+[KMGTP]?)')")
  mem_total_mb="${mem_total_mb:-0}"
  mem_alloc_mb="${mem_alloc_mb:-0}"

  # --- Parse GPU information ---
  local gpu_total gpu_alloc
  gpu_total=$(_first_match "$cfg_tres" ',gres/gpu=([0-9]+)')
  gpu_alloc=$(_first_match "$alloc_tres" ',gres/gpu=([0-9]+)')
  # Fallback to the Gres= field if CfgTRES has no 'gres/gpu' entry
  if [[ -z "$gpu_total" ]]; then
    gpu_total=$(_gres_gpu_total "$(_first_match "$node_info" 'Gres=([^[:space:]]*)')")
  fi
  gpu_total="${gpu_total:-0}"
  gpu_alloc="${gpu_alloc:-0}"

  # --- Calculate Free Resources ---
  # Free CPUs are only meaningful when both counts were parsed.
  local free_cpu="N/A"
  if [[ -n "$cpu_total" && -n "$cpu_alloc" ]]; then
    free_cpu=$((10#$cpu_total - 10#$cpu_alloc))
  fi
  local free_mem_mb=$((mem_total_mb - mem_alloc_mb))
  local free_gpus=$((10#$gpu_total - 10#$gpu_alloc))

  # Determine node state: only the leading alphabetic part is kept, so flags
  # such as "+DRAIN" or "*" are dropped. Requiring whitespace before "State="
  # avoids matching keys that merely end in "State".
  local node_state
  node_state=$(_first_match " $node_info" '[[:space:]]State=([A-Za-z]+)')

  # Format memory in GB for output (values are passed with -v, not spliced
  # into the awk program text).
  local mem_total_gb mem_alloc_gb free_mem_gb
  read -r mem_total_gb mem_alloc_gb free_mem_gb < <(
    awk -v t="$mem_total_mb" -v a="$mem_alloc_mb" -v f="$free_mem_mb" \
      'BEGIN { printf "%.2f %.2f %.2f\n", t / 1024, a / 1024, f / 1024 }'
  )

  # Output the collected data as a space-separated string
  echo "$node_name ${node_state:-N/A} ${cpu_total:-N/A} ${cpu_alloc:-N/A} $free_cpu $mem_total_gb $mem_alloc_gb $free_mem_gb $gpu_total $gpu_alloc $free_gpus"
}
# --- Main Script Logic ---
# Lists the idle/allocated/mixed nodes of $PARTITION_NAME and prints one
# aligned table row per node using get_node_data.
main() {
  # The header must match the number and order of columns printed by get_node_data.
  local header="Node State CPU_T CPU_A CPU_F Mem_T Mem_A Mem_F GPU_T GPU_A GPU_F"
  local -a processed_nodes=()
  local node

  echo "Gathering resource information for all nodes in partition '$PARTITION_NAME'..."
  echo "States considered 'available': IDLE, MIXED, ALLOCATED (DRAINED are excluded)."
  echo ""

  # Get all nodes in the partition that are idle, allocated or mixed.
  # -N makes sinfo print one node name per line, so compressed host lists
  # such as "node[01-03,05]" never have to be expanded by hand and output
  # spanning several lines is handled naturally. sort -u drops duplicates.
  # sinfo errors are silenced on purpose: a failure yields an empty list,
  # which is reported by the message below.
  while IFS= read -r node; do
    if [[ -n "$node" ]]; then
      processed_nodes+=("$node")
    fi
  done < <(sinfo -N -p "$PARTITION_NAME" -t idle,alloc,mixed -h -o "%N" 2>/dev/null | sort -u)

  if [ ${#processed_nodes[@]} -eq 0 ]; then
    echo "No available nodes found in partition '$PARTITION_NAME' (or partition name is incorrect)."
    exit 0
  fi

  # Stream the header and one row per node straight into column -t, which
  # aligns the whitespace-separated fields into a table. No temporary file
  # is needed, so nothing is left behind if the script is interrupted.
  {
    echo "$header"
    for node in "${processed_nodes[@]}"; do
      get_node_data "$node"
    done
  } | column -t

  echo "" # Add a newline after the table
  echo "Script finished."
}
main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment