Skip to content

Instantly share code, notes, and snippets.

@cjbarker
Last active May 25, 2026 22:16
Show Gist options
  • Select an option

  • Save cjbarker/57cbf5bb259e4c4a4f2877a89282bfbc to your computer and use it in GitHub Desktop.

Select an option

Save cjbarker/57cbf5bb259e4c4a4f2877a89282bfbc to your computer and use it in GitHub Desktop.
watchdog-llama.sh
#!/bin/bash
# Llama.cpp Process Watchdog Script
# This script monitors a Llama.cpp server process and checks its health status.
# If the service fails to respond to API queries, it can optionally restart the process.
#
# Usage:
# ./watchdog-llama.sh [options] [process_name]
# Arguments:
# - process_name: process to monitor (matched against the full command line);
#   if not provided, it defaults to "/home/cj/tmp-llama/llama-server"
# Options:
# --verbose: Enable verbose output to STDOUT in addition to log file (also sets the log level to DEBUG)
# --loop: Run continuously (default: run once and exit)
# --log-level=LEVEL or --log-level LEVEL: Set log level to DEBUG, INFO, WARN or ERROR (default: WARN)
# Strict mode: abort on errors, on unset variables and on failing pipeline stages.
set -o errexit -o nounset -o pipefail

# --- Runtime flags (defaults; the command-line parser may override them) ---
LOOP_MODE="false"
VERBOSE="false"
LOG_LEVEL="WARN" # Default log level: WARN (equivalent to level 3)

# --- Watchdog settings ---
WATCHDOG_NAME="llama-watchdog"
LOG_FILE="${HOME}/.local/log/llama-watchdog.log"
RESTART_ENABLED="true"
MAX_RETRIES="3"
CHECK_INTERVAL="30" # seconds between checks in loop mode

# --- Llama.cpp / llama-swap settings ---
LLAMA_PID_FILE="/tmp/llama.pid" # Optional: where the pid file is stored
LLAMA_SWAP_URL="http://localhost:8080"
LLAMA_SWAP_MODEL_ID="Qwen3.6-35B"
# Parse command line arguments.
# Globals:   VERBOSE, LOOP_MODE, LOG_LEVEL, PROCESS_NAME (written)
# Arguments: the script's command-line arguments
# Returns:   0 on success; exits with status 2 on a malformed command line
#
# Notes:
# - --verbose also forces LOG_LEVEL to DEBUG.
# - The first non-option argument becomes PROCESS_NAME unless PROCESS_NAME is
#   already non-empty (for example inherited from the environment); any
#   further positional arguments are ignored.
# - Options are rejected when unknown instead of being silently used as the
#   process name, and --log-level without a value produces a clear error
#   rather than an "unbound variable" abort under `set -u`.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --verbose)
        VERBOSE=true
        LOG_LEVEL="DEBUG" # When verbose is enabled, set log level to DEBUG
        shift
        ;;
      --loop)
        LOOP_MODE=true
        shift
        ;;
      --log-level=*)
        LOG_LEVEL="${1#*=}"
        shift
        ;;
      --log-level)
        if [[ $# -lt 2 ]]; then
          printf '%s\n' "Error: --log-level requires a value (DEBUG|INFO|WARN|ERROR)" >&2
          exit 2
        fi
        LOG_LEVEL="$2"
        shift 2
        ;;
      -*)
        # The logger is not available yet at this point, so report on stderr.
        printf 'Error: unknown option: %s\n' "$1" >&2
        exit 2
        ;;
      *)
        # First non-flag argument is the process name
        if [[ -z "${PROCESS_NAME:-}" ]]; then
          PROCESS_NAME="$1"
        fi
        shift
        ;;
    esac
  done

  # Set default process name if not provided
  if [[ -z "${PROCESS_NAME:-}" ]]; then
    PROCESS_NAME="/home/cj/tmp-llama/llama-server"
  fi
}
parse_args "$@"
# Numeric severities used for log filtering (larger means more severe)
ERROR_LEVEL=4
WARN_LEVEL=3
INFO_LEVEL=2
DEBUG_LEVEL=1

# Translate a log level name into its numeric severity.
# Arguments: $1 - level name (DEBUG, INFO, WARN or ERROR; exact, upper-case)
# Outputs:   the numeric severity on stdout; names that are not recognized
#            yield the WARN severity
get_log_level_value() {
  local level_name=$1
  local numeric=$WARN_LEVEL # fallback for anything unrecognized

  if [[ "$level_name" == "DEBUG" ]]; then
    numeric=$DEBUG_LEVEL
  elif [[ "$level_name" == "INFO" ]]; then
    numeric=$INFO_LEVEL
  elif [[ "$level_name" == "ERROR" ]]; then
    numeric=$ERROR_LEVEL
  fi

  echo "$numeric"
}
# Logging function with level filtering.
# Globals:   LOG_LEVEL, LOG_FILE, VERBOSE, DEBUG_LEVEL, INFO_LEVEL,
#            WARN_LEVEL, ERROR_LEVEL (read)
# Arguments: $1 - message level (DEBUG|INFO|WARN|ERROR; anything else is
#                 treated as INFO)
#            $2 - message text
# Outputs:   appends "[timestamp] [level] message" to LOG_FILE; when VERBOSE
#            is true the line is also written to stdout. If the log file
#            cannot be written, the line goes to stderr instead.
# Returns:   0, also when the message is filtered out
log() {
  local level="$1"
  local message="$2"

  # Numeric value of the configured threshold (declared separately so the
  # command substitution's status is not masked by `local`).
  local current_level_value
  current_level_value=$(get_log_level_value "$LOG_LEVEL")

  # Numeric value of this message's level
  local message_level_value
  case "$level" in
    "DEBUG") message_level_value=$DEBUG_LEVEL ;;
    "INFO") message_level_value=$INFO_LEVEL ;;
    "WARN") message_level_value=$WARN_LEVEL ;;
    "ERROR") message_level_value=$ERROR_LEVEL ;;
    *) message_level_value=$INFO_LEVEL ;; # Default to INFO for unknown levels
  esac

  # Only log if message level is >= current log level
  if ((message_level_value < current_level_value)); then
    return 0
  fi

  local formatted_message
  formatted_message="[$(date '+%Y-%m-%d %H:%M:%S')] [$level] $message"

  # Create the log directory on demand; without it the very first write
  # fails and, under `set -e`, takes the whole watchdog down.
  local log_dir
  log_dir=$(dirname -- "$LOG_FILE")
  if [[ ! -d "$log_dir" ]]; then
    mkdir -p -- "$log_dir" 2>/dev/null || true
  fi

  if [[ "$VERBOSE" == true ]]; then
    # tee still prints to stdout when the file is unwritable; a logging
    # problem must not abort the caller.
    printf '%s\n' "$formatted_message" | tee -a "$LOG_FILE" || true
  else
    printf '%s\n' "$formatted_message" >>"$LOG_FILE" \
      || printf '%s\n' "$formatted_message" >&2 \
      || true
  fi
  return 0
}
# Handler invoked on shutdown signals: records the event in the log and
# terminates the script with exit status 0.
cleanup() {
  local farewell="Received shutdown signal. Cleaning up..."
  log "INFO" "$farewell"
  exit 0
}
# Route interrupt and termination signals through the cleanup handler
trap cleanup INT TERM

# curl performs every HTTP probe below; refuse to start without it
command -v curl >/dev/null 2>&1 || {
  log "ERROR" "curl is not installed. Please install curl to use this watchdog."
  exit 1
}
# Report whether a PID refers to a process this user can signal.
# Arguments: $1 - process ID to probe
# Returns:   0 when `kill -0` succeeds for the PID, 1 when the PID is empty
#            or the probe fails
is_process_running() {
  local candidate=$1

  # An empty PID can never be alive
  [ -n "$candidate" ] || return 1

  # Signal 0 only tests whether the process can be signalled
  kill -0 "$candidate" 2>/dev/null || return 1

  return 0
}
# Function to find Llama.cpp processes.
# Globals:   PROCESS_NAME (read)
# Outputs:   one PID per line on stdout for every process whose full command
#            line matches PROCESS_NAME (pgrep -f pattern); the debug log line
#            is redirected to stderr so that verbose logging cannot pollute
#            the PID list captured by callers
# Returns:   0 when at least one PID was printed, 1 otherwise
#
# When the process name is passed on the watchdog's own command line,
# `pgrep -f` also matches the watchdog and any forked copy of it (such as the
# subshell of a command substitution). The main shell ($$) and the current
# subshell ($BASHPID) are therefore skipped, and so is every candidate that no
# longer answers `kill -0` by the time the list is filtered (short-lived
# helper children of this function).
find_llama_processes() {
  log "DEBUG" "Finding process running: $PROCESS_NAME" >&2

  local candidates
  # `--` keeps a pattern starting with "-" from being parsed as an option.
  # pgrep exits non-zero when nothing matches; that is not an error here.
  candidates=$(pgrep -f -- "$PROCESS_NAME" 2>/dev/null) || true

  local pid
  local status=1
  for pid in $candidates; do
    if [[ "$pid" == "$$" || "$pid" == "$BASHPID" ]]; then
      continue # the watchdog itself
    fi
    kill -0 "$pid" 2>/dev/null || continue # already gone
    printf '%s\n' "$pid"
    status=0
  done
  return "$status"
}
# Function to test API health.
# Globals:   MAX_RETRIES, LLAMA_SWAP_MODEL_ID (read)
# Arguments: $1 - base URL of the server (default: http://localhost:8080)
# Returns:   0 when every endpoint passes, 1 otherwise
#
# Endpoints probed, in order, and what counts as healthy:
#   /health               GET  - HTTP 200 or 503
#   /v1/models            GET  - any HTTP response at all
#   /v1/chat/completions  POST - HTTP 200 only
# A probe that yields no HTTP response (status "000": connection failure or
# timeout) is retried up to MAX_RETRIES times, sleeping 2^attempt seconds
# between attempts; exhausting the retries marks the endpoint unhealthy.
test_api_health() {
  local base_url="${1:-http://localhost:8080}"
  local -a endpoints=("/health" "/v1/models" "/v1/chat/completions")
  local chat_payload="{\"model\": \"${LLAMA_SWAP_MODEL_ID}\", \"messages\": [{\"role\":\"system\", \"content\":\"You are a helpful assistant.\"},{\"role\":\"user\",\"content\":\"hello!\"}]}"
  local all_healthy=true
  local endpoint full_url response_code retries verdict
  local -a curl_args

  log "INFO" "Testing API health at $base_url"

  for endpoint in "${endpoints[@]}"; do
    full_url="${base_url}${endpoint}"
    log "DEBUG" "Checking endpoint: $full_url"

    # --max-time keeps a hung server from blocking the watchdog
    curl_args=(-s -o /dev/null -w "%{http_code}" --max-time 10)
    if [[ "$endpoint" == "/v1/chat/completions" ]]; then
      # Chat completions needs a POST with a JSON body
      curl_args+=(-X POST -H "Content-Type: application/json" -d "$chat_payload")
    fi

    retries=0
    # Try up to MAX_RETRIES times with exponential backoff
    while ((retries < MAX_RETRIES)); do
      # curl prints "000" through -w even when the transfer fails, so the
      # fallback must REPLACE the captured value; appending a second "000"
      # would produce "000000", which none of the checks below recognise.
      response_code=$(curl "${curl_args[@]}" "$full_url" 2>/dev/null) || response_code="000"
      [[ "$response_code" =~ ^[0-9]{3}$ ]] || response_code="000"

      case "${endpoint}:${response_code}" in
        *:000)
          verdict="retry"
          ;;
        /health:200 | /health:503 | /v1/chat/completions:200)
          verdict="healthy"
          ;;
        /health:* | /v1/chat/completions:*)
          verdict="unhealthy"
          ;;
        *)
          # Other endpoints: any HTTP response (even 4xx/5xx) shows the
          # server is answering.
          verdict="healthy"
          ;;
      esac

      case "$verdict" in
        healthy)
          log "DEBUG" "Endpoint $endpoint returned HTTP $response_code"
          break # move to next endpoint
          ;;
        unhealthy)
          if [[ "$endpoint" == "/health" ]]; then
            log "WARN" "Endpoint $endpoint returned HTTP $response_code (only 200 and 503 are acceptable for health)"
          else
            log "ERROR" "Endpoint $endpoint returned HTTP $response_code (expected 200)"
          fi
          all_healthy=false
          break # move to next endpoint
          ;;
        retry)
          # Connection timeout or network error
          log "WARN" "Connection failed to $endpoint (HTTP $response_code)"
          retries=$((retries + 1))
          if ((retries < MAX_RETRIES)); then
            sleep $((2 ** retries)) # Exponential backoff
          fi
          ;;
      esac
    done

    # If we exhausted retries for this endpoint, mark as unhealthy
    if ((retries >= MAX_RETRIES)); then
      log "ERROR" "Failed to reach endpoint $endpoint after $MAX_RETRIES attempts"
      all_healthy=false
    fi
  done

  # Return the overall health status
  if [[ "$all_healthy" == true ]]; then
    log "INFO" "All health check endpoints passed"
    return 0
  fi
  log "WARN" "One or more health check endpoints failed"
  return 1
}
# Function to reload model via llama-swap API (unload then load).
# Globals:   LLAMA_SWAP_URL, LLAMA_SWAP_MODEL_ID (read)
# Returns:   0 when the model was (re)loaded, 1 when unloading or loading failed
#
# Steps:
#   1. GET  <url>/upstream/<model>/v1/models   - a 2xx/3xx answer is taken to
#      mean the model is currently loaded
#   2. POST <url>/api/models/unload/<model>    - only when step 1 says loaded;
#      must answer 200
#   3. GET  <url>/upstream/<model>/v1/models   - loads the model; must answer
#      2xx/3xx
# Every status is normalised to exactly three digits. curl already prints
# "000" through -w when a transfer fails, so the fallback replaces the
# captured value instead of appending to it (which used to yield "000000").
reload_model() {
  log "INFO" "Attempting to reload model '$LLAMA_SWAP_MODEL_ID' via llama-swap API at $LLAMA_SWAP_URL"

  # Step 1: Check if model is currently loaded
  local check_url="${LLAMA_SWAP_URL}/upstream/${LLAMA_SWAP_MODEL_ID}/v1/models"
  log "INFO" "Checking if model is loaded: GET $check_url"
  local check_response
  check_response=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 -L \
    "$check_url" 2>/dev/null) || check_response="000"
  [[ "$check_response" =~ ^[0-9]{3}$ ]] || check_response="000"

  # 10# forces base 10 so codes with a leading zero are not read as octal
  if ((10#$check_response >= 200 && 10#$check_response < 400)); then
    # Model is loaded, unload it first
    local unload_url="${LLAMA_SWAP_URL}/api/models/unload/${LLAMA_SWAP_MODEL_ID}"
    log "INFO" "Model is loaded, unloading: POST $unload_url"
    local unload_response
    unload_response=$(curl -s -o /dev/null -w "%{http_code}" --max-time 30 \
      -X POST "$unload_url" 2>/dev/null) || unload_response="000"
    [[ "$unload_response" =~ ^[0-9]{3}$ ]] || unload_response="000"

    if [[ "$unload_response" != "200" ]]; then
      log "ERROR" "Failed to unload model '$LLAMA_SWAP_MODEL_ID' (HTTP $unload_response)"
      return 1
    fi
    log "INFO" "Model '$LLAMA_SWAP_MODEL_ID' unloaded successfully (HTTP $unload_response)"

    # Brief pause to allow cleanup
    sleep 3
  else
    log "INFO" "Model '$LLAMA_SWAP_MODEL_ID' is not loaded (HTTP $check_response), skipping unload"
  fi

  # Step 2: Load the model by hitting its upstream endpoint
  local upstream_url="${LLAMA_SWAP_URL}/upstream/${LLAMA_SWAP_MODEL_ID}"
  log "INFO" "Loading model: GET $upstream_url"
  local load_response
  load_response=$(curl -s -o /dev/null -w "%{http_code}" --max-time 60 -L \
    "${upstream_url}/v1/models" 2>/dev/null) || load_response="000"
  [[ "$load_response" =~ ^[0-9]{3}$ ]] || load_response="000"

  if ((10#$load_response < 200 || 10#$load_response >= 400)); then
    log "ERROR" "Failed to load model '$LLAMA_SWAP_MODEL_ID' (HTTP $load_response)"
    return 1
  fi
  log "INFO" "Model '$LLAMA_SWAP_MODEL_ID' loaded successfully (HTTP $load_response)"

  log "INFO" "Model '$LLAMA_SWAP_MODEL_ID' reloaded successfully"
  return 0
}
# Function to restart Llama.cpp (if enabled).
# Globals:   PROCESS_NAME, LLAMA_PID_FILE, HOME (read);
#            LLAMA_MODEL_PATH (read, optional - overrides the default model file)
# Outputs:   writes the new server's PID to LLAMA_PID_FILE; server output
#            goes to ${HOME}/llama-server-output.log
# Returns:   0 once the server has been launched in the background
#            (the launch itself is not verified)
#
# Existing matches of PROCESS_NAME get SIGTERM first; only the ones still
# alive after 5 seconds are sent SIGKILL.
restart_llama() {
  log "INFO" "Attempting to restart Llama.cpp service (process: $PROCESS_NAME)"

  # Kill any existing processes
  local pids
  pids=$(find_llama_processes) || true # "nothing found" is not an error here
  if [[ -n "$pids" ]]; then
    log "INFO" "Killing existing Llama.cpp processes: $pids"
    local pid
    # $pids is a whitespace-separated PID list; splitting is intentional
    for pid in $pids; do
      kill -TERM "$pid" 2>/dev/null || true
    done
    sleep 5 # Wait for processes to terminate
    # Force kill only what survived the polite request
    for pid in $pids; do
      if kill -0 "$pid" 2>/dev/null; then
        kill -KILL "$pid" 2>/dev/null || true
      fi
    done
  fi

  # Build the command as an array so a PROCESS_NAME or model path containing
  # spaces stays a single argument.
  local model_path="${LLAMA_MODEL_PATH:-${HOME}/.cache/huggingface/hub/models--unsloth--Qwen3-Coder-30B-A3B-Instruct-GGUF/snapshots/b17cb02dd882d5b6ab62fc777ad2995f19668350/Qwen3-Coder-30B-A3B-Instruct-Q4_K_M.gguf}"
  local -a launch_cmd=(
    "$PROCESS_NAME"
    -m "$model_path"
    -c 131072
    --host 0.0.0.0
    --port 8080
    -ngl 999
    --threads 16
    --jinja
  )

  # Log the command that is really executed
  log "INFO" "Starting Llama.cpp with command: ${launch_cmd[*]}"
  "${launch_cmd[@]}" >"${HOME}/llama-server-output.log" 2>&1 &
  echo $! >"$LLAMA_PID_FILE"
  log "INFO" "Llama.cpp restarted successfully"
}
# Run a single health check cycle.
# Globals:   RESTART_ENABLED (read)
# Returns:   0; a failed model reload is logged instead of being propagated
#
# For every PID reported by find_llama_processes that is still alive, the API
# at http://localhost:8080 is probed; when it is unhealthy and RESTART_ENABLED
# is true, a model reload through llama-swap is attempted.
run_check() {
  local pids
  # find_llama_processes may return non-zero when nothing matches; that is a
  # normal outcome and must not trip `set -e`.
  pids=$(find_llama_processes) || true
  if [[ -z "$pids" ]]; then
    log "INFO" "No Llama.cpp processes found running"
    return 0
  fi
  log "INFO" "Found Llama.cpp processes with PIDs: $pids"

  local pid
  # $pids is a whitespace-separated PID list; splitting is intentional
  for pid in $pids; do
    if ! is_process_running "$pid"; then
      log "WARN" "Process $pid is not running but still in pgrep results"
      continue
    fi

    log "INFO" "Checking health for process $pid"
    if test_api_health "http://localhost:8080"; then
      log "DEBUG" "Process $pid is healthy"
      continue
    fi

    log "ERROR" "Process $pid is unhealthy"
    if [[ "$RESTART_ENABLED" == true ]]; then
      log "INFO" "Restart functionality enabled, attempting model reload via llama-swap"
      # Under `set -e` a bare failing call would terminate the watchdog
      # (including loop mode); handle the failure explicitly instead.
      if ! reload_model; then
        log "ERROR" "Model reload via llama-swap failed"
      fi
    else
      log "INFO" "Restart functionality disabled, just logging failure"
    fi
  done
  return 0
}
# Main entry point.
# Globals:   WATCHDOG_NAME, PROCESS_NAME, LOOP_MODE, CHECK_INTERVAL (read)
# Returns:   in single-run mode, the status of the one check cycle; in loop
#            mode it does not return (the loop runs until the script is
#            terminated)
main() {
  log "INFO" "Starting $WATCHDOG_NAME for Llama.cpp monitoring (watching: $PROCESS_NAME)"

  if [[ "$LOOP_MODE" != true ]]; then
    run_check
    return
  fi

  log "INFO" "Running in loop mode (interval: ${CHECK_INTERVAL}s)"
  while true; do
    # A watchdog has to outlive a bad cycle: without this guard any failure
    # inside run_check would end the loop through `set -e`.
    if ! run_check; then
      log "ERROR" "Health check cycle failed; continuing with the next cycle" || true
    fi
    sleep "$CHECK_INTERVAL"
  done
}
# Start the watchdog only when this file is executed as a script; when it is
# sourced, just the definitions above are loaded.
if [ "${0}" = "${BASH_SOURCE[0]}" ]; then
  main "$@"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment