Last active
May 25, 2026 22:16
-
-
Save cjbarker/57cbf5bb259e4c4a4f2877a89282bfbc to your computer and use it in GitHub Desktop.
watchdog-llama.sh
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
| # Llama.cpp Process Watchdog Script | |
| # This script monitors a Llama.cpp server process and checks its health status. | |
| # If the service fails its API health checks, it can optionally reload the model via the llama-swap API. | |
| # | |
| # Usage: | |
| # ./watchdog-llama.sh [--verbose] [--loop] [--log-level=LEVEL] [process_name] | |
| # - If process_name is provided, it will monitor that specific process | |
| # - If not provided, it defaults to "/home/cj/tmp-llama/llama-server" | |
| # --verbose: Enable verbose output to STDOUT in addition to log file | |
| # --loop: Run continuously (default: run once and exit) | |
| # --log-level=[DEBUG|INFO|WARN|ERROR]: Set log level | |
set -euo pipefail

# Configuration
# Each tunable keeps its original default but can be overridden from the
# environment, e.g.:
#   CHECK_INTERVAL=10 LLAMA_SWAP_URL=http://host:9000 ./watchdog-llama.sh --loop
WATCHDOG_NAME="llama-watchdog"
LOG_FILE="${LOG_FILE:-${HOME}/.local/log/llama-watchdog.log}"
CHECK_INTERVAL="${CHECK_INTERVAL:-30}"            # seconds between checks in --loop mode
MAX_RETRIES="${MAX_RETRIES:-3}"                   # attempts per endpoint on connection failure
RESTART_ENABLED="${RESTART_ENABLED:-true}"        # "true" => try to recover an unhealthy service
LLAMA_PID_FILE="${LLAMA_PID_FILE:-/tmp/llama.pid}" # Optional: where the pid file is stored
LLAMA_SWAP_URL="${LLAMA_SWAP_URL:-http://localhost:8080}"
LLAMA_SWAP_MODEL_ID="${LLAMA_SWAP_MODEL_ID:-Qwen3.6-35B}"

# Initialize flags (set by the command-line options, not by the environment)
VERBOSE=false
LOOP_MODE=false
LOG_LEVEL="WARN" # Default log level: WARN (equivalent to level 3)
# Parse command line arguments
#######################################
# Parse the watchdog's command-line arguments.
# Globals:
#   VERBOSE, LOOP_MODE, LOG_LEVEL, PROCESS_NAME (written)
# Arguments:
#   The script's arguments ("$@"):
#     --verbose            enable STDOUT output and force LOG_LEVEL=DEBUG
#     --loop               run continuously
#     --log-level=LEVEL    or: --log-level LEVEL
#     process_name         first non-option argument
# Returns:
#   0 on success; exits with status 2 on a usage error.
#######################################
parse_args() {
  local positional=""

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --verbose)
        VERBOSE=true
        LOG_LEVEL="DEBUG" # When verbose is enabled, set log level to DEBUG
        shift
        ;;
      --loop)
        LOOP_MODE=true
        shift
        ;;
      --log-level=*)
        LOG_LEVEL="${1#*=}"
        shift
        ;;
      --log-level)
        # Without this guard "$2" is unbound under `set -u` and the script
        # dies with a cryptic shell error instead of a usage message.
        if [[ $# -lt 2 ]]; then
          echo "Error: --log-level requires a value (DEBUG|INFO|WARN|ERROR)" >&2
          exit 2
        fi
        LOG_LEVEL="$2"
        shift 2
        ;;
      -*)
        # A mistyped option must not silently become the process name.
        echo "Error: unknown option: $1" >&2
        exit 2
        ;;
      *)
        # First non-flag argument is the process name; extras are ignored
        if [[ -z "$positional" ]]; then
          positional="$1"
        fi
        shift
        ;;
    esac
  done

  # Precedence: command-line argument, then a pre-set PROCESS_NAME, then default
  if [[ -n "$positional" ]]; then
    PROCESS_NAME="$positional"
  elif [[ -z "${PROCESS_NAME:-}" ]]; then
    PROCESS_NAME="/home/cj/tmp-llama/llama-server"
  fi
}

parse_args "$@"
# Numeric severity of each log level (a larger number is more severe)
DEBUG_LEVEL=1
INFO_LEVEL=2
WARN_LEVEL=3
ERROR_LEVEL=4

#######################################
# Translate a log level name into its numeric severity.
# Globals:   DEBUG_LEVEL, INFO_LEVEL, WARN_LEVEL, ERROR_LEVEL (read)
# Arguments: $1 - level name (exact, upper-case match)
# Outputs:   the numeric severity on stdout; unrecognised names map to WARN
#######################################
get_log_level_value() {
  local level_name="$1"
  local severity=$WARN_LEVEL # fallback for anything that is not a known name

  if [ "$level_name" = "DEBUG" ]; then
    severity=$DEBUG_LEVEL
  elif [ "$level_name" = "INFO" ]; then
    severity=$INFO_LEVEL
  elif [ "$level_name" = "ERROR" ]; then
    severity=$ERROR_LEVEL
  fi

  echo "$severity"
}
#######################################
# Logging function with level filtering.
# Globals:
#   LOG_LEVEL (read)  - minimum level that gets written
#   LOG_FILE  (read)  - file every accepted message is appended to
#   VERBOSE   (read)  - when "true", the message is also echoed to stdout
#   DEBUG_LEVEL, INFO_LEVEL, WARN_LEVEL, ERROR_LEVEL (read)
# Arguments:
#   $1 - message level (DEBUG|INFO|WARN|ERROR; anything else counts as INFO)
#   $2 - message text
# Outputs:
#   Appends "[timestamp] [level] message" to LOG_FILE. If the log directory
#   cannot be created, the line is written to stderr instead.
# Returns:
#   0 unless writing to LOG_FILE itself fails.
#######################################
log() {
  local level="$1"
  local message="$2"

  # Declared separately so a failing helper is not masked by `local`.
  local current_level_value
  current_level_value=$(get_log_level_value "$LOG_LEVEL")

  local message_level_value
  case "$level" in
    "DEBUG") message_level_value=$DEBUG_LEVEL ;;
    "INFO") message_level_value=$INFO_LEVEL ;;
    "WARN") message_level_value=$WARN_LEVEL ;;
    "ERROR") message_level_value=$ERROR_LEVEL ;;
    *) message_level_value=$INFO_LEVEL ;; # Default to INFO for unknown levels
  esac

  # Only log if message level is >= current log level
  if [ "$message_level_value" -lt "$current_level_value" ]; then
    return 0
  fi

  local formatted_message
  formatted_message="[$(date '+%Y-%m-%d %H:%M:%S')] [$level] $message"

  # The log directory is not guaranteed to exist (e.g. ~/.local/log on a fresh
  # account). Without this, the first append fails and `set -e` ends the script.
  local log_dir
  log_dir=$(dirname -- "$LOG_FILE")
  if [ ! -d "$log_dir" ] && ! mkdir -p -- "$log_dir" 2>/dev/null; then
    printf '%s\n' "$formatted_message" >&2
    return 0
  fi

  if [ "$VERBOSE" = true ]; then
    printf '%s\n' "$formatted_message" | tee -a "$LOG_FILE"
  else
    printf '%s\n' "$formatted_message" >> "$LOG_FILE"
  fi
}
#######################################
# Graceful-shutdown handler: records the event in the log, then ends the
# script with a success status.
#######################################
cleanup() {
  log "INFO" "Received shutdown signal. Cleaning up..."
  exit 0
}

# Route interrupt and termination signals to the handler
trap 'cleanup' INT TERM
# Dependency check: every probe below is made with curl, so stop right away
# (after logging the reason) when it cannot be found.
if ! command -v curl > /dev/null 2>&1; then
  log "ERROR" "curl is not installed. Please install curl to use this watchdog."
  exit 1
fi
#######################################
# Report whether a PID refers to a process this user can signal.
# Arguments: $1 - process ID (may be an empty string)
# Returns:   0 if `kill -0` succeeds for the PID; 1 if the PID is empty
#            or the signal cannot be delivered
#######################################
is_process_running() {
  local target_pid=$1

  # Nothing to probe
  [ -n "$target_pid" ] || return 1

  # Signal 0 performs the existence/permission check without sending a signal
  kill -0 "$target_pid" 2>/dev/null || return 1

  return 0
}
#######################################
# Find PIDs whose full command line matches PROCESS_NAME.
# Globals:
#   PROCESS_NAME (read) - pattern handed to `pgrep -f`
# Outputs:
#   One matching PID per line on stdout (nothing when there is no match).
#   The DEBUG log line goes to stderr so it never pollutes the captured PIDs.
# Returns:
#   Always 0. "No match" is a normal result, reported as empty output, so
#   callers can capture the output safely under `set -e`.
#######################################
find_llama_processes() {
  # Look for processes containing the specified process name in their command line
  log "DEBUG" "Finding process running: $PROCESS_NAME" >&2

  # When the process name was passed as an argument, the watchdog's own command
  # line matches the pattern. That covers the main shell ($$) and the forked
  # subshell running this function inside $(...), which has the same command
  # line but a different PID ($BASHPID). Both must be excluded.
  local self_pid="$$"
  local subshell_pid="${BASHPID:-$$}"

  # `exec` replaces the capture subshell with pgrep itself, so no extra copy
  # of this script is alive while pgrep scans the process table.
  local matches
  matches=$(exec pgrep -f "$PROCESS_NAME" 2>/dev/null) || true

  local candidate
  while IFS= read -r candidate; do
    if [[ -z "$candidate" || "$candidate" == "$self_pid" || "$candidate" == "$subshell_pid" ]]; then
      continue
    fi
    printf '%s\n' "$candidate"
  done <<< "$matches"

  return 0
}
#######################################
# Probe one URL once and print its three-digit HTTP status code.
# Globals:
#   LLAMA_SWAP_MODEL_ID (read) - model named in the chat/completions request
# Arguments:
#   $1 - full URL to request
#   $2 - endpoint path; "/v1/chat/completions" is sent as a JSON POST,
#        everything else as a plain GET
# Outputs:
#   The HTTP status code on stdout, or "000" when curl fails (connection
#   refused, timeout, ...) or prints something that is not a status code.
# Returns:
#   Always 0.
#######################################
_probe_http_code() {
  local url="$1"
  local endpoint="$2"
  # --max-time keeps a hung server from blocking the watchdog
  local -a curl_args=(-s -o /dev/null -w "%{http_code}" --max-time 10)

  if [ "$endpoint" = "/v1/chat/completions" ]; then
    curl_args+=(
      -X POST
      -H "Content-Type: application/json"
      -d "{\"model\": \"${LLAMA_SWAP_MODEL_ID}\", \"messages\": [{\"role\":\"system\", \"content\":\"You are a helpful assistant.\"},{\"role\":\"user\",\"content\":\"hello!\"}]}"
    )
  fi

  # curl prints "000" itself when it gets no response. Appending another "000"
  # with `|| echo` would produce "000000", which matches none of the checks in
  # the caller, so the fallback is assigned rather than echoed.
  local code
  code=$(curl "${curl_args[@]}" "$url" 2>/dev/null) || code="000"
  [[ "$code" =~ ^[0-9]{3}$ ]] || code="000"

  printf '%s\n' "$code"
}

#######################################
# Function to test API health.
# Probes /health, /v1/models and /v1/chat/completions under the base URL.
# Globals:
#   MAX_RETRIES (read) - attempts per endpoint when the connection fails
# Arguments:
#   $1 - base URL (default: http://localhost:8080)
# Returns:
#   0 when every endpoint passes, 1 otherwise. Acceptance rules:
#     /health               200 or 503
#                           (503 presumably means "still loading" - confirm)
#     /v1/chat/completions  200 only
#     any other endpoint    any HTTP response at all, including 4xx/5xx
#   A connection failure ("000") is retried with exponential backoff and
#   marks the endpoint unhealthy once MAX_RETRIES attempts are used up.
#######################################
test_api_health() {
  local base_url="${1:-http://localhost:8080}"
  local -a endpoints=("/health" "/v1/models" "/v1/chat/completions")
  local all_healthy=true
  local endpoint full_url response_code retries

  log "INFO" "Testing API health at $base_url"

  for endpoint in "${endpoints[@]}"; do
    full_url="${base_url}${endpoint}"
    log "DEBUG" "Checking endpoint: $full_url"
    retries=0

    # Try up to MAX_RETRIES times with exponential backoff
    while [ "$retries" -lt "$MAX_RETRIES" ]; do
      response_code=$(_probe_http_code "$full_url" "$endpoint")

      if [ "$response_code" = "000" ]; then
        # Connection timeout or network error: the only retried outcome
        log "WARN" "Connection failed to $endpoint (HTTP $response_code)"
        retries=$((retries + 1))
        if [ "$retries" -lt "$MAX_RETRIES" ]; then
          sleep $((2 ** retries)) # Exponential backoff
        fi
        continue
      fi

      # A real HTTP response arrived; classify it once and move on.
      case "$endpoint" in
        "/health")
          if [ "$response_code" = "200" ] || [ "$response_code" = "503" ]; then
            log "DEBUG" "Endpoint $endpoint returned HTTP $response_code"
          else
            log "WARN" "Endpoint $endpoint returned HTTP $response_code (only 200 and 503 are acceptable for health)"
            all_healthy=false
          fi
          ;;
        "/v1/chat/completions")
          if [ "$response_code" = "200" ]; then
            log "DEBUG" "Endpoint $endpoint returned HTTP $response_code"
          else
            log "ERROR" "Endpoint $endpoint returned HTTP $response_code (expected 200)"
            all_healthy=false
          fi
          ;;
        *)
          # Any HTTP response (even 4xx/5xx) shows the server is answering
          log "DEBUG" "Endpoint $endpoint returned HTTP $response_code"
          ;;
      esac
      break
    done

    # If we exhausted retries for this endpoint, mark as unhealthy
    if [ "$retries" -ge "$MAX_RETRIES" ]; then
      log "ERROR" "Failed to reach endpoint $endpoint after $MAX_RETRIES attempts"
      all_healthy=false
    fi
  done

  # Return the overall health status
  if [ "$all_healthy" = true ]; then
    log "INFO" "All health check endpoints passed"
    return 0
  fi
  log "WARN" "One or more health check endpoints failed"
  return 1
}
#######################################
# Function to reload model via llama-swap API (unload then load).
# Globals:
#   LLAMA_SWAP_URL      (read) - base URL of the llama-swap API
#   LLAMA_SWAP_MODEL_ID (read) - model to reload
# Returns:
#   0 when the model answers on its upstream endpoint after the reload,
#   1 when the unload or the load request fails.
#######################################
reload_model() {
  log "INFO" "Attempting to reload model '$LLAMA_SWAP_MODEL_ID' via llama-swap API at $LLAMA_SWAP_URL"

  # In every capture below, a curl failure is mapped to "000" by assignment.
  # curl already prints "000" when it gets no response, so appending a second
  # one with `|| echo "000"` would log a bogus "HTTP 000000".

  # Step 1: Check if model is currently loaded
  local check_url="${LLAMA_SWAP_URL}/upstream/${LLAMA_SWAP_MODEL_ID}/v1/models"
  log "INFO" "Checking if model is loaded: GET $check_url"
  local check_response
  check_response=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 -L \
    "$check_url" 2>/dev/null) || check_response="000"
  [[ "$check_response" =~ ^[0-9]{3}$ ]] || check_response="000"

  if [ "$check_response" -ge 200 ] && [ "$check_response" -lt 400 ]; then
    # Model is loaded, unload it first
    local unload_url="${LLAMA_SWAP_URL}/api/models/unload/${LLAMA_SWAP_MODEL_ID}"
    log "INFO" "Model is loaded, unloading: POST $unload_url"
    local unload_response
    unload_response=$(curl -s -o /dev/null -w "%{http_code}" --max-time 30 \
      -X POST "$unload_url" 2>/dev/null) || unload_response="000"
    [[ "$unload_response" =~ ^[0-9]{3}$ ]] || unload_response="000"

    if [ "$unload_response" = "200" ]; then
      log "INFO" "Model '$LLAMA_SWAP_MODEL_ID' unloaded successfully (HTTP $unload_response)"
    else
      log "ERROR" "Failed to unload model '$LLAMA_SWAP_MODEL_ID' (HTTP $unload_response)"
      return 1
    fi
    # Brief pause to allow cleanup
    sleep 3
  else
    log "INFO" "Model '$LLAMA_SWAP_MODEL_ID' is not loaded (HTTP $check_response), skipping unload"
  fi

  # Step 2: Load the model by hitting its upstream endpoint
  local upstream_url="${LLAMA_SWAP_URL}/upstream/${LLAMA_SWAP_MODEL_ID}"
  log "INFO" "Loading model: GET $upstream_url"
  local load_response
  load_response=$(curl -s -o /dev/null -w "%{http_code}" --max-time 60 -L \
    "${upstream_url}/v1/models" 2>/dev/null) || load_response="000"
  [[ "$load_response" =~ ^[0-9]{3}$ ]] || load_response="000"

  if [ "$load_response" -ge 200 ] && [ "$load_response" -lt 400 ]; then
    log "INFO" "Model '$LLAMA_SWAP_MODEL_ID' loaded successfully (HTTP $load_response)"
  else
    log "ERROR" "Failed to load model '$LLAMA_SWAP_MODEL_ID' (HTTP $load_response)"
    return 1
  fi

  log "INFO" "Model '$LLAMA_SWAP_MODEL_ID' reloaded successfully"
  return 0
}
| # Function to restart Llama.cpp (if enabled) | |
| restart_llama() { | |
| log "INFO" "Attempting to restart Llama.cpp service (process: $PROCESS_NAME)" | |
| # Kill any existing processes | |
| local pids=$(find_llama_processes) | |
| if [ -n "$pids" ]; then | |
| log "INFO" "Killing existing Llama.cpp processes: $pids" | |
| echo "$pids" | xargs kill -TERM 2>/dev/null || true | |
| sleep 5 # Wait for processes to terminate | |
| # Force kill if needed | |
| echo "$pids" | xargs kill -9 2>/dev/null || true | |
| fi | |
| # Start the Llama.cpp server with the specified command | |
| # Using a more generic approach for the restart command | |
| log "INFO" "Starting Llama.cpp with command: $PROCESS_NAME --host 0.0.0.0 --port 8080" | |
| $PROCESS_NAME -m ~/.cache/huggingface/hub/models--unsloth--Qwen3-Coder-30B-A3B-Instruct-GGUF/snapshots/b17cb02dd882d5b6ab62fc777ad2995f19668350/Qwen3-Coder-30B-A3B-Instruct-Q4_K_M.gguf -c 131072 --host 0.0.0.0 --port 8080 -ngl 999 --threads 16 --jinja > ~/llama-server-output.log 2>&1 & | |
| echo $! > "$LLAMA_PID_FILE" | |
| log "INFO" "Llama.cpp restarted successfully" | |
| } | |
#######################################
# Run a single health check cycle.
# Finds the monitored processes, runs the API health check for each live
# one and, when RESTART_ENABLED is true, asks llama-swap to reload the
# model after a failed check.
# Globals:
#   RESTART_ENABLED (read)
# Returns:
#   Always 0. A failed recovery is logged, never propagated, so one bad
#   cycle cannot terminate the watchdog under `set -e`.
#######################################
run_check() {
  local pids pid
  # `|| true`: "nothing found" must not abort the script under `set -e`
  pids=$(find_llama_processes) || true

  if [ -z "$pids" ]; then
    log "INFO" "No Llama.cpp processes found running"
    return 0
  fi

  log "INFO" "Found Llama.cpp processes with PIDs: $pids"
  for pid in $pids; do
    if ! is_process_running "$pid"; then
      log "WARN" "Process $pid is not running but still in pgrep results"
      continue
    fi

    log "INFO" "Checking health for process $pid"
    if test_api_health "http://localhost:8080"; then
      log "DEBUG" "Process $pid is healthy"
      continue
    fi

    log "ERROR" "Process $pid is unhealthy"
    if [ "$RESTART_ENABLED" = true ]; then
      log "INFO" "Restart functionality enabled, attempting model reload via llama-swap"
      # Called as an `if` condition: a bare failing call would trip `set -e`
      # and kill the watchdog when recovery is most needed.
      if ! reload_model; then
        log "ERROR" "Model reload via llama-swap failed"
      fi
    else
      log "INFO" "Restart functionality disabled, just logging failure"
    fi
  done

  return 0
}
#######################################
# Main entry point: logs the start-up banner, then performs either a single
# check cycle or, in loop mode, one cycle every CHECK_INTERVAL seconds.
# Globals: WATCHDOG_NAME, PROCESS_NAME, LOOP_MODE, CHECK_INTERVAL (read)
#######################################
main() {
  log "INFO" "Starting $WATCHDOG_NAME for Llama.cpp monitoring (watching: $PROCESS_NAME)"

  # One-shot mode: a single cycle, whose status becomes the return status
  if [ "$LOOP_MODE" != true ]; then
    run_check
    return
  fi

  log "INFO" "Running in loop mode (interval: ${CHECK_INTERVAL}s)"
  while :; do
    run_check
    sleep $CHECK_INTERVAL
  done
}
# Start the watchdog only when this file is executed as a script. When it is
# sourced, $0 differs from BASH_SOURCE[0], so main is not run.
if [[ "${0}" == "${BASH_SOURCE[0]}" ]]; then
  main "$@"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment