Created
March 28, 2026 21:47
-
-
Save ansrivas/bb21176261cc2c87a27ddf9938105d72 to your computer and use it in GitHub Desktop.
llama-server-test
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
set -Eeuo pipefail

# Usage:
#   chmod +x probe-llama-vram.sh
#   ./probe-llama-vram.sh
#
# Optional overrides:
#   MODEL="mradermacher/Huihui-Qwen3.5-27B-abliterated-GGUF:Q4_K_M" ./probe-llama-vram.sh
#   PORT=8080 TIMEOUT_SECS=90 ./probe-llama-vram.sh

# Tunables — every one can be overridden from the environment.
MODEL="${MODEL:-mradermacher/Huihui-Qwen3.5-27B-abliterated-GGUF:Q4_K_M}"
PORT="${PORT:-8080}"
TIMEOUT_SECS="${TIMEOUT_SECS:-90}"
LOG_DIR="${LOG_DIR:-./probe-logs}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-./llama-server}"

# Start conservative. Increase only if model loads.
# Order matters: first successful config is recorded, but the script continues
# to search for a better one and prints the best successful combo at the end.
CONTEXTS=(4096 8192 16384 32768)
NGLS=(4 6 8 10 12 14 16)
BATCHES=(64 128 256)

# If your fork supports these, keep them enabled.
COMMON_ARGS=(
  -hf "$MODEL"
  -ctk turbo3
  -ctv turbo3
  -fa on
  --port "$PORT"
)

mkdir -p "$LOG_DIR"
# have_cmd NAME — succeed (0) iff NAME resolves to a runnable command.
have_cmd() {
  if command -v "$1" >/dev/null 2>&1; then
    return 0
  fi
  return 1
}
# require_cmd NAME — abort the whole script with a message on stderr
# if NAME is not available on PATH.
require_cmd() {
  if have_cmd "$1"; then
    return 0
  fi
  echo "error: required command not found: $1" >&2
  exit 1
}
# Fail fast if any tool this script shells out to is missing.
for tool in bash grep awk sed timeout nvidia-smi; do
  require_cmd "$tool"
done

if [[ ! -x "$LLAMA_SERVER_BIN" ]]; then
  echo "error: llama server binary not found or not executable: $LLAMA_SERVER_BIN" >&2
  echo "hint: set LLAMA_SERVER_BIN=/full/path/to/llama-server" >&2
  exit 1
fi
# cleanup_server PID — terminate and reap a background server process.
# A missing/empty PID or an already-dead process is silently tolerated.
cleanup_server() {
  local pid="${1:-}"
  if [[ -z "$pid" ]]; then
    return 0
  fi
  kill "$pid" >/dev/null 2>&1 || true
  wait "$pid" >/dev/null 2>&1 || true
}
# _query_gpu0_mib FIELD — print the first GPU's value for an nvidia-smi
# memory field (MiB), with every whitespace character (including the
# trailing newline) stripped, matching the bare-integer contract callers
# rely on inside $(...) substitutions.
_query_gpu0_mib() {
  nvidia-smi --query-gpu="$1" --format=csv,noheader,nounits \
    | awk 'NR == 1 { gsub(/[[:space:]]/, ""); printf "%s", $0; exit }'
}

# Free / used / total VRAM in MiB for GPU 0.
free_vram_mib()  { _query_gpu0_mib memory.free; }
used_vram_mib()  { _query_gpu0_mib memory.used; }
total_vram_mib() { _query_gpu0_mib memory.total; }
# wait_for_load_result LOGFILE SECONDS
# Poll LOGFILE once per second until it contains a decisive message.
# Returns: 0 = server reported ready, 2 = model-load failure logged,
#          1 = neither appeared within SECONDS (timeout).
wait_for_load_result() {
  local logfile="$1"
  local seconds="$2"
  # Failure is checked first so an error line wins even if both appear.
  local err_re='main: exiting due to model loading error|failed to load model|cudaMalloc failed: out of memory|unable to allocate CUDA0 buffer|error loading model'
  local ok_re='HTTP server is listening|listening on|server is listening|model loaded|slot init|srv +main'
  local elapsed=0
  while (( elapsed < seconds )); do
    if grep -Eq "$err_re" "$logfile"; then
      return 2
    fi
    if grep -Eq "$ok_re" "$logfile"; then
      return 0
    fi
    sleep 1
    elapsed=$(( elapsed + 1 ))
  done
  return 1
}
# score_config NGL CTX BATCH — rank a successful configuration; higher is
# better. The weights are strictly lexicographic for the tested ranges:
# ngl dominates ctx, which dominates batch
# (ctx <= 32768 -> ctx*1000 <= 32,768,000 < 100,000,000; batch <= 256 < 1000).
# The previous weights (ngl*1000000 + ctx*100) overlapped: ctx=32768
# contributed 3,276,800, so ngl=4/ctx=32768 outscored ngl=6/ctx=4096 and a
# lower-offload config could be reported as "best".
score_config() {
  local ngl="$1"
  local ctx="$2"
  local batch="$3"
  echo $(( ngl * 100000000 + ctx * 1000 + batch ))
}
# run_probe NGL CTX BATCH
# Launch llama-server once with the given -ngl / -c / -b combination,
# wait up to TIMEOUT_SECS for the log to show readiness or a load error,
# then always tear the server down before returning.
# Returns: 0 if the model loaded successfully, 1 otherwise.
# Globals read: LOG_DIR, LLAMA_SERVER_BIN, COMMON_ARGS, TIMEOUT_SECS.
run_probe() {
  local ngl="$1"
  local ctx="$2"
  local batch="$3"
  # One log file per configuration, e.g. probe-logs/ngl_8__ctx_4096__b_64.log
  local tag="ngl_${ngl}__ctx_${ctx}__b_${batch}"
  local logfile="${LOG_DIR}/${tag}.log"
  echo
  echo "==> probing: ngl=${ngl} ctx=${ctx} batch=${batch}"
  echo " free_vram_before=$(free_vram_mib) MiB / total=$(total_vram_mib) MiB"
  : > "$logfile"
  local pid=""
  # Suspend -e for the rest of the probe: wait_for_load_result deliberately
  # returns non-zero on timeout/failure and must not abort the whole sweep.
  set +e
  # NOTE(review): TURBO_LAYER_ADAPTIVE and -ctk/-ctv turbo3 look fork-specific;
  # presumably no-ops or rejected on upstream llama.cpp — confirm with your build.
  TURBO_LAYER_ADAPTIVE=1 "$LLAMA_SERVER_BIN" \
    "${COMMON_ARGS[@]}" \
    -c "$ctx" \
    -ngl "$ngl" \
    -b "$batch" \
    >"$logfile" 2>&1 &
  pid=$!
  wait_for_load_result "$logfile" "$TIMEOUT_SECS"
  # Capture $? immediately: 0 = ready, 1 = timeout, 2 = load error in the log.
  local result=$?
  if [[ $result -eq 0 ]]; then
    # VRAM is sampled while the server is still up, before teardown.
    echo " result=SUCCESS used_vram_now=$(used_vram_mib) MiB"
    cleanup_server "$pid"
    set -e
    return 0
  fi
  # Failure path: kill the (possibly still-loading) server before reporting.
  cleanup_server "$pid"
  set -e
  case "$result" in
    1)
      echo " result=TIMEOUT"
      ;;
    2)
      echo " result=FAILED"
      ;;
    *)
      echo " result=UNKNOWN"
      ;;
  esac
  # Show the tail of the server log to make the failure diagnosable inline.
  echo " last_log_lines:"
  tail -n 20 "$logfile" | sed 's/^/ /'
  return 1
}
# ---- main sweep -------------------------------------------------------------
# Track the best-scoring successful configuration seen so far.
best_found=0
best_score=-1
best_ngl=0
best_ctx=0
best_batch=0

echo "GPU memory: total=$(total_vram_mib) MiB free=$(free_vram_mib) MiB"
echo "Model: $MODEL"
echo "Binary: $LLAMA_SERVER_BIN"
echo "Logs: $LOG_DIR"

# Strategy:
# - Iterate by context first, then ngl, then batch
# - Conservative-to-aggressive ordering
# - Record all successes, keep the "best" by a simple score
for ctx in "${CONTEXTS[@]}"; do
  for ngl in "${NGLS[@]}"; do
    for batch in "${BATCHES[@]}"; do
      run_probe "$ngl" "$ctx" "$batch" || continue
      cfg_score="$(score_config "$ngl" "$ctx" "$batch")"
      if (( cfg_score > best_score )); then
        best_score="$cfg_score"
        best_ngl="$ngl"
        best_ctx="$ctx"
        best_batch="$batch"
        best_found=1
      fi
    done
  done
done

# ---- report -----------------------------------------------------------------
echo
echo "============================================================"
if (( best_found == 1 )); then
  echo "BEST SUCCESSFUL CONFIG FOUND"
  echo " -ngl ${best_ngl}"
  echo " -c ${best_ctx}"
  echo " -b ${best_batch}"
  echo
  echo "Run it with:"
  echo
  cat <<EOF
TURBO_LAYER_ADAPTIVE=1 ${LLAMA_SERVER_BIN} \\
-hf "${MODEL}" \\
-c ${best_ctx} \\
-ngl ${best_ngl} \\
-ctk turbo3 \\
-ctv turbo3 \\
-fa on \\
-b ${best_batch} \\
--port ${PORT}
EOF
else
  echo "No tested configuration succeeded."
  echo
  echo "Try these next:"
  echo " 1. use a smaller model"
  echo " 2. reduce context below 4096"
  echo " 3. reduce batch below 64"
  echo " 4. reduce -ngl below 4"
fi
echo "============================================================"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment