Skip to content

Instantly share code, notes, and snippets.

@ansrivas
Created March 28, 2026 21:47
Show Gist options
  • Select an option

  • Save ansrivas/bb21176261cc2c87a27ddf9938105d72 to your computer and use it in GitHub Desktop.

Select an option

Save ansrivas/bb21176261cc2c87a27ddf9938105d72 to your computer and use it in GitHub Desktop.
llama-server-test
#!/usr/bin/env bash
set -Eeuo pipefail
# Usage:
# chmod +x probe-llama-vram.sh
# ./probe-llama-vram.sh
#
# Optional overrides:
# MODEL="mradermacher/Huihui-Qwen3.5-27B-abliterated-GGUF:Q4_K_M" ./probe-llama-vram.sh
# PORT=8080 TIMEOUT_SECS=90 ./probe-llama-vram.sh

# Tunables — every one of these can be overridden via the environment.
MODEL="${MODEL:-mradermacher/Huihui-Qwen3.5-27B-abliterated-GGUF:Q4_K_M}"
PORT="${PORT:-8080}"
TIMEOUT_SECS="${TIMEOUT_SECS:-90}"
LOG_DIR="${LOG_DIR:-./probe-logs}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-./llama-server}"

# Search space. Start conservative; increase only if the model loads.
# Order matters: first successful config is recorded, but the script continues
# to search for a better one and prints the best successful combo at the end.
CONTEXTS=(4096 8192 16384 32768)
NGLS=(4 6 8 10 12 14 16)
BATCHES=(64 128 256)

# Flags shared by every probe. If your fork supports these, keep them enabled.
COMMON_ARGS=(
  -hf "$MODEL"
  -ctk turbo3
  -ctv turbo3
  -fa on
  --port "$PORT"
)

mkdir -p "$LOG_DIR"
# True iff the named command is resolvable (PATH binary, builtin, or function).
have_cmd() {
  local candidate="$1"
  command -v "$candidate" >/dev/null 2>&1
}
# Abort the whole script with a diagnostic if a required command is missing.
require_cmd() {
  have_cmd "$1" && return 0
  echo "error: required command not found: $1" >&2
  exit 1
}
# Fail fast if any tool the probe relies on is unavailable.
for required_tool in bash grep awk sed timeout nvidia-smi; do
  require_cmd "$required_tool"
done
unset required_tool

if [[ ! -x "$LLAMA_SERVER_BIN" ]]; then
  echo "error: llama server binary not found or not executable: $LLAMA_SERVER_BIN" >&2
  echo "hint: set LLAMA_SERVER_BIN=/full/path/to/llama-server" >&2
  exit 1
fi
# Terminate a background server by PID (best effort) and reap it.
# A missing/empty PID is a no-op so callers can pass "${pid:-}" blindly.
cleanup_server() {
  local server_pid="${1:-}"
  [[ -z "$server_pid" ]] && return 0
  kill "$server_pid" >/dev/null 2>&1 || true
  wait "$server_pid" >/dev/null 2>&1 || true
}
# Query one memory field (in MiB, bare number) for GPU 0 via nvidia-smi.
# $1 - nvidia-smi --query-gpu field name, e.g. memory.free
# Shared helper: the three public wrappers below differed only in this field.
_gpu0_mem_mib() {
  nvidia-smi --query-gpu="$1" --format=csv,noheader,nounits | head -n1 | tr -d '[:space:]'
}
# Free VRAM in MiB for GPU 0.
free_vram_mib() {
  _gpu0_mem_mib memory.free
}
# Used VRAM in MiB for GPU 0.
used_vram_mib() {
  _gpu0_mem_mib memory.used
}
# Total VRAM in MiB for GPU 0.
total_vram_mib() {
  _gpu0_mem_mib memory.total
}
# Poll a llama-server logfile until it shows success or a fatal load error.
#
# $1 - path to the server's combined stdout/stderr log
# $2 - maximum seconds to poll (1-second granularity)
# $3 - (optional, backward compatible) server PID; when given and the
#      process dies before a success line appears, return failure right
#      away instead of burning the whole timeout
#
# Returns: 0 = server is up, 1 = timed out, 2 = load failed / server died.
wait_for_load_result() {
  local logfile="$1"
  local seconds="$2"
  local pid="${3:-}"
  local error_re='main: exiting due to model loading error|failed to load model|cudaMalloc failed: out of memory|unable to allocate CUDA0 buffer|error loading model'
  local ready_re='HTTP server is listening|listening on|server is listening|model loaded|slot init|srv +main'
  local i
  for ((i = 0; i < seconds; i++)); do
    if grep -Eq "$error_re" "$logfile"; then
      return 2
    fi
    if grep -Eq "$ready_re" "$logfile"; then
      return 0
    fi
    # Early exit: the process is gone and no known pattern ever showed up.
    if [[ -n "$pid" ]] && ! kill -0 "$pid" 2>/dev/null; then
      return 2
    fi
    sleep 1
  done
  return 1
}
# Rank a successful config; higher is better. Intended priority: more GPU
# layers beat more context, which beats a larger batch (lexicographic).
#
# Fix: the previous weights (ngl*1e6 + ctx*100 + batch) let a large context
# (32768 * 100 = 3,276,800) outweigh a 2-layer ngl advantage (2,000,000),
# so e.g. ngl=4/ctx=32768 beat ngl=6/ctx=4096. These weights guarantee
# strict dominance for any ctx < 100000 and batch < 1000.
score_config() {
  local ngl="$1"
  local ctx="$2"
  local batch="$3"
  echo $(( ngl * 100000000 + ctx * 1000 + batch ))
}
# Launch llama-server once with the given (ngl, ctx, batch) combo, wait for
# it to load (or fail), then shut it down.
# $1 = -ngl value, $2 = -c context size, $3 = -b batch size.
# Returns 0 if the model loaded, 1 otherwise; full server output is kept in
# "$LOG_DIR/ngl_N__ctx_N__b_N.log" for post-mortem.
run_probe() {
  local ngl="$1"
  local ctx="$2"
  local batch="$3"
  local tag="ngl_${ngl}__ctx_${ctx}__b_${batch}"
  local logfile="${LOG_DIR}/${tag}.log"
  echo
  echo "==> probing: ngl=${ngl} ctx=${ctx} batch=${batch}"
  echo " free_vram_before=$(free_vram_mib) MiB / total=$(total_vram_mib) MiB"
  : > "$logfile"
  TURBO_LAYER_ADAPTIVE=1 "$LLAMA_SERVER_BIN" \
    "${COMMON_ARGS[@]}" \
    -c "$ctx" \
    -ngl "$ngl" \
    -b "$batch" \
    >"$logfile" 2>&1 &
  local pid=$!
  # '|| result=$?' captures the status without globally toggling set +e/-e
  # as the previous version did (a return between the toggles would have
  # left errexit disabled for the rest of the script). Passing $pid lets
  # wait_for_load_result bail out early if the server process dies.
  local result=0
  wait_for_load_result "$logfile" "$TIMEOUT_SECS" "$pid" || result=$?
  if (( result == 0 )); then
    echo " result=SUCCESS used_vram_now=$(used_vram_mib) MiB"
    cleanup_server "$pid"
    return 0
  fi
  cleanup_server "$pid"
  case "$result" in
    1) echo " result=TIMEOUT" ;;
    2) echo " result=FAILED" ;;
    *) echo " result=UNKNOWN" ;;
  esac
  echo " last_log_lines:"
  tail -n 20 "$logfile" | sed 's/^/ /'
  return 1
}
# Track the best successful configuration seen so far; "best" is decided by
# score_config (higher wins), seeded so any success replaces the initial state.
best_found=0
best_score=-1
best_ngl=0
best_ctx=0
best_batch=0
echo "GPU memory: total=$(total_vram_mib) MiB free=$(free_vram_mib) MiB"
echo "Model: $MODEL"
echo "Binary: $LLAMA_SERVER_BIN"
echo "Logs: $LOG_DIR"
# Strategy:
# - Iterate by context first, then ngl, then batch
# - Conservative-to-aggressive ordering
# - Record all successes, keep the "best" by a simple score
for ctx in "${CONTEXTS[@]}"; do
for ngl in "${NGLS[@]}"; do
for batch in "${BATCHES[@]}"; do
if run_probe "$ngl" "$ctx" "$batch"; then
# NOTE(review): despite the name, 'local_score' is a plain global here --
# 'local' is only meaningful inside functions and this loop runs at top level.
local_score="$(score_config "$ngl" "$ctx" "$batch")"
if (( local_score > best_score )); then
best_score="$local_score"
best_ngl="$ngl"
best_ctx="$ctx"
best_batch="$batch"
best_found=1
fi
fi
done
done
done
echo
echo "============================================================"
if (( best_found == 1 )); then
echo "BEST SUCCESSFUL CONFIG FOUND"
echo " -ngl ${best_ngl}"
echo " -c ${best_ctx}"
echo " -b ${best_batch}"
echo
echo "Run it with:"
echo
# Unquoted EOF delimiter: the ${...} references below are expanded now, so
# the printed command is ready to copy-paste with concrete values filled in.
cat <<EOF
TURBO_LAYER_ADAPTIVE=1 ${LLAMA_SERVER_BIN} \\
-hf "${MODEL}" \\
-c ${best_ctx} \\
-ngl ${best_ngl} \\
-ctk turbo3 \\
-ctv turbo3 \\
-fa on \\
-b ${best_batch} \\
--port ${PORT}
EOF
else
# Nothing fit in VRAM: suggest progressively cheaper knobs to try by hand.
echo "No tested configuration succeeded."
echo
echo "Try these next:"
echo " 1. use a smaller model"
echo " 2. reduce context below 4096"
echo " 3. reduce batch below 64"
echo " 4. reduce -ngl below 4"
fi
echo "============================================================"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment