Skip to content

Instantly share code, notes, and snippets.

@ansrivas
Created March 28, 2026 21:47
Show Gist options
  • Select an option

  • Save ansrivas/bb21176261cc2c87a27ddf9938105d72 to your computer and use it in GitHub Desktop.

Select an option

Save ansrivas/bb21176261cc2c87a27ddf9938105d72 to your computer and use it in GitHub Desktop.
llama-server-test
#!/usr/bin/env bash
set -Eeuo pipefail
# Usage:
# chmod +x probe-llama-vram.sh
# ./probe-llama-vram.sh
#
# Optional overrides:
# MODEL="mradermacher/Huihui-Qwen3.5-27B-abliterated-GGUF:Q4_K_M" ./probe-llama-vram.sh
# PORT=8080 TIMEOUT_SECS=90 ./probe-llama-vram.sh

# Tunables — every one of these can be overridden via the environment.
MODEL="${MODEL:-mradermacher/Huihui-Qwen3.5-27B-abliterated-GGUF:Q4_K_M}"
PORT="${PORT:-8080}"
TIMEOUT_SECS="${TIMEOUT_SECS:-90}"
LOG_DIR="${LOG_DIR:-./probe-logs}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-./llama-server}"

# Search space. Start conservative; increase only if the model loads.
# Order matters: first successful config is recorded, but the script continues
# to search for a better one and prints the best successful combo at the end.
CONTEXTS=(4096 8192 16384 32768)
NGLS=(4 6 8 10 12 14 16)
BATCHES=(64 128 256)

# Flags shared by every probe. If your fork supports these, keep them enabled.
COMMON_ARGS=(
  -hf "$MODEL"
  -ctk turbo3
  -ctv turbo3
  -fa on
  --port "$PORT"
)

mkdir -p "$LOG_DIR"
# True iff the named command is resolvable (PATH binary, builtin, or function).
have_cmd() {
  local candidate="$1"
  command -v "$candidate" >/dev/null 2>&1
}
# Abort the whole script with a diagnostic if a required command is missing.
require_cmd() {
  have_cmd "$1" && return 0
  echo "error: required command not found: $1" >&2
  exit 1
}
# Fail fast if any tool the probe relies on is unavailable.
for required_tool in bash grep awk sed timeout nvidia-smi; do
  require_cmd "$required_tool"
done
unset required_tool

if [[ ! -x "$LLAMA_SERVER_BIN" ]]; then
  echo "error: llama server binary not found or not executable: $LLAMA_SERVER_BIN" >&2
  echo "hint: set LLAMA_SERVER_BIN=/full/path/to/llama-server" >&2
  exit 1
fi
# Terminate a background server by PID (best effort) and reap it.
# A missing/empty PID is a no-op so callers can pass "${pid:-}" blindly.
cleanup_server() {
  local server_pid="${1:-}"
  [[ -z "$server_pid" ]] && return 0
  kill "$server_pid" >/dev/null 2>&1 || true
  wait "$server_pid" >/dev/null 2>&1 || true
}
# Query one memory field (in MiB, bare number) for GPU 0 via nvidia-smi.
# $1 - nvidia-smi --query-gpu field name, e.g. memory.free
# Shared helper: the three public wrappers below differed only in this field.
_gpu0_mem_mib() {
  nvidia-smi --query-gpu="$1" --format=csv,noheader,nounits | head -n1 | tr -d '[:space:]'
}
# Free VRAM in MiB for GPU 0.
free_vram_mib() {
  _gpu0_mem_mib memory.free
}
# Used VRAM in MiB for GPU 0.
used_vram_mib() {
  _gpu0_mem_mib memory.used
}
# Total VRAM in MiB for GPU 0.
total_vram_mib() {
  _gpu0_mem_mib memory.total
}
# Poll a llama-server logfile until it shows success or a fatal load error.
#
# $1 - path to the server's combined stdout/stderr log
# $2 - maximum seconds to poll (1-second granularity)
# $3 - (optional, backward compatible) server PID; when given and the
#      process dies before a success line appears, return failure right
#      away instead of burning the whole timeout
#
# Returns: 0 = server is up, 1 = timed out, 2 = load failed / server died.
wait_for_load_result() {
  local logfile="$1"
  local seconds="$2"
  local pid="${3:-}"
  local error_re='main: exiting due to model loading error|failed to load model|cudaMalloc failed: out of memory|unable to allocate CUDA0 buffer|error loading model'
  local ready_re='HTTP server is listening|listening on|server is listening|model loaded|slot init|srv +main'
  local i
  for ((i = 0; i < seconds; i++)); do
    if grep -Eq "$error_re" "$logfile"; then
      return 2
    fi
    if grep -Eq "$ready_re" "$logfile"; then
      return 0
    fi
    # Early exit: the process is gone and no known pattern ever showed up.
    if [[ -n "$pid" ]] && ! kill -0 "$pid" 2>/dev/null; then
      return 2
    fi
    sleep 1
  done
  return 1
}
# Rank a successful config; higher is better. Intended priority: more GPU
# layers beat more context, which beats a larger batch (lexicographic).
#
# Fix: the previous weights (ngl*1e6 + ctx*100 + batch) let a large context
# (32768 * 100 = 3,276,800) outweigh a 2-layer ngl advantage (2,000,000),
# so e.g. ngl=4/ctx=32768 beat ngl=6/ctx=4096. These weights guarantee
# strict dominance for any ctx < 100000 and batch < 1000.
score_config() {
  local ngl="$1"
  local ctx="$2"
  local batch="$3"
  echo $(( ngl * 100000000 + ctx * 1000 + batch ))
}
# Launch llama-server once with the given (ngl, ctx, batch) combo, wait for
# it to load (or fail), then shut it down.
# $1 = -ngl value, $2 = -c context size, $3 = -b batch size.
# Returns 0 if the model loaded, 1 otherwise; full server output is kept in
# "$LOG_DIR/ngl_N__ctx_N__b_N.log" for post-mortem.
run_probe() {
  local ngl="$1"
  local ctx="$2"
  local batch="$3"
  local tag="ngl_${ngl}__ctx_${ctx}__b_${batch}"
  local logfile="${LOG_DIR}/${tag}.log"
  echo
  echo "==> probing: ngl=${ngl} ctx=${ctx} batch=${batch}"
  echo " free_vram_before=$(free_vram_mib) MiB / total=$(total_vram_mib) MiB"
  : > "$logfile"
  TURBO_LAYER_ADAPTIVE=1 "$LLAMA_SERVER_BIN" \
    "${COMMON_ARGS[@]}" \
    -c "$ctx" \
    -ngl "$ngl" \
    -b "$batch" \
    >"$logfile" 2>&1 &
  local pid=$!
  # '|| result=$?' captures the status without globally toggling set +e/-e
  # as the previous version did (a return between the toggles would have
  # left errexit disabled for the rest of the script). Passing $pid lets
  # wait_for_load_result bail out early if the server process dies.
  local result=0
  wait_for_load_result "$logfile" "$TIMEOUT_SECS" "$pid" || result=$?
  if (( result == 0 )); then
    echo " result=SUCCESS used_vram_now=$(used_vram_mib) MiB"
    cleanup_server "$pid"
    return 0
  fi
  cleanup_server "$pid"
  case "$result" in
    1) echo " result=TIMEOUT" ;;
    2) echo " result=FAILED" ;;
    *) echo " result=UNKNOWN" ;;
  esac
  echo " last_log_lines:"
  tail -n 20 "$logfile" | sed 's/^/ /'
  return 1
}
# Track the best successful configuration seen so far; "best" is decided by
# score_config (higher wins), seeded so any success replaces the initial state.
best_found=0
best_score=-1
best_ngl=0
best_ctx=0
best_batch=0
echo "GPU memory: total=$(total_vram_mib) MiB free=$(free_vram_mib) MiB"
echo "Model: $MODEL"
echo "Binary: $LLAMA_SERVER_BIN"
echo "Logs: $LOG_DIR"
# Strategy:
# - Iterate by context first, then ngl, then batch
# - Conservative-to-aggressive ordering
# - Record all successes, keep the "best" by a simple score
for ctx in "${CONTEXTS[@]}"; do
for ngl in "${NGLS[@]}"; do
for batch in "${BATCHES[@]}"; do
if run_probe "$ngl" "$ctx" "$batch"; then
# NOTE(review): despite the name, 'local_score' is a plain global here --
# 'local' is only meaningful inside functions and this loop runs at top level.
local_score="$(score_config "$ngl" "$ctx" "$batch")"
if (( local_score > best_score )); then
best_score="$local_score"
best_ngl="$ngl"
best_ctx="$ctx"
best_batch="$batch"
best_found=1
fi
fi
done
done
done
echo
echo "============================================================"
if (( best_found == 1 )); then
echo "BEST SUCCESSFUL CONFIG FOUND"
echo " -ngl ${best_ngl}"
echo " -c ${best_ctx}"
echo " -b ${best_batch}"
echo
echo "Run it with:"
echo
# Unquoted EOF delimiter: the ${...} references below are expanded now, so
# the printed command is ready to copy-paste with concrete values filled in.
cat <<EOF
TURBO_LAYER_ADAPTIVE=1 ${LLAMA_SERVER_BIN} \\
-hf "${MODEL}" \\
-c ${best_ctx} \\
-ngl ${best_ngl} \\
-ctk turbo3 \\
-ctv turbo3 \\
-fa on \\
-b ${best_batch} \\
--port ${PORT}
EOF
else
# Nothing fit in VRAM: suggest progressively cheaper knobs to try by hand.
echo "No tested configuration succeeded."
echo
echo "Try these next:"
echo " 1. use a smaller model"
echo " 2. reduce context below 4096"
echo " 3. reduce batch below 64"
echo " 4. reduce -ngl below 4"
fi
echo "============================================================"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment