run llama-server with an LLM chosen from a list, with some default parameters
#!/usr/bin/env bash

# Default settings
DEFAULT_GPU_LAYERS=20
DEFAULT_CTX_SIZE=768
LLAMA_BIN="./llama.cpp/build/bin/llama-server"
HOST="0.0.0.0"
PORT="8832"

# Predefined configs: "filename.gguf|gpu-layers|ctx-size"
configs=(
  "gemma-3-12b-it-qat-q4_0.Q5_K_M.gguf|30|32768"
  "gemma-3-12b-it.Q5_K_M.gguf|30|32768"
  "gemma-3-4b-it.Q5_K_M.gguf|60|49152"
  "gemma-7b-it.Q5_K_M.gguf|26|4096"
  "GLM-4-9B-0414-Q5_K_M.gguf|40|15360"
  "GLM-4-9B-0414-UD-Q2_K_XL.gguf|60|20480"
  "jamba-reasoning-3b-F16.gguf|28|49152"
  "jamba-reasoning-3b-Q4_K_M.gguf|28|139264"
  "qwen2.5-coder-7b-instruct-q5_k_m.gguf|35|20480"
  "qwen2.5-coder-14b-instruct-q5_k_m.gguf|25|20480"
  "Qwen3-8B.Q5_K_M.gguf|40|11264"
)

# Build associative arrays from configs
declare -A gpu_layers_map
declare -A ctx_size_map
for cfg in "${configs[@]}"; do
  IFS='|' read -r fname gpu ctx <<< "$cfg"
  gpu_layers_map["$fname"]="$gpu"
  ctx_size_map["$fname"]="$ctx"
done

# Find all .gguf model files (with full paths)
mapfile -t found_models < <(find ./models -type f -name "*.gguf")

# No models found?
if [[ ${#found_models[@]} -eq 0 ]]; then
| echo "No .gguf models found in current directory." | |
| exit 1 | |
| fi | |
| # Show numbered menu | |
| echo "Available models:" | |
| for i in "${!found_models[@]}"; do | |
| fname=$(basename "${found_models[$i]}") | |
| echo "$((i+1))) $fname" | |
| done | |
| # Prompt user for choice | |
| read -rp "Choose a model by number: " choice | |
| # Validate choice | |
| if ! [[ "$choice" =~ ^[0-9]+$ ]] || (( choice < 1 || choice > ${#found_models[@]} )); then | |
| echo "Invalid choice." | |
| exit 1 | |
| fi | |
| # Get selected model info | |
| fullpath="${found_models[$((choice-1))]}" | |
| fname=$(basename "$fullpath") | |
| gpu="${gpu_layers_map[$fname]:-$DEFAULT_GPU_LAYERS}" | |
| ctx="${ctx_size_map[$fname]:-$DEFAULT_CTX_SIZE}" | |
| # Dry-run echo for selected model | |
| echo -e "\n Selected model: $fname" | |
| echo " ➤ Full path: $fullpath" | |
| echo " ➤ GPU layers: $gpu" | |
| echo " ➤ Context size: $ctx" | |
| LLAMA_CUDA_FORCE_MMQ=1 "$LLAMA_BIN" \ | |
| -m "$fullpath" \ | |
| --gpu-layers "$gpu" \ | |
| --ctx-size "$ctx" \ | |
| --port "$PORT" \ | |
| --host "$HOST" | |
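Once a model is selected, the script runs llama-server in the foreground. For a quick end-to-end check from another shell, here is a minimal sketch assuming the defaults above (port 8832 on localhost) and llama-server's built-in /health and OpenAI-compatible /v1/chat/completions endpoints; it waits for the server to report ready and then sends a single chat completion:

#!/usr/bin/env bash
# Smoke test for a llama-server started by the script above.
# Assumes the script's defaults: PORT=8832, reachable on localhost.

# Poll /health until the model has finished loading.
until curl -sf "http://localhost:8832/health" >/dev/null; do
  sleep 1
done

# Send one request via the OpenAI-compatible chat endpoint.
curl -s "http://localhost:8832/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 64
      }'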