@mmguero
Last active October 28, 2025 22:02
run llama-server with an LLM model chosen from a list with some default parameters
#!/usr/bin/env bash
# Default settings
DEFAULT_GPU_LAYERS=20
DEFAULT_CTX_SIZE=768
LLAMA_BIN="./llama.cpp/build/bin/llama-server"
HOST="0.0.0.0"
PORT="8832"
# Predefined configs: "filename.gguf|gpu-layers|ctx-size"
configs=(
  "gemma-3-12b-it-qat-q4_0.Q5_K_M.gguf|30|32768"
  "gemma-3-12b-it.Q5_K_M.gguf|30|32768"
  "gemma-3-4b-it.Q5_K_M.gguf|60|49152"
  "gemma-7b-it.Q5_K_M.gguf|26|4096"
  "GLM-4-9B-0414-Q5_K_M.gguf|40|15360"
  "GLM-4-9B-0414-UD-Q2_K_XL.gguf|60|20480"
  "jamba-reasoning-3b-F16.gguf|28|49152"
  "jamba-reasoning-3b-Q4_K_M.gguf|28|139264"
  "qwen2.5-coder-7b-instruct-q5_k_m.gguf|35|20480"
  "qwen2.5-coder-14b-instruct-q5_k_m.gguf|25|20480"
  "Qwen3-8B.Q5_K_M.gguf|40|11264"
)
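# Models whose filename is not listed above fall back to DEFAULT_GPU_LAYERS
# and DEFAULT_CTX_SIZE. To pin parameters for another model, add an entry in
# the same "filename|gpu-layers|ctx-size" format, e.g. (hypothetical file):
#   "my-model.Q4_K_M.gguf|32|16384"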
# Build associative arrays from configs
declare -A gpu_layers_map
declare -A ctx_size_map
for cfg in "${configs[@]}"; do
  IFS='|' read -r fname gpu ctx <<< "$cfg"
  gpu_layers_map["$fname"]="$gpu"
  ctx_size_map["$fname"]="$ctx"
done
# Find all .gguf model files (with full paths)
mapfile -t found_models < <(find ./models -type f -name "*.gguf")
# No models found?
if [[ ${#found_models[@]} -eq 0 ]]; then
  echo "No .gguf models found under ./models."
  exit 1
fi
# Show numbered menu
echo "Available models:"
for i in "${!found_models[@]}"; do
  fname=$(basename "${found_models[$i]}")
  echo "$((i+1))) $fname"
done
# Prompt user for choice
read -rp "Choose a model by number: " choice
# Validate choice
if ! [[ "$choice" =~ ^[0-9]+$ ]] || (( choice < 1 || choice > ${#found_models[@]} )); then
echo "Invalid choice."
exit 1
fi
# Get selected model info
fullpath="${found_models[$((choice-1))]}"
fname=$(basename "$fullpath")
gpu="${gpu_layers_map[$fname]:-$DEFAULT_GPU_LAYERS}"
ctx="${ctx_size_map[$fname]:-$DEFAULT_CTX_SIZE}"
# Show the selected model and the parameters it will be launched with
echo -e "\nSelected model: $fname"
echo " ➤ Full path: $fullpath"
echo " ➤ GPU layers: $gpu"
echo " ➤ Context size: $ctx"
LLAMA_CUDA_FORCE_MMQ=1 "$LLAMA_BIN" \
  -m "$fullpath" \
  --gpu-layers "$gpu" \
  --ctx-size "$ctx" \
  --port "$PORT" \
  --host "$HOST"