run llama-server with an LLM chosen from a list, with some default parameters
#!/usr/bin/env bash

# Default settings
DEFAULT_GPU_LAYERS=20
DEFAULT_CTX_SIZE=768
LLAMA_BIN="./llama.cpp/build/bin/llama-server"
HOST="0.0.0.0"
PORT="8832"

# Predefined configs: "filename.gguf|gpu-layers|ctx-size"
configs=(
  "gemma-3-12b-it-qat-q4_0.Q5_K_M.gguf|30|32768"
  "gemma-3-12b-it.Q5_K_M.gguf|30|32768"
  "gemma-3-4b-it.Q5_K_M.gguf|60|49152"
  "gemma-7b-it.Q5_K_M.gguf|26|4096"
  "GLM-4-9B-0414-Q5_K_M.gguf|40|15360"
  "GLM-4-9B-0414-UD-Q2_K_XL.gguf|60|20480"
  "jamba-reasoning-3b-F16.gguf|28|49152"
  "jamba-reasoning-3b-Q4_K_M.gguf|28|139264"
  "qwen2.5-coder-7b-instruct-q5_k_m.gguf|35|20480"
  "qwen2.5-coder-14b-instruct-q5_k_m.gguf|25|20480"
  "Qwen3-8B.Q5_K_M.gguf|40|11264"
)

# Build associative arrays from configs
declare -A gpu_layers_map
declare -A ctx_size_map
for cfg in "${configs[@]}"; do
  IFS='|' read -r fname gpu ctx <<< "$cfg"
  gpu_layers_map["$fname"]="$gpu"
  ctx_size_map["$fname"]="$ctx"
done

# Find all .gguf model files (with full paths)
mapfile -t found_models < <(find ./models -type f -name "*.gguf")

# No models found?
if [[ ${#found_models[@]} -eq 0 ]]; then
| echo "No .gguf models found in current directory." | |
| exit 1 | |
| fi | |
| # Show numbered menu | |
| echo "Available models:" | |
| for i in "${!found_models[@]}"; do | |
| fname=$(basename "${found_models[$i]}") | |
| echo "$((i+1))) $fname" | |
| done | |
| # Prompt user for choice | |
| read -rp "Choose a model by number: " choice | |
| # Validate choice | |
| if ! [[ "$choice" =~ ^[0-9]+$ ]] || (( choice < 1 || choice > ${#found_models[@]} )); then | |
| echo "Invalid choice." | |
| exit 1 | |
| fi | |
| # Get selected model info | |
| fullpath="${found_models[$((choice-1))]}" | |
| fname=$(basename "$fullpath") | |
| gpu="${gpu_layers_map[$fname]:-$DEFAULT_GPU_LAYERS}" | |
| ctx="${ctx_size_map[$fname]:-$DEFAULT_CTX_SIZE}" | |
| # Dry-run echo for selected model | |
| echo -e "\n Selected model: $fname" | |
| echo " ➤ Full path: $fullpath" | |
| echo " ➤ GPU layers: $gpu" | |
| echo " ➤ Context size: $ctx" | |
| LLAMA_CUDA_FORCE_MMQ=1 "$LLAMA_BIN" \ | |
| -m "$fullpath" \ | |
| --gpu-layers "$gpu" \ | |
| --ctx-size "$ctx" \ | |
| --port "$PORT" \ | |
| --host "$HOST" | |
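Once a model is selected, the script runs llama-server in the foreground. For a quick end-to-end check from another shell, here is a minimal sketch assuming the defaults above (port 8832 on localhost) and llama-server's built-in /health and OpenAI-compatible /v1/chat/completions endpoints; it waits for the server to report ready and then sends a single chat completion:

#!/usr/bin/env bash
# Smoke test for a llama-server started by the script above.
# Assumes the script's defaults: PORT=8832, reachable on localhost.

# Poll /health until the model has finished loading.
until curl -sf "http://localhost:8832/health" >/dev/null; do
  sleep 1
done

# Send one request via the OpenAI-compatible chat endpoint.
curl -s "http://localhost:8832/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 64
      }'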