Last active
April 4, 2026 22:15
-
-
Save cjbarker/545773ff1a9860b2d672667bbb1ade28 to your computer and use it in GitHub Desktop.
llama-swap-config.yaml
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
macros:
  # Reusable llama-server launch templates. ${PORT} is substituted by
  # llama-swap when it starts the process. Lines beginning with '#' inside
  # a macro are stripped by llama-swap before the command runs, so the
  # commented-out --model/--mmproj lines are inert alternatives to -hf.
  "Qwen3-Coder-30B-macro": >
    /home/cj/tmp-llama/llama-server \
    --port ${PORT} \
    -ngl 999 \
    --ctx-size 131072 \
    --batch-size 4096 \
    --threads 16 \
    --no-webui \
    --timeout 300 \
    --flash-attn on \
    --jinja \
    --alias Qwen3-Coder-30B \
    --cache-type-k q4_0 \
    --cache-type-v q4_0 \
    -hf unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q4_K_M
    #--model /home/cj/models/unsloth_Qwen3-Coder-30B-A3B-Instruct-GGUF_Qwen3-Coder-30B-A3B-Instruct-Q4_K_M.gguf
  "Qwen3-Vision-8B-macro": >
    /home/cj/tmp-llama/llama-server \
    --port ${PORT} \
    -ngl 999 \
    --ctx-size 131072 \
    --threads 8 \
    --no-webui \
    --timeout 300 \
    --flash-attn on \
    --jinja \
    --alias Qwen3-Vision-8B \
    -hf Qwen/Qwen3-VL-8B-Instruct-GGUF:Q4_K_M
    #--model /home/cj/models/Qwen_Qwen3-VL-8B-Instruct-GGUF_Qwen3VL-8B-Instruct-Q4_K_M.gguf \
    #--mmproj /home/cj/models/Qwen_Qwen3-VL-8B-Instruct-GGUF_mmproj-Qwen3VL-8B-Instruct-Q8_0.gguf
  # NOTE(review): alias below is "Qwen35-35B" while the models-section key is
  # "Qwen3.5-35B" — both names would be routable, but confirm the mismatch is intended.
  "Qwen35-35B-macro": >
    /home/cj/tmp-llama/llama-server \
    --port ${PORT} \
    -ngl 99 \
    --ctx-size 131072 \
    --temp 0.6 \
    --top-p 0.95 \
    --top-k 20 \
    --min-p 0.00 \
    --threads 8 \
    --no-webui \
    --timeout 300 \
    --flash-attn on \
    --jinja \
    --alias Qwen35-35B \
    #-hf unsloth/Qwen3.5-35B-A3B-GGUF:Q4_K_M
    --model /home/cj/models/unsloth_Qwen3.5-35B-A3B-GGUF_Qwen3.5-35B-A3B-Q4_K_M.gguf \
    --mmproj /home/cj/models/unsloth_Qwen3.5-35B-A3B-GGUF_mmproj-F16.gguf
  "Gemma-4-4B-macro": >
    /home/cj/tmp-llama/llama-server \
    --port ${PORT} \
    -ngl 999 \
    --ctx-size 131072 \
    --threads 8 \
    --no-webui \
    --timeout 300 \
    --flash-attn on \
    --jinja \
    --alias Gemma-4-4B \
    -hf unsloth/gemma-4-E4B-it-GGUF:Q4_K_M
    #--model /home/cj/models/unsloth_gemma-4-E4B-it-GGUF_gemma-4-E4B-it-Q4_K_M.gguf
    #--mmproj /home/cj/models/unsloth_gemma-4-E4B-it-GGUF_mmproj-F16.gguf
  "Dolphin-Mistral-24B-macro": >
    /home/cj/tmp-llama/llama-server \
    --port ${PORT} \
    -ngl 99 \
    --ctx-size 131072 \
    --no-webui \
    --timeout 300 \
    --flash-attn on \
    --jinja \
    --cache-type-k q4_0 \
    --cache-type-v q4_0 \
    --alias Dolphin-Mistral-24B \
    -hf Mungert/Dolphin-Mistral-24B-Venice-Edition-GGUF:Q4_K_M
    #--model /home/cj/models/Mungert_Dolphin-Mistral-24B-Venice-Edition-GGUF_Dolphin-Mistral-24B-Venice-Edition-q4_k_m.gguf
  "SmolLM3-3B-macro": >
    /home/cj/tmp-llama/llama-server \
    --port ${PORT} \
    -ngl 99 \
    --ctx-size 64000 \
    --no-webui \
    --timeout 300 \
    --flash-attn on \
    --jinja \
    --cache-type-k q4_0 \
    --cache-type-v q4_0 \
    --alias SmolLM3-3B \
    --model /home/cj/models/unsloth_SmolLM3-3B-128K-GGUF_SmolLM3-3B-128K-Q4_K_M.gguf
models:
  # Keys here are the model IDs clients send in the "model" field of
  # REST API requests; each cmd expands the matching macro defined above.
  "Qwen3-Coder-30B": # <-- this is your model ID when calling the REST API
    cmd: |
      ${Qwen3-Coder-30B-macro}
    ttl: 3600 # seconds of inactivity before the server process is unloaded
  "Qwen3-Vision-8B":
    cmd: |
      ${Qwen3-Vision-8B-macro}
    ttl: 3600
  "Qwen3.5-35B":
    cmd: |
      ${Qwen35-35B-macro}
    ttl: 3600
  "Gemma-4-4B":
    cmd: |
      ${Gemma-4-4B-macro}
    ttl: 3600
  "Dolphin-Mistral-24B":
    cmd: |
      ${Dolphin-Mistral-24B-macro}
    ttl: 3600
  "SmolLM3-3B":
    cmd: |
      ${SmolLM3-3B-macro}
    ttl: 3600
groups:
  # Group the models that should be resident together instead of being
  # swapped in and out one at a time (see hooks.on_startup.preload below).
  "preload-group":
    swap: false      # presumably: members do not swap each other out — confirm against llama-swap docs
    exclusive: true  # presumably: loading a member evicts models outside the group — confirm
    members:
      #- "SmolLM3-3B"
      - "Gemma-4-4B"
      - "Qwen3-Coder-30B"
hooks:
  on_startup:
    # preload: a list of model IDs to load on startup
    # - optional; default: empty list
    # - each name must match a key in the models section
    # - when preloading multiple models at once, define a group for them,
    #   otherwise the models will be loaded and then swapped out
    preload:
      #- "SmolLM3-3B"
      - "Qwen3-Coder-30B"
      - "Gemma-4-4B"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment