llama-swap-config.yaml
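# llama-swap configuration: each macro below wraps a full llama-server command
# line. llama-swap substitutes ${PORT} itself and starts/stops the instances
# on demand when a matching model ID is requested.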
macros:
  "Qwen3-Coder-30B-macro": >
    /home/cj/tmp-llama/llama-server \
    --port ${PORT} \
    -ngl 999 \
    --ctx-size 131072 \
    --batch-size 4096 \
    --threads 16 \
    --no-webui \
    --timeout 300 \
    --flash-attn on \
    --jinja \
    --alias Qwen3-Coder-30B \
    --cache-type-k q4_0 \
    --cache-type-v q4_0 \
    -hf unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q4_K_M
    #--model /home/cj/models/unsloth_Qwen3-Coder-30B-A3B-Instruct-GGUF_Qwen3-Coder-30B-A3B-Instruct-Q4_K_M.gguf
"Qwen3-Vision-8B-macro": >
/home/cj/tmp-llama/llama-server \
--port ${PORT} \
-ngl 999 \
--ctx-size 131072 \
--threads 8 \
--no-webui \
--timeout 300 \
--flash-attn on \
--jinja \
--alias Qwen3-Vision-8B \
-hf Qwen/Qwen3-VL-8B-Instruct-GGUF:Q4_K_M
#--model /home/cj/models/Qwen_Qwen3-VL-8B-Instruct-GGUF_Qwen3VL-8B-Instruct-Q4_K_M.gguf \
#--mmproj /home/cj/models/Qwen_Qwen3-VL-8B-Instruct-GGUF_mmproj-Qwen3VL-8B-Instruct-Q8_0.gguf
"Qwen35-35B-macro": >
/home/cj/tmp-llama/llama-server \
--port ${PORT} \
-ngl 99 \
--ctx-size 131072 \
--temp 0.6 \
--top-p 0.95 \
--top-k 20 \
--min-p 0.00 \
--threads 8 \
--no-webui \
--timeout 300 \
--flash-attn on \
--jinja \
--alias Qwen35-35B \
#-hf unsloth/Qwen3.5-35B-A3B-GGUF:Q4_K_M
--model /home/cj/models/unsloth_Qwen3.5-35B-A3B-GGUF_Qwen3.5-35B-A3B-Q4_K_M.gguf
--mmproj /home/cj/models/unsloth_Qwen3.5-35B-A3B-GGUF_mmproj-F16.gguf
"Gemma-4-4B-macro": >
/home/cj/tmp-llama/llama-server \
--port ${PORT} \
-ngl 999 \
--ctx-size 131072 \
--threads 8 \
--no-webui \
--timeout 300 \
--flash-attn on \
--jinja \
--alias Gemma-4-4B \
-hf unsloth/gemma-4-E4B-it-GGUF:Q4_K_M
#--model /home/cj/models/unsloth_gemma-4-E4B-it-GGUF_gemma-4-E4B-it-Q4_K_M.gguf
#--mmproj /home/cj/models/unsloth_gemma-4-E4B-it-GGUF_mmproj-F16.gguf
"Dolphin-Mistral-24B-macro": >
/home/cj/tmp-llama/llama-server \
--port ${PORT} \
-ngl 99 \
--ctx-size 131072 \
--no-webui \
--timeout 300 \
--flash-attn on \
--jinja \
--cache-type-k q4_0 \
--cache-type-v q4_0 \
--alias Dolphin-Mistral-24B \
-hf Mungert/Dolphin-Mistral-24B-Venice-Edition-GGUF:Q4_K_M
#--model /home/cj/models/Mungert_Dolphin-Mistral-24B-Venice-Edition-GGUF_Dolphin-Mistral-24B-Venice-Edition-q4_k_m.gguf
"SmolLM3-3B-macro": >
/home/cj/tmp-llama/llama-server \
--port ${PORT} \
-ngl 99 \
--ctx-size 64000 \
--no-webui \
--timeout 300 \
--flash-attn on \
--jinja \
--cache-type-k q4_0 \
--cache-type-v q4_0 \
--alias SmolLM3-3B \
--model /home/cj/models/unsloth_SmolLM3-3B-128K-GGUF_SmolLM3-3B-128K-Q4_K_M.gguf
models:
  "Qwen3-Coder-30B": # <-- this is your model ID when calling the REST API; see the example request after this config
    cmd: |
      ${Qwen3-Coder-30B-macro}
    ttl: 3600
  "Qwen3-Vision-8B":
    cmd: |
      ${Qwen3-Vision-8B-macro}
    ttl: 3600
  "Qwen3.5-35B":
    cmd: |
      ${Qwen35-35B-macro}
    ttl: 3600
  "Gemma-4-4B":
    cmd: |
      ${Gemma-4-4B-macro}
    ttl: 3600
  "Dolphin-Mistral-24B":
    cmd: |
      ${Dolphin-Mistral-24B-macro}
    ttl: 3600
  "SmolLM3-3B":
    cmd: |
      ${SmolLM3-3B-macro}
    ttl: 3600
groups:
  "preload-group":
    swap: false
    exclusive: true
    members:
      #- "SmolLM3-3B"
      - "Gemma-4-4B"
      - "Qwen3-Coder-30B"
hooks:
  on_startup:
    # preload: a list of model IDs to load on startup
    # - optional, default: empty list
    # - model names must match keys in the models section
    # - when preloading multiple models at once, define a group,
    #   otherwise the models will be loaded and swapped out one by one
    preload:
      #- "SmolLM3-3B"
      - "Qwen3-Coder-30B"
      - "Gemma-4-4B"