cjbarker · April 4, 2026 22:15
diff --git a/gistfile1.txt b/gistfile1.txt
 macros:
  "Qwen3-Coder-30B-macro": >
    /home/cj/tmp-llama/llama-server \
      --port ${PORT} \
      -ngl 999 \
      --ctx-size 131072 \
      --batch-size 4096 \
      --threads 16 \
      --no-webui \
      --timeout 300 \
      --flash-attn on \
      --jinja \
      --alias Qwen3-Coder-30B \
      --cache-type-k q4_0 \
      --cache-type-v q4_0 \
      -hf unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q4_K_M
      #--model /home/cj/models/unsloth_Qwen3-Coder-30B-A3B-Instruct-GGUF_Qwen3-Coder-30B-A3B-Instruct-Q4_K_M.gguf

  "Qwen3-Vision-8B-macro": >
    /home/cj/tmp-llama/llama-server \
      --port ${PORT} \
      -ngl 999 \
      --ctx-size 131072 \
      --threads 8 \
      --no-webui \
      --timeout 300 \
      --flash-attn on \
      --jinja \
      --alias Qwen3-Vision-8B \
      -hf Qwen/Qwen3-VL-8B-Instruct-GGUF:Q4_K_M
      #--model /home/cj/models/Qwen_Qwen3-VL-8B-Instruct-GGUF_Qwen3VL-8B-Instruct-Q4_K_M.gguf \
      #--mmproj /home/cj/models/Qwen_Qwen3-VL-8B-Instruct-GGUF_mmproj-Qwen3VL-8B-Instruct-Q8_0.gguf

  "Qwen35-35B-macro": >
    /home/cj/tmp-llama/llama-server \
      --port ${PORT} \
      -ngl 99 \
      --ctx-size 131072 \
      --temp 0.6 \
      --top-p 0.95 \
      --top-k 20 \
      --min-p 0.00 \
      --threads 8 \
      --no-webui \
      --timeout 300 \
      --flash-attn on \
      --jinja \
      --alias Qwen35-35B \
      #-hf unsloth/Qwen3.5-35B-A3B-GGUF:Q4_K_M
      --model /home/cj/models/unsloth_Qwen3.5-35B-A3B-GGUF_Qwen3.5-35B-A3B-Q4_K_M.gguf
      --mmproj /home/cj/models/unsloth_Qwen3.5-35B-A3B-GGUF_mmproj-F16.gguf

  "Gemma-4-4B-macro": >
    /home/cj/tmp-llama/llama-server \
      --port ${PORT} \
      -ngl 999 \
      --ctx-size 131072 \
      --threads 8 \
      --no-webui \
      --timeout 300 \
      --flash-attn on \
      --jinja \
      --alias Gemma-4-4B \
      -hf unsloth/gemma-4-E4B-it-GGUF:Q4_K_M
      #--model /home/cj/models/unsloth_gemma-4-E4B-it-GGUF_gemma-4-E4B-it-Q4_K_M.gguf
      #--mmproj /home/cj/models/unsloth_gemma-4-E4B-it-GGUF_mmproj-F16.gguf

  "Dolphin-Mistral-24B-macro": >
    /home/cj/tmp-llama/llama-server \
      --port ${PORT} \
      -ngl 99 \
      --ctx-size 131072 \
      --no-webui \
      --timeout 300 \
      --flash-attn on \
      --jinja \
      --cache-type-k q4_0 \
      --cache-type-v q4_0 \
      --alias Dolphin-Mistral-24B \
      -hf Mungert/Dolphin-Mistral-24B-Venice-Edition-GGUF:Q4_K_M
      #--model /home/cj/models/Mungert_Dolphin-Mistral-24B-Venice-Edition-GGUF_Dolphin-Mistral-24B-Venice-Edition-q4_k_m.gguf

  "SmolLM3-3B-macro": >
    /home/cj/tmp-llama/llama-server \
      --port ${PORT} \
      -ngl 99 \
      --ctx-size 64000 \
      --no-webui \
      --timeout 300 \
      --flash-attn on \
      --jinja \
      --cache-type-k q4_0 \
      --cache-type-v q4_0 \
      --alias SmolLM3-3B \
      --model /home/cj/models/unsloth_SmolLM3-3B-128K-GGUF_SmolLM3-3B-128K-Q4_K_M.gguf

 models:
  "Qwen3-Coder-30B": # <-- this is your model ID when calling the REST API
    cmd: |
      ${Qwen3-Coder-30B-macro}
    ttl: 3600

  "Qwen3-Vision-8B":
    cmd: |
      ${Qwen3-Vision-8B-macro}
    ttl: 3600

  "Qwen3.5-35B":
    cmd: |
      ${Qwen35-35B-macro}
    ttl: 3600

  "Gemma-4-4B":
    cmd: |
      ${Gemma-4-4B-macro}
    ttl: 3600

  "Dolphin-Mistral-24B":
    cmd: |
      ${Dolphin-Mistral-24B-macro}
    ttl: 3600

  "SmolLM3-3B":
    cmd: |
      ${SmolLM3-3B-macro}
    ttl: 3600

 groups:
  "preload-group":
    swap: false
    exclusive: true
    members:
      #- "SmolLM3-3B"
      - "Gemma-4-4B"
      - "Qwen3-Coder-30B"

 hooks:
  on_startup:
    # preload: a list of model ids to load on startup
    # - optional, default: empty list
    # - model names must match keys in the models sections
    # - when preloading multiple models at once, define a group
    #   otherwise models will be loaded and swapped out
    preload:
      #- "SmolLM3-3B"
      - "Qwen3-Coder-30B"
      - "Gemma-4-4B"
	macros:
	"Qwen3-Coder-30B-macro": >
	/home/cj/tmp-llama/llama-server \
	--port ${PORT} \
	-ngl 999 \
	--ctx-size 131072 \
	--batch-size 4096 \
	--threads 16 \
	--no-webui \
	--timeout 300 \
	--flash-attn on \
	--jinja \
	--alias Qwen3-Coder-30B \
	--cache-type-k q4_0 \
	--cache-type-v q4_0 \
	-hf unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q4_K_M
	#--model /home/cj/models/unsloth_Qwen3-Coder-30B-A3B-Instruct-GGUF_Qwen3-Coder-30B-A3B-Instruct-Q4_K_M.gguf

	"Qwen3-Vision-8B-macro": >
	/home/cj/tmp-llama/llama-server \
	--port ${PORT} \
	-ngl 999 \
	--ctx-size 131072 \
	--threads 8 \
	--no-webui \
	--timeout 300 \
	--flash-attn on \
	--jinja \
	--alias Qwen3-Vision-8B \
	-hf Qwen/Qwen3-VL-8B-Instruct-GGUF:Q4_K_M
	#--model /home/cj/models/Qwen_Qwen3-VL-8B-Instruct-GGUF_Qwen3VL-8B-Instruct-Q4_K_M.gguf \
	#--mmproj /home/cj/models/Qwen_Qwen3-VL-8B-Instruct-GGUF_mmproj-Qwen3VL-8B-Instruct-Q8_0.gguf

	"Qwen35-35B-macro": >
	/home/cj/tmp-llama/llama-server \
	--port ${PORT} \
	-ngl 99 \
	--ctx-size 131072 \
	--temp 0.6 \
	--top-p 0.95 \
	--top-k 20 \
	--min-p 0.00 \
	--threads 8 \
	--no-webui \
	--timeout 300 \
	--flash-attn on \
	--jinja \
	--alias Qwen35-35B \
	#-hf unsloth/Qwen3.5-35B-A3B-GGUF:Q4_K_M
	--model /home/cj/models/unsloth_Qwen3.5-35B-A3B-GGUF_Qwen3.5-35B-A3B-Q4_K_M.gguf
	--mmproj /home/cj/models/unsloth_Qwen3.5-35B-A3B-GGUF_mmproj-F16.gguf

	"Gemma-4-4B-macro": >
	/home/cj/tmp-llama/llama-server \
	--port ${PORT} \
	-ngl 999 \
	--ctx-size 131072 \
	--threads 8 \
	--no-webui \
	--timeout 300 \
	--flash-attn on \
	--jinja \
	--alias Gemma-4-4B \
	-hf unsloth/gemma-4-E4B-it-GGUF:Q4_K_M
	#--model /home/cj/models/unsloth_gemma-4-E4B-it-GGUF_gemma-4-E4B-it-Q4_K_M.gguf
	#--mmproj /home/cj/models/unsloth_gemma-4-E4B-it-GGUF_mmproj-F16.gguf

	"Dolphin-Mistral-24B-macro": >
	/home/cj/tmp-llama/llama-server \
	--port ${PORT} \
	-ngl 99 \
	--ctx-size 131072 \
	--no-webui \
	--timeout 300 \
	--flash-attn on \
	--jinja \
	--cache-type-k q4_0 \
	--cache-type-v q4_0 \
	--alias Dolphin-Mistral-24B \
	-hf Mungert/Dolphin-Mistral-24B-Venice-Edition-GGUF:Q4_K_M
	#--model /home/cj/models/Mungert_Dolphin-Mistral-24B-Venice-Edition-GGUF_Dolphin-Mistral-24B-Venice-Edition-q4_k_m.gguf

	"SmolLM3-3B-macro": >
	/home/cj/tmp-llama/llama-server \
	--port ${PORT} \
	-ngl 99 \
	--ctx-size 64000 \
	--no-webui \
	--timeout 300 \
	--flash-attn on \
	--jinja \
	--cache-type-k q4_0 \
	--cache-type-v q4_0 \
	--alias SmolLM3-3B \
	--model /home/cj/models/unsloth_SmolLM3-3B-128K-GGUF_SmolLM3-3B-128K-Q4_K_M.gguf

	models:
	"Qwen3-Coder-30B": # <-- this is your model ID when calling the REST API
	cmd: \|
	${Qwen3-Coder-30B-macro}
	ttl: 3600

	"Qwen3-Vision-8B":
	cmd: \|
	${Qwen3-Vision-8B-macro}
	ttl: 3600

	"Qwen3.5-35B":
	cmd: \|
	${Qwen35-35B-macro}
	ttl: 3600

	"Gemma-4-4B":
	cmd: \|
	${Gemma-4-4B-macro}
	ttl: 3600

	"Dolphin-Mistral-24B":
	cmd: \|
	${Dolphin-Mistral-24B-macro}
	ttl: 3600

	"SmolLM3-3B":
	cmd: \|
	${SmolLM3-3B-macro}
	ttl: 3600

	groups:
	"preload-group":
	swap: false
	exclusive: true
	members:
	#- "SmolLM3-3B"
	- "Gemma-4-4B"
	- "Qwen3-Coder-30B"

	hooks:
	on_startup:
	# preload: a list of model ids to load on startup
	# - optional, default: empty list
	# - model names must match keys in the models sections
	# - when preloading multiple models at once, define a group
	# otherwise models will be loaded and swapped out
	preload:
	#- "SmolLM3-3B"
	- "Qwen3-Coder-30B"
	- "Gemma-4-4B"
No results found