chadbrewbaker · June 3, 2026 19:18
diff --git a/run_gemma.sh b/run_gemma.sh
 # Launch llama-server for Gemma 4 with MTP speculative decoding.
 #
 # The arguments are collected in an array because a trailing comment after a
 # line-continuation backslash (`\ # note`) does not continue the line: the
 # backslash escapes the following space, the command ends right there with a
 # stray " " argument, and every later option line runs as its own command.
 # Inside an array literal each option can carry its own comment safely.
 llama_args=(
   -m gemma-4-12B-it-Q4_K_M.gguf               # Target model
   -md gemma-4-12B-it-assistant-Q5_K_M.gguf    # MTP drafter (small ~0.4B)
   --spec-type draft-mtp                       # Enable MTP speculative decoding
   --spec-draft-n-max 4                        # Typically 3-5 for Gemma 4 MTP (experiment)
   --spec-draft-n-min 1
   -c 131072                                   # Context (try 262144 if you have memory)
   --cache-type-k q4_0                         # 4-bit KV cache
   --cache-type-v q4_0
   -ngl 99                                     # Offload all layers to GPU
   -ngld 99                                    # Offload drafter too
   --flash-attn 1                              # Flash Attention
   --temp 0.7                                  # Or 0.0 for deterministic
   -np 1                                       # MTP often requires single parallel slot
   --jinja                                     # For proper Gemma chat template
   -b 2048 -ub 512
 )
 # Quoted "${llama_args[@]}" passes every element as exactly one argument.
 ./llama-server "${llama_args[@]}"
	# Start llama-server (Gemma 4 target model plus MTP draft model).
	#
	# Options live in an array instead of backslash-continued lines: writing
	# `\ # comment` escapes the space after the backslash rather than the
	# newline, so the original command stopped after its first option line and
	# the remaining lines were executed as separate (nonexistent) commands.
	# Array literals allow a comment after each element.
	llama_args=(
		-m gemma-4-12B-it-Q4_K_M.gguf             # Target model
		-md gemma-4-12B-it-assistant-Q5_K_M.gguf  # MTP drafter (small ~0.4B)
		--spec-type draft-mtp                     # Enable MTP speculative decoding
		--spec-draft-n-max 4                      # Typically 3-5 for Gemma 4 MTP (experiment)
		--spec-draft-n-min 1
		-c 131072                                 # Context (try 262144 if you have memory)
		--cache-type-k q4_0                       # 4-bit KV cache
		--cache-type-v q4_0
		-ngl 99                                   # Offload all layers to GPU
		-ngld 99                                  # Offload drafter too
		--flash-attn 1                            # Flash Attention
		--temp 0.7                                # Or 0.0 for deterministic
		-np 1                                     # MTP often requires single parallel slot
		--jinja                                   # For proper Gemma chat template
		-b 2048 -ub 512
	)
	# Expand quoted so each array element becomes exactly one argument.
	./llama-server "${llama_args[@]}"
No results found