justinledwards · May 22, 2026 14:19
diff --git a/run_llama-qwen3627b-mtp-docker.sh b/run_llama-qwen3627b-mtp-docker.sh
 #!/bin/bash
 # llama-server startup script for Qwen3.6-27B Q4_K_M MTP GGUF
 # Uses official Docker image with CUDA backend
 #
 # MTP model from: https://huggingface.co/unsloth/Qwen3.6-27B-MTP-GGUF
 # Downloaded to: ~/.cache/huggingface/hub/models--unsloth--Qwen3.6-27B-MTP-GGUF/
 #
 # Key settings for 128k context on RTX 3090:
 #   -c 128144      : context (128K, VRAM-limited with MTP)
 #   -ngl 99        : offload all layers to GPU
 #   -fa on         : flash attention for speed
 #   --cache-type-k q4_0 : quantize KV cache (saves VRAM)
 #   --cache-type-v q4_0 : quantize KV cache
 #   --ctx-checkpoints 8 : number of context checkpoints, as passed below.
 #                         NOTE(review): this header used to document
 #                         "--ctx-checkpoints 0" (disable per-slot VRAM
 #                         checkpoints, ~2GB saved, fixes OOM leak #23446);
 #                         confirm which value is actually intended.
 #   --spec-type draft-mtp : enable MTP for ~1.5x speedup
 #   --spec-draft-n-max 3  : draft 3 tokens per step (sweet spot on Ampere)
 #   --spec-draft-p-min 0.0 : allow all drafts through (PR #23269, avoids 14% regression)

 # The HF cache uses a blobs directory with hash filenames; mount the whole cache.
 # Abort if HOME is unset/empty instead of bind-mounting "/.cache/huggingface".
 HF_CACHE="${HOME:?HOME must be set to locate the Hugging Face cache}/.cache/huggingface"

 # Single source of truth for the image used by both the pull and the run.
 IMAGE="ghcr.io/ggml-org/llama.cpp:server-cuda"

 # Pull the latest CUDA image if needed. This stays best-effort (e.g. when
 # offline), but a failed pull is reported instead of being silently discarded.
 if ! docker pull "${IMAGE}"; then
     printf 'warning: could not pull %s; falling back to any locally cached image\n' "${IMAGE}" >&2
 fi

 # Start llama-server in the foreground; the container is removed on exit (--rm)
 # and the HF cache is mounted read-only at /hf-cache.
 docker run --device nvidia.com/gpu=all --rm \
     -p 8080:8080 \
     -v "${HF_CACHE}:/hf-cache:ro" \
     -e LD_LIBRARY_PATH=/app \
     "${IMAGE}" \
     -m /hf-cache/hub/models--unsloth--Qwen3.6-27B-MTP-GGUF/blobs/a7cbd3ecc0e3f9b333edee61ae66bc87ed713c5d49587a8355814722ed329e0f \
     -c 128144 \
     -ngl 99 \
     -fa on \
     --threads 8 \
     --cache-type-k q4_0 \
     --cache-type-v q4_0 \
     --ctx-checkpoints 8 \
     --port 8080 \
     --alias Qwen3.6-27B-MTP \
     --spec-type draft-mtp \
     --spec-draft-n-max 3 \
     --spec-draft-p-min 0.0
	#!/bin/bash
	# llama-server startup script for Qwen3.6-27B Q4_K_M MTP GGUF
	# Uses official Docker image with CUDA backend
	#
	# MTP model from: https://huggingface.co/unsloth/Qwen3.6-27B-MTP-GGUF
	# Downloaded to: ~/.cache/huggingface/hub/models--unsloth--Qwen3.6-27B-MTP-GGUF/
	#
	# Key settings for 128k context on RTX 3090:
	#   -c 128144 : context (128K, VRAM-limited with MTP)
	#   -ngl 99 : offload all layers to GPU
	#   -fa on : flash attention for speed
	#   --cache-type-k q4_0 : quantize KV cache (saves VRAM)
	#   --cache-type-v q4_0 : quantize KV cache
	#   --ctx-checkpoints 8 : number of context checkpoints, as passed below.
	#       NOTE(review): this header used to document "--ctx-checkpoints 0"
	#       (disable per-slot VRAM checkpoints, ~2GB saved, fixes OOM leak
	#       #23446); confirm which value is actually intended.
	#   --spec-type draft-mtp : enable MTP for ~1.5x speedup
	#   --spec-draft-n-max 3 : draft 3 tokens per step (sweet spot on Ampere)
	#   --spec-draft-p-min 0.0 : allow all drafts through (PR #23269, avoids 14% regression)

	# The HF cache uses a blobs directory with hash filenames; mount the whole cache.
	# Abort if HOME is unset/empty instead of bind-mounting "/.cache/huggingface".
	HF_CACHE="${HOME:?HOME must be set to locate the Hugging Face cache}/.cache/huggingface"

	# Single source of truth for the image used by both the pull and the run.
	IMAGE="ghcr.io/ggml-org/llama.cpp:server-cuda"

	# Pull the latest CUDA image if needed. This stays best-effort (e.g. when
	# offline), but a failed pull is reported instead of being silently discarded.
	if ! docker pull "${IMAGE}"; then
	    printf 'warning: could not pull %s; falling back to any locally cached image\n' "${IMAGE}" >&2
	fi

	# Start llama-server in the foreground; the container is removed on exit (--rm)
	# and the HF cache is mounted read-only at /hf-cache.
	docker run --device nvidia.com/gpu=all --rm \
	    -p 8080:8080 \
	    -v "${HF_CACHE}:/hf-cache:ro" \
	    -e LD_LIBRARY_PATH=/app \
	    "${IMAGE}" \
	    -m /hf-cache/hub/models--unsloth--Qwen3.6-27B-MTP-GGUF/blobs/a7cbd3ecc0e3f9b333edee61ae66bc87ed713c5d49587a8355814722ed329e0f \
	    -c 128144 \
	    -ngl 99 \
	    -fa on \
	    --threads 8 \
	    --cache-type-k q4_0 \
	    --cache-type-v q4_0 \
	    --ctx-checkpoints 8 \
	    --port 8080 \
	    --alias Qwen3.6-27B-MTP \
	    --spec-type draft-mtp \
	    --spec-draft-n-max 3 \
	    --spec-draft-p-min 0.0
No results found