Created
May 22, 2026 14:19
-
-
Save justinledwards/1c81079f072efb22b6c47df1e6353f99 to your computer and use it in GitHub Desktop.
Single 3090 Qwen3.6-27B-MTP
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# llama-server startup script for Qwen3.6-27B Q4_K_M MTP GGUF
# Uses the official llama.cpp Docker image with the CUDA backend.
#
# MTP model from: https://huggingface.co/unsloth/Qwen3.6-27B-MTP-GGUF
# Downloaded to: ~/.cache/huggingface/hub/models--unsloth--Qwen3.6-27B-MTP-GGUF/
#
# Optional environment:
#   CTX_CHECKPOINTS : value passed to --ctx-checkpoints (default: 8)
#
# Key settings for ~128k context on RTX 3090:
#   -c 128144              : context size in tokens (roughly 128K, VRAM-limited with MTP)
#   -ngl 99                : offload all layers to GPU
#   -fa on                 : flash attention for speed
#   --cache-type-k q4_0    : quantize KV cache (saves VRAM)
#   --cache-type-v q4_0    : quantize KV cache
#   --ctx-checkpoints N    : context checkpoints per slot; this script passes 8
#                            unless CTX_CHECKPOINTS is set. Per the original
#                            notes, 0 disables the per-slot VRAM checkpoints
#                            (~2GB saved, works around OOM leak #23446).
#   --spec-type draft-mtp  : enable MTP for ~1.5x speedup
#   --spec-draft-n-max 3   : draft 3 tokens per step (sweet spot on Ampere)
#   --spec-draft-p-min 0.0 : allow all drafts through (PR #23269, avoids 14% regression)

set -euo pipefail

# Container image; referenced by both the pull and the run step.
readonly IMAGE="ghcr.io/ggml-org/llama.cpp:server-cuda"

# The HF cache uses a blobs directory with hash filenames; mount the whole cache.
readonly HF_CACHE="${HOME}/.cache/huggingface"

# Model blob, relative to the cache root (same relative path on host and in container).
readonly MODEL_BLOB="hub/models--unsloth--Qwen3.6-27B-MTP-GGUF/blobs/a7cbd3ecc0e3f9b333edee61ae66bc87ed713c5d49587a8355814722ed329e0f"

# Number of context checkpoints per slot; override from the environment if needed.
readonly CTX_CHECKPOINTS="${CTX_CHECKPOINTS:-8}"

# Warn early if the model is not where the container will look for it.
# Non-fatal on purpose: with a remote Docker daemon the file may exist only there.
if [[ ! -f "${HF_CACHE}/${MODEL_BLOB}" ]]; then
  printf 'warning: model blob not found at %s\n' "${HF_CACHE}/${MODEL_BLOB}" >&2
fi

# Pull the latest CUDA image if needed. Best-effort: docker's own stderr stays
# suppressed, but a failed pull is reported so a stale or missing image is not a surprise.
if ! docker pull "${IMAGE}" 2>/dev/null; then
  printf 'warning: could not pull %s; continuing with the local image, if any\n' "${IMAGE}" >&2
fi

# Arguments handed to llama-server inside the container.
server_args=(
  -m "/hf-cache/${MODEL_BLOB}"
  -c 128144
  -ngl 99
  -fa on
  --threads 8
  --cache-type-k q4_0
  --cache-type-v q4_0
  --ctx-checkpoints "${CTX_CHECKPOINTS}"
  --port 8080
  --alias Qwen3.6-27B-MTP
  --spec-type draft-mtp
  --spec-draft-n-max 3
  --spec-draft-p-min 0.0
)

# Run the server with all GPUs exposed, the HF cache mounted read-only,
# and port 8080 published on the host.
docker run --device nvidia.com/gpu=all --rm \
  -p 8080:8080 \
  -v "${HF_CACHE}:/hf-cache:ro" \
  -e LD_LIBRARY_PATH=/app \
  "${IMAGE}" \
  "${server_args[@]}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment