@Quentin-M
Created March 14, 2026 04:29
2x RTX3090 + WSL + vLLM + Qwen3.5-27B-AWQ-BF16-INT4
services:
  vllm:
    image: vllm/vllm-openai:nightly
    container_name: vllm
    ipc: host
    ports:
      - "8000:8000"
    volumes:
      - vllm-models:/root/.cache/huggingface
      - vllm-compile:/root/.cache/vllm
      - vllm-flashinfer:/root/.cache/flashinfer
      - vllm-torch:/root/.cache/torch
      - vllm-triton:/root/.triton
    environment:
      - VLLM_WORKER_MULTIPROC_METHOD=spawn
      - VLLM_SLEEP_WHEN_IDLE=1
      - VLLM_ENABLE_CUDAGRAPH_GC=1
      - VLLM_USE_FLASHINFER_SAMPLER=1
      - RAY_memory_monitor_refresh_ms=0
      - TORCHINDUCTOR_CACHE_DIR=/root/.cache/torch/inductor
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    command: >
      -O3
      --model cyankiwi/Qwen3.5-27B-AWQ-BF16-INT4
      --served-model-name local
      --quantization compressed-tensors
      --max-model-len=150000
      --max-num-seqs=8
      --block-size=32
      --max-num-batched-tokens=2048
      --enable-prefix-caching
      --enable-auto-tool-choice
      --tool-call-parser qwen3_coder
      --reasoning-parser qwen3
      --attention-backend FLASHINFER
      --tensor-parallel-size=2
      --gpu-memory-utilization=0.90
      --trust-remote-code
      --disable-custom-all-reduce
      --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":4}'
    # NOTE: these comments must stay outside the `command: >` folded scalar,
    # or they would be folded into the string and passed to vLLM as arguments.
    #
    # 5 speculative tokens can be faster, but is bugged right now:
    # https://github.com/vllm-project/vllm/issues/37035
    #
    # Optional: larger context window w/ fp8 kv cache:
    #   --max-model-len=170000
    #   --kv-cache-dtype fp8

# Named volumes referenced above must be declared, or `docker compose up`
# fails with an undefined-volume error.
volumes:
  vllm-models:
  vllm-compile:
  vllm-flashinfer:
  vllm-torch:
  vllm-triton:
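Once the container is up, vLLM exposes an OpenAI-compatible API on port 8000, and the model answers under the alias `local` (from `--served-model-name`). A minimal sketch of building a chat-completions request for it — the URL path is vLLM's standard route; the exact HTTP client you use is up to you:

```python
import json

# Assumption: default vLLM OpenAI-compatible route on the port mapped above.
BASE_URL = "http://localhost:8000/v1"

payload = {
    "model": "local",  # must match --served-model-name, not the HF repo name
    "messages": [{"role": "user", "content": "Say hello."}],
    "max_tokens": 64,
}

body = json.dumps(payload)
print(body)

# Send it with any HTTP client, e.g.:
#   curl $BASE_URL/chat/completions -H "Content-Type: application/json" -d "$body"
```

The OpenAI Python SDK works too: point its `base_url` at `http://localhost:8000/v1` with any non-empty API key.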