# Docker Compose service: vLLM OpenAI-compatible server (2x GPU, tensor parallel).
# NOTE(review): the &name anchor makes "vllm" reusable via *name for
# container_name / hostname / profiles. The *ai-common and *gpu aliases must be
# defined earlier in this same YAML document — confirm against the full file.
services:
  &name vllm:
    <<: [*ai-common, *gpu]
    container_name: *name
    hostname: *name
    profiles:
      - *name
    # image: vllm/vllm-openai:cu130-nightly
    build:
      context: ./vllm
    # Shared host IPC namespace — required for multi-GPU NCCL shared memory.
    ipc: host
    environment:
      # Env var values quoted as strings: unquoted they parse as YAML ints/bools,
      # and environment variables are always strings at runtime.
      LD_LIBRARY_PATH: "/usr/lib64:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/nvidia/lib"
      VLLM_DO_NOT_TRACK: "1"
      VLLM_CPU_KVCACHE_SPACE: "8"
      FLASH_ATTN: "1"
      VLLM_TARGET_DEVICE: cuda
      PYTORCH_ALLOC_CONF: "expandable_segments:False"
      CUDA_VISIBLE_DEVICES: "0,1"
      # NOTE(review): likely redundant — tensor parallelism is set via the
      # --tensor-parallel-size CLI flag below; verify vLLM reads this env var.
      TENSOR_PARALLEL_SIZE: "2"
      VLLM_MARLIN_USE_ATOMIC_ADD: "1"
      OMP_NUM_THREADS: "6"
      VLLM_TUNED_CONFIG_FOLDER: /tuned_configs
      VLLM_USE_PRECOMPILED: "1"
      NVIDIA_DISABLE_REQUIRE: "1"
      VLLM_SLEEP_WHEN_IDLE: "1"
      # For GLM 4.7 Flash
      VLLM_USE_DEEP_GEMM: "0"
      VLLM_USE_FLASHINFER_MOE_FP16: "1"
      VLLM_FLASHINFER_MOE_BACKEND: throughput
      VLLM_ATTENTION_BACKEND: FLASHINFER
      VLLM_USE_FLASHINFER_SAMPLER: "1"
    command:
      - "--enable-prefix-caching"
      - "--enable-chunked-prefill"
      - "--tensor-parallel-size"  # split across GPUs
      - "2"
      - "--max-model-len"
      - "32768"
      - "--gpu-memory-utilization"
      - "0.85"
      # GLM 4.7 Flash
      - "--model"
      - "cyankiwi/GLM-4.7-Flash-REAP-23B-A3B-AWQ-4bit"
      - "--trust-remote-code"
      - "--dtype"
      - "bfloat16"
      - "--tool-call-parser"
      - "glm47"
      - "--reasoning-parser"
      - "glm45"
      - "--enable-auto-tool-choice"
      - "--enable-expert-parallel"
      - "--max-num-batched-tokens"
      - "8192"
    ports:
      # Quoted to avoid YAML's sexagesimal/number traps on port mappings.
      - "8000"
    volumes:
      - /mnt/llm/vllm/cache:/root/.cache/huggingface
      - /mnt/llm/vllm/torch_cache:/root/.cache/vllm/torch_compile_cache
      - /mnt/llm/vllm/tuned_configs:/tuned_configs:rw  # for RTX 3090 - https://gist.github.com/sammcj/e45c2ad08a4191f0fbbaa1b842a5a778
    networks:
      - traefik-servicenet
    labels:
      # Label values quoted: the Compose spec treats labels as string maps, and
      # unquoted true/false parse as YAML booleans.
      traefik.enable: "true"
      org.hotio.pullio.update: "true"
      traefik.http.routers.vllm.rule: "Host(`vllm.my.domain`)"
      traefik.http.routers.vllm.tls.certresolver: le
      traefik.http.routers.vllm.entrypoints: websecure
      traefik.http.routers.vllm.tls.domains[0].main: "my.domain"
      traefik.http.routers.vllm.service: vllm-service
      traefik.http.services.vllm-service.loadbalancer.server.port: "8000"
      # Sticky session cookie — pins a client to one backend instance
      # (session affinity; useful if the service is ever scaled beyond one replica).
      traefik.http.services.vllm-service.loadbalancer.sticky.cookie: "true"
      traefik.http.services.vllm-service.loadbalancer.sticky.cookie.name: "vllm"
# Custom vLLM image: CUDA 13.0 nightly base with transformers pinned to
# the current main branch (needed for bleeding-edge model support).
FROM vllm/vllm-openai:cu130-nightly

ENV LD_LIBRARY_PATH=/usr/lib64:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/nvidia/lib
ENV CUDA_VERSION=130
ENV UV_TORCH_BACKEND=cu130

# Single RUN layer (instead of two) to keep the image layer count down:
# install transformers@main, then force-reinstall numba so its binaries
# match the freshly-installed dependency set.
RUN uv pip install --system -U https://github.com/huggingface/transformers/archive/refs/heads/main.zip \
    && uv pip install --system --force-reinstall numba

EXPOSE 8000
# NOTE(review): the base image already defines this entrypoint; restated here
# explicitly so the served command is visible in this Dockerfile.
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
Notes: