GLM 4.7 Flash on vLLM, 2+ RTX 3090, 105-120 tok/s
services:
  &name vllm:
    <<: [*ai-common, *gpu]
    container_name: *name
    hostname: *name
    profiles:
      - *name
    # image: vllm/vllm-openai:cu130-nightly
    build:
      context: ./vllm
    ipc: host
    environment:
      LD_LIBRARY_PATH: "/usr/lib64:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/nvidia/lib"

      VLLM_DO_NOT_TRACK: 1
      VLLM_CPU_KVCACHE_SPACE: 8
      FLASH_ATTN: 1
      VLLM_TARGET_DEVICE: cuda
      PYTORCH_ALLOC_CONF: "expandable_segments:False"
      CUDA_VISIBLE_DEVICES: "0,1"
      TENSOR_PARALLEL_SIZE: 2
      VLLM_MARLIN_USE_ATOMIC_ADD: 1
      OMP_NUM_THREADS: 6
      VLLM_TUNED_CONFIG_FOLDER: /tuned_configs

      VLLM_USE_PRECOMPILED: 1
      NVIDIA_DISABLE_REQUIRE: "1"
      VLLM_SLEEP_WHEN_IDLE: 1

      # For GLM 4.7 Flash
      VLLM_USE_DEEP_GEMM: 0
      VLLM_USE_FLASHINFER_MOE_FP16: "1"
      VLLM_FLASHINFER_MOE_BACKEND: throughput
      VLLM_ATTENTION_BACKEND: FLASHINFER
      VLLM_USE_FLASHINFER_SAMPLER: "1"

    command:
      - "--enable-prefix-caching"
      - "--enable-chunked-prefill"
      - "--tensor-parallel-size" # split across GPUs
      - "2"
      - "--max-model-len"
      - "32768"
      - "--gpu-memory-utilization"
      - "0.85"

      # GLM 4.7 Flash
      - "--model"
      - "cyankiwi/GLM-4.7-Flash-REAP-23B-A3B-AWQ-4bit"
      - "--trust-remote-code"
      - "--dtype"
      - "bfloat16"
      - "--tool-call-parser"
      - "glm47"
      - "--reasoning-parser"
      - "glm45"
      - "--enable-auto-tool-choice"
      - "--enable-expert-parallel"
      - "--max-num-batched-tokens"
      - "8192"

    ports:
      - 8000
    volumes:
      - /mnt/llm/vllm/cache:/root/.cache/huggingface
      - /mnt/llm/vllm/torch_cache:/root/.cache/vllm/torch_compile_cache
      - /mnt/llm/vllm/tuned_configs:/tuned_configs:rw # for RTX 3090 - https://gist.github.com/sammcj/e45c2ad08a4191f0fbbaa1b842a5a778
    networks:
      - traefik-servicenet
    labels:
      traefik.enable: true
      org.hotio.pullio.update: true
      traefik.http.routers.vllm.rule: Host(`vllm.my.domain`)
      traefik.http.routers.vllm.tls.certresolver: le
      traefik.http.routers.vllm.entrypoints: websecure
      traefik.http.routers.vllm.tls.domains[0].main: "my.domain"
      traefik.http.routers.vllm.service: vllm-service
      traefik.http.services.vllm-service.loadbalancer.server.port: 8000
      # sticky sessions so websocket connections stay pinned to the same backend
      traefik.http.services.vllm-service.loadbalancer.sticky.cookie: true
      traefik.http.services.vllm-service.loadbalancer.sticky.cookie.name: "vllm"
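The *ai-common and *gpu merge anchors and the traefik-servicenet network are defined elsewhere in the full compose file; only the vllm service is included in this gist. A minimal sketch of what those definitions could look like (the anchor and network names come from the service above; every field below is an assumption):

x-ai-common: &ai-common
  restart: unless-stopped          # assumed shared defaults for the AI services
  stop_grace_period: 1m

x-gpu: &gpu
  deploy:
    resources:
      reservations:
        devices:
          - driver: nvidia         # assumed NVIDIA device reservation for the RTX 3090s
            count: all
            capabilities: [gpu]

networks:
  traefik-servicenet:
    external: true                 # assumed to be a shared network owned by the Traefik stack

Since the service sits behind the vllm profile, it only starts when that profile is enabled, e.g. docker compose --profile vllm up -d --build. The Dockerfile below is presumably the ./vllm build context referenced above; it extends the cu130 nightly vLLM image and installs transformers from main.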
FROM vllm/vllm-openai:cu130-nightly

ENV LD_LIBRARY_PATH=/usr/lib64:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/nvidia/lib
ENV CUDA_VERSION=130
ENV UV_TORCH_BACKEND=cu130

RUN uv pip install --system -U https://github.com/huggingface/transformers/archive/refs/heads/main.zip
RUN uv pip install --system --force-reinstall numba

EXPOSE 8000
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

sammcj (Author) commented Jan 27, 2026:

Notes:

  • FLASHINFER may or may not actually be used depending on your hardware; I use this compose on a few different machines, and some GPUs fall back to another flash-attention backend.
  • I generated a lightweight (incomplete) fused-MoE tuned profile for RTX 3090 cards with vLLM here: https://gist.github.com/sammcj/e45c2ad08a4191f0fbbaa1b842a5a778
