Last active
November 18, 2024 14:36
-
-
Save grahama1970/976c8e29f1dae95a503828c6c7470a35 to your computer and use it in GitHub Desktop.
The configuration deploys various models on an A5000 GPU, leveraging SGLang for long-running overnight tasks with low inference speed requirements. Successful configurations include QWEN 32B Int4, QWEN 14B FP8, and Meta Llama 3.1 8B, while QWEN 32B Int4 with TorchAO exceeds memory limits.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
services:
  # WORKS: Loads successfully on an A5000 GPU (24 GB VRAM).
  # GPTQ Int4 quantization keeps the 32B model within the card's memory.
  sglang_QWEN_32B_Int4:
    image: lmsysorg/sglang:latest
    container_name: sglang_QWEN_32B_Int4
    volumes:
      # Share the host HF cache so model weights are downloaded once.
      - ${HOME}/.cache/huggingface:/root/.cache/huggingface
    restart: always
    ports:
      - "30004:30000"  # host:container — adjust host port as needed
    environment:
      HF_TOKEN: ${HF_TOKEN}
    entrypoint: python3 -m sglang.launch_server
    command: [
      "--model-path", "Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4",
      "--mem-fraction-static", "0.95",
      "--host", "0.0.0.0",
      "--port", "30000"
    ]
    ulimits:
      memlock: -1
      stack: 67108864
    ipc: host  # required for multi-process CUDA shared memory
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:30000/health || exit 1"]
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0']
              capabilities: [gpu]
# WORKS: Loads successfully on an A5000 GPU | |
sglang_QWEN_14B_fp8: | |
image: lmsysorg/sglang:latest | |
container_name: sglang_QWEN_14B_fp8 | |
volumes: | |
- ${HOME}/.cache/huggingface:/root/.cache/huggingface | |
restart: always | |
ports: | |
- "30003:30000" # Adjust port as needed | |
environment: | |
HF_TOKEN: ${HF_TOKEN} | |
entrypoint: python3 -m sglang.launch_server | |
command: [ | |
"--model-path", "Qwen/Qwen2.5-14B-Instruct", | |
"--quantization", "fp8", | |
"--mem-fraction-static", "0.95", | |
"--host", "0.0.0.0", | |
"--port", "30000" | |
] | |
ulimits: | |
memlock: -1 | |
stack: 67108864 | |
ipc: host | |
healthcheck: | |
test: ["CMD-SHELL", "curl -f http://localhost:30000/health || exit 1"] | |
deploy: | |
resources: | |
reservations: | |
devices: | |
- driver: nvidia | |
device_ids: ['0'] | |
capabilities: [gpu] | |
# FAILS: Unquantized 14B model runs out of memory on an A5000 GPU | |
sglang_QWEN_14B: | |
image: lmsysorg/sglang:latest | |
container_name: sglang_QWEN_14B | |
volumes: | |
- ${HOME}/.cache/huggingface:/root/.cache/huggingface | |
restart: always | |
ports: | |
- "30003:30000" # Adjust port as needed | |
environment: | |
HF_TOKEN: ${HF_TOKEN} | |
entrypoint: python3 -m sglang.launch_server | |
command: [ | |
"--model-path", "Qwen/Qwen2.5-14B-Instruct", | |
# "--quantization", "fp8", | |
"--mem-fraction-static", "0.95", | |
"--host", "0.0.0.0", | |
"--port", "30000" | |
] | |
ulimits: | |
memlock: -1 | |
stack: 67108864 | |
ipc: host | |
healthcheck: | |
test: ["CMD-SHELL", "curl -f http://localhost:30000/health || exit 1"] | |
deploy: | |
resources: | |
reservations: | |
devices: | |
- driver: nvidia | |
device_ids: ['0'] | |
capabilities: [gpu] | |
# WORKS: Loads successfully on an A5000 GPU | |
sglang_Meta_Llama_3_1_8B: | |
image: lmsysorg/sglang:latest | |
container_name: sglang_Meta_Llama_3_1_8B | |
volumes: | |
- ${HOME}/.cache/huggingface:/root/.cache/huggingface | |
restart: always | |
ports: | |
- "30005:30000" # Adjust port as needed | |
environment: | |
HF_TOKEN: ${HF_TOKEN} | |
entrypoint: python3 -m sglang.launch_server | |
command: [ | |
"--model-path", "meta-llama/Meta-Llama-3.1-8B-Instruct", | |
"--host", "0.0.0.0", | |
"--port", "30000" | |
] | |
ulimits: | |
memlock: -1 | |
stack: 67108864 | |
ipc: host | |
healthcheck: | |
test: ["CMD-SHELL", "curl -f http://localhost:30000/health || exit 1"] | |
deploy: | |
resources: | |
reservations: | |
devices: | |
- driver: nvidia | |
device_ids: ['0'] | |
capabilities: [gpu] | |
# FAILS: On A5000, Runs out of memory. | |
sglang_QWEN_32B_Int4_torchao: | |
image: lmsysorg/sglang:latest | |
container_name: sglang_QWEN_32B_Int4_torchao | |
volumes: | |
- ${HOME}/.cache/huggingface:/root/.cache/huggingface | |
restart: always | |
ports: | |
- "30002:30000" # Adjust port as needed | |
environment: | |
HF_TOKEN: ${HF_TOKEN} | |
entrypoint: python3 -m sglang.launch_server | |
command: [ | |
"--model-path", "Qwen/Qwen2.5-Coder-32B-Instruct", | |
"--mem-fraction-static", "0.95", | |
"--torchao-config", "int4wo-128", | |
"--host", "0.0.0.0", | |
"--port", "30000" | |
] | |
ulimits: | |
memlock: -1 | |
stack: 67108864 | |
ipc: host | |
healthcheck: | |
test: ["CMD-SHELL", "curl -f http://localhost:30000/health || exit 1"] | |
deploy: | |
resources: | |
reservations: | |
devices: | |
- driver: nvidia | |
device_ids: ['0'] | |
capabilities: [gpu] | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment