Last active
November 18, 2024 14:36
-
-
Save grahama1970/976c8e29f1dae95a503828c6c7470a35 to your computer and use it in GitHub Desktop.
The configuration deploys various models on an A5000 GPU, leveraging SGLang for long-running overnight tasks with low inference speed requirements. Successful configurations include QWEN 32B Int4, QWEN 14B FP8, and Meta Llama 3.1 8B, while QWEN 32B Int4 with TorchAO exceeds memory limits.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
services:
  # WORKS: Loads successfully on an A5000 GPU (24 GB VRAM).
  # GPTQ Int4 quantization keeps the 32B model within the card's memory.
  sglang_QWEN_32B_Int4:
    image: lmsysorg/sglang:latest
    container_name: sglang_QWEN_32B_Int4
    volumes:
      # Share the host HF cache so model weights are downloaded once.
      - ${HOME}/.cache/huggingface:/root/.cache/huggingface
    restart: always
    ports:
      - "30004:30000"  # host:container — adjust host port as needed
    environment:
      HF_TOKEN: ${HF_TOKEN}
    entrypoint: python3 -m sglang.launch_server
    command: [
      "--model-path", "Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4",
      "--mem-fraction-static", "0.95",
      "--host", "0.0.0.0",
      "--port", "30000"
    ]
    ulimits:
      memlock: -1
      stack: 67108864
    ipc: host  # required for multi-process CUDA shared memory
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:30000/health || exit 1"]
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0']
              capabilities: [gpu]
# WORKS: Loads successfully on an A5000 GPU | |
sglang_QWEN_14B_fp8: | |
image: lmsysorg/sglang:latest | |
container_name: sglang_QWEN_14B_fp8 | |
volumes: | |
- ${HOME}/.cache/huggingface:/root/.cache/huggingface | |
restart: always | |
ports: | |
- "30003:30000" # Adjust port as needed | |
environment: | |
HF_TOKEN: ${HF_TOKEN} | |
entrypoint: python3 -m sglang.launch_server | |
command: [ | |
"--model-path", "Qwen/Qwen2.5-14B-Instruct", | |
"--quantization", "fp8", | |
"--mem-fraction-static", "0.95", | |
"--host", "0.0.0.0", | |
"--port", "30000" | |
] | |
ulimits: | |
memlock: -1 | |
stack: 67108864 | |
ipc: host | |
healthcheck: | |
test: ["CMD-SHELL", "curl -f http://localhost:30000/health || exit 1"] | |
deploy: | |
resources: | |
reservations: | |
devices: | |
- driver: nvidia | |
device_ids: ['0'] | |
capabilities: [gpu] | |
# FAILS: Unquantized 14B model runs out of memory on an A5000 GPU | |
sglang_QWEN_14B: | |
image: lmsysorg/sglang:latest | |
container_name: sglang_QWEN_14B | |
volumes: | |
- ${HOME}/.cache/huggingface:/root/.cache/huggingface | |
restart: always | |
ports: | |
- "30003:30000" # Adjust port as needed | |
environment: | |
HF_TOKEN: ${HF_TOKEN} | |
entrypoint: python3 -m sglang.launch_server | |
command: [ | |
"--model-path", "Qwen/Qwen2.5-14B-Instruct", | |
# "--quantization", "fp8", | |
"--mem-fraction-static", "0.95", | |
"--host", "0.0.0.0", | |
"--port", "30000" | |
] | |
ulimits: | |
memlock: -1 | |
stack: 67108864 | |
ipc: host | |
healthcheck: | |
test: ["CMD-SHELL", "curl -f http://localhost:30000/health || exit 1"] | |
deploy: | |
resources: | |
reservations: | |
devices: | |
- driver: nvidia | |
device_ids: ['0'] | |
capabilities: [gpu] | |
# WORKS: Loads successfully on an A5000 GPU | |
sglang_Meta_Llama_3_1_8B: | |
image: lmsysorg/sglang:latest | |
container_name: sglang_Meta_Llama_3_1_8B | |
volumes: | |
- ${HOME}/.cache/huggingface:/root/.cache/huggingface | |
restart: always | |
ports: | |
- "30005:30000" # Adjust port as needed | |
environment: | |
HF_TOKEN: ${HF_TOKEN} | |
entrypoint: python3 -m sglang.launch_server | |
command: [ | |
"--model-path", "meta-llama/Meta-Llama-3.1-8B-Instruct", | |
"--host", "0.0.0.0", | |
"--port", "30000" | |
] | |
ulimits: | |
memlock: -1 | |
stack: 67108864 | |
ipc: host | |
healthcheck: | |
test: ["CMD-SHELL", "curl -f http://localhost:30000/health || exit 1"] | |
deploy: | |
resources: | |
reservations: | |
devices: | |
- driver: nvidia | |
device_ids: ['0'] | |
capabilities: [gpu] | |
# FAILS: On A5000, Runs out of memory. | |
sglang_QWEN_32B_Int4_torchao: | |
image: lmsysorg/sglang:latest | |
container_name: sglang_QWEN_32B_Int4_torchao | |
volumes: | |
- ${HOME}/.cache/huggingface:/root/.cache/huggingface | |
restart: always | |
ports: | |
- "30002:30000" # Adjust port as needed | |
environment: | |
HF_TOKEN: ${HF_TOKEN} | |
entrypoint: python3 -m sglang.launch_server | |
command: [ | |
"--model-path", "Qwen/Qwen2.5-Coder-32B-Instruct", | |
"--mem-fraction-static", "0.95", | |
"--torchao-config", "int4wo-128", | |
"--host", "0.0.0.0", | |
"--port", "30000" | |
] | |
ulimits: | |
memlock: -1 | |
stack: 67108864 | |
ipc: host | |
healthcheck: | |
test: ["CMD-SHELL", "curl -f http://localhost:30000/health || exit 1"] | |
deploy: | |
resources: | |
reservations: | |
devices: | |
- driver: nvidia | |
device_ids: ['0'] | |
capabilities: [gpu] | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment