LLaMA 3 vLLM Engine in OpenAI-Compatible Mode
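This gist contains two files: a Modal app that bakes the Meta-Llama-3-8B-Instruct weights into a container image and serves them through vLLM's OpenAI-compatible API server, and a client script that exercises the resulting endpoint with the official openai Python SDK.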
import os
import subprocess

import modal

MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
MODEL_DIR = f"/models/{MODEL_NAME}"
N_GPU = 1
MINUTES = 60  # seconds


def download_model_to_image(model_dir, model_name):
    # Runs at image build time, so the weights are baked into the container image.
    from huggingface_hub import snapshot_download
    from transformers.utils import move_cache

    os.makedirs(model_dir, exist_ok=True)

    snapshot_download(
        model_name,
        local_dir=model_dir,
        ignore_patterns=["*.pt", "*.bin"],  # using safetensors
        token=os.environ["HF_TOKEN"],  # gated model: needs an approved Hugging Face token
    )
    move_cache()


vllm_image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install(
        "vllm==0.4.1",
        "hf-transfer==0.1.6",
        "huggingface_hub==0.22.2",
        "fastapi",
        "httpx",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster parallel downloads from the Hub
    .run_function(
        download_model_to_image,
        timeout=20 * MINUTES,
        kwargs={"model_dir": MODEL_DIR, "model_name": MODEL_NAME},
        secrets=[modal.Secret.from_name("huggingface-secret")],
    )
)

app = modal.App("vllm-openai-gist")


@app.function(
    image=vllm_image,
    gpu=modal.gpu.A10G(count=N_GPU),
    container_idle_timeout=8 * MINUTES,
)
@modal.web_server(
    port=8000,
    startup_timeout=5 * MINUTES,
)
def serve_vllm():
    command = (  # NCCL bug in container runtime: multi-GPU setups hang (but not on H100s)
        # "NCCL_P2P_DISABLE=1 " +  # uncomment as a workaround for multi-GPU hangs
        f"python -m vllm.entrypoints.openai.api_server --model {MODEL_DIR}"
        + f" --tensor-parallel-size {N_GPU}"
        + " --max-model-len 2048"
    )
    print("Starting server with command:", command)
    subprocess.Popen(command, shell=True)  # non-blocking: @web_server waits for port 8000
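With the Modal CLI installed and authenticated, and a Secret named huggingface-secret (containing HF_TOKEN) created in your workspace, a hot-reloading development copy of the server can be started with modal serve. Assuming the file above is saved as vllm_gist.py (the filename is illustrative):

    modal serve vllm_gist.py

The -dev suffix in the client's base_url below corresponds to this development mode; a permanent deployment via modal deploy serves at the same URL without the suffix.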
from openai import OpenAI


class Colors:
    """ANSI color codes"""

    GREEN = "\033[0;32m"
    BLUE = "\033[0;34m"
    BOLD = "\033[1m"
    END = "\033[0m"


client = OpenAI(api_key="YourSecretToken")  # placeholder: see the note below

WORKSPACE = "modal-labs"
assert WORKSPACE != "modal-labs", "Please set WORKSPACE to your own Modal workspace name"

client.base_url = f"https://{WORKSPACE}--vllm-openai-gist-serve-vllm-dev.modal.run/v1"

model = client.models.list().data[0]  # the server exposes exactly one model

print(
    Colors.GREEN,
    Colors.BOLD,
    f"Requesting completion from model {model.id}",
    Colors.END,
    sep="",
)

stream = client.chat.completions.create(
    model=model.id,  # by default, same as the model directory name
    messages=[
        {
            "role": "system",
            "content": "You are a poetic assistant, skilled in writing satirical doggerel with creative flair.",
        },
        {
            "role": "user",
            "content": "Compose a limerick about baboons and raccoons.",
        },
    ],
    stream=True,
)

print(Colors.BLUE)
for chunk in stream:
    if chunk.choices[0].delta.content is not None:
        print(chunk.choices[0].delta.content, end="")
print(Colors.END)
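Because no --api-key flag was passed to the vLLM server above, the placeholder api_key is never actually checked; it only satisfies the OpenAI client's requirement that some key be present. For comparison, a minimal non-streaming sketch of the same request, reusing the client configured above:

    response = client.chat.completions.create(
        model=model.id,
        messages=[{"role": "user", "content": "Compose a limerick about baboons and raccoons."}],
    )
    print(response.choices[0].message.content)  # full completion arrives in one response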