vLLM engine deployment
from ray import serve
from typing import List, Union, AsyncGenerator

from vllm import EngineArgs, LLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.protocol import (
    ChatCompletionRequest,
    ChatCompletionResponse,
    ChatCompletionToolsParam,
    FunctionDefinition,
    ResponseFormat,
    ErrorResponse,
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from openai.types.chat import (
    ChatCompletionMessageParam,
    ChatCompletionUserMessageParam,
    ChatCompletionSystemMessageParam,
)
from transformers import AutoTokenizer
from fastapi.responses import JSONResponse
from fastapi import Request

HF_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"

SYSTEM_PROMPT = """
You are provided with a list of external functions that may be called to complete certain tasks.
Whenever the user asks you something, you can either respond directly or invoke a function.
The decision to invoke a function is yours; only invoke functions when it makes sense to do so.
Please pay close attention to the function descriptions.
When you use a relevant function, please fill in all of its required parameters.
"""


@serve.deployment(
    autoscaling_config={"min_replicas": 0, "max_replicas": 2},
    ray_actor_options={"num_cpus": 1, "num_gpus": 1},
)
class AsyncEngine:
    def __init__(self):
        print("Setting tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)

        print("Setting engine args...")
        # engine_use_ray / worker_use_ray / distributed_executor_backend="ray"
        # run the engine and its model workers as Ray actors.
        self.engine_args = AsyncEngineArgs(
            model=HF_MODEL,
            max_num_seqs=256,
            device="cuda",  # "auto"
            engine_use_ray=True,
            worker_use_ray=True,
            distributed_executor_backend="ray",
            dtype="float16",
            trust_remote_code=True,
        )

        print("Initializing an engine...")
        self.engine = AsyncLLMEngine.from_engine_args(self.engine_args)
        print("__init__ done")

    async def generate(
        self, user_prompt: str
    ) -> Union[ErrorResponse, AsyncGenerator[str, None], ChatCompletionResponse]:
        conversation = [
            ChatCompletionSystemMessageParam(role="system", content=SYSTEM_PROMPT),
            ChatCompletionUserMessageParam(role="user", content=user_prompt),
        ]

        tool_dicts = [
            ChatCompletionToolsParam(
                type="function",
                function=FunctionDefinition(
                    name="get_current_temperature",
                    description="Get the current temperature for a specific location",
                    parameters={
                        "type": "object",
                        "properties": {
                            "location": {
                                "type": "string",
                                "description": "The city and state, e.g., San Francisco, CA",
                            },
                            "unit": {
                                "type": "string",
                                "enum": ["Celsius", "Fahrenheit"],
                                "description": "The temperature unit to use. Infer this from the user's location.",
                            },
                        },
                        "required": ["location", "unit"],
                    },
                ),
            ),
            ChatCompletionToolsParam(
                type="function",
                function=FunctionDefinition(
                    name="get_rain_probability",
                    description="Get the probability of rain for a specific location",
                    parameters={
                        "type": "object",
                        "properties": {
                            "location": {
                                "type": "string",
                                "description": "The city and state, e.g., San Francisco, CA",
                            },
                        },
                        "required": ["location"],
                    },
                ),
            ),
        ]

        request = ChatCompletionRequest(
            seed=42,
            messages=conversation,
            model=HF_MODEL,
            tools=tool_dicts,
            # tool_choice="auto" doesn't work with 0.5.0
            response_format=ResponseFormat(type="json_object"),
            guided_decoding_backend="outlines",
        )

        model_config = await self.engine.get_model_config()
        openai_serving_chat = OpenAIServingChat(
            engine=self.engine,
            model_config=model_config,
            served_model_names=[HF_MODEL],
            response_role="assistant",
        )

        generator = await openai_serving_chat.create_chat_completion(request)
        return generator

    async def __call__(self, http_request: Request) -> JSONResponse:
        body = await http_request.json()
        generator = await self.generate(body["prompts"])
        if isinstance(generator, ErrorResponse):
            return JSONResponse(content=generator.model_dump(), status_code=generator.code)
        else:
            assert isinstance(generator, ChatCompletionResponse)
            return JSONResponse(content=generator.model_dump())


async_engine_app = AsyncEngine.bind()
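
# Optional local smoke test (a sketch, not part of the original gist): with a
# Ray cluster available, `serve.run` deploys the bound application at the
# /engine prefix, mirroring what the Serve config below does via the CLI.
if __name__ == "__main__":
    serve.run(async_engine_app, route_prefix="/engine")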
# This file was generated using the `serve build` command on Ray v2.32.0.
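# A typical workflow for this config (an assumption, not spelled out in the
# original gist): regenerate it from the deployment module and submit it to a
# running Ray cluster with the Serve CLI:
#   serve build engine_deploy:async_engine_app -o serve_config.yaml
#   serve deploy serve_config.yaml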

proxy_location: EveryNode

http_options:
  host: 0.0.0.0
  port: 8000

grpc_options:
  port: 9000
  grpc_servicer_functions: []

logging_config:
  encoding: TEXT
  log_level: INFO
  logs_dir: null
  enable_access_log: true

applications:
- name: async_engine_app
  route_prefix: /engine
  import_path: engine_deploy:async_engine_app
  runtime_env: {
    "env_vars": {
      "HF_TOKEN": {HF_TOKEN}
    }
  }
  deployments:
  - name: AsyncEngine
    max_ongoing_requests: 2
    autoscaling_config:
      min_replicas: 0
      initial_replicas: 1
      max_replicas: 2
      target_num_ongoing_requests_per_replica: 1.0
      target_ongoing_requests: 1
      metrics_interval_s: 10.0
      look_back_period_s: 30.0
      smoothing_factor: 1.0
      upscaling_factor: 1.0
      downscaling_factor: 1.0
      downscale_delay_s: 120.0
      upscale_delay_s: 10.0
    ray_actor_options:
      num_cpus: 1.0
      num_gpus: 1.0
import requests

response = requests.post(
    "http://localhost:8000/engine",
    json={"prompts": "What is the current weather in New York?"},
)
print(response.text)
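
# A minimal sketch (not part of the original gist) of pulling the tool call out
# of the OpenAI-style chat completion returned above. Field names follow the
# OpenAI chat-completion schema that vLLM's ChatCompletionResponse mirrors;
# adjust if your vLLM version differs.
import json

completion = response.json()
message = completion["choices"][0]["message"]

if message.get("tool_calls"):
    # The model decided to invoke one of the declared functions.
    call = message["tool_calls"][0]["function"]
    print("function:", call["name"])
    print("arguments:", json.loads(call["arguments"]))
else:
    # The model answered directly (as JSON, because of the json_object
    # response format requested in `generate`).
    print(message["content"])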
firecrawl-py>=0.0.5
html2text==2020.1.16
langgraph==0.0.53
langsmith==0.1.60
langchain==0.2.0
langchain-community==0.2.0
langchain-openai==0.1.7
llama-index==0.10.37
llama-index-llms-openai>=0.1.15
llama-index-packs-snowflake-query-engine>=0.1.3
llama-index-postprocessor-cohere-rerank>=0.1.4
llama-index-program-openai>=0.1.5
llama-index-readers-web>=0.1.9
llama-index-vector-stores-chroma==0.1.8
llama-index-vector-stores-qdrant>=0.2.0
openai==1.30.1
psycopg[binary,pool]>=3.1.17
python-dateutil>=2.8.2
pytz>=2023.3.post1
pyvis>=0.3.2
pydantic>=2.7.1
qdrant-client>=1.8.2
s3fs>=2023.12.2
sqlalchemy>=2.0.25
sqlalchemy-utils>=0.41.2
transformers
gradio>=4.31.3
async_timeout==4.0.3
starlette<=0.34.0
sentry_sdk
ipywidgets
huggingface_hub
outlines
fastapi==0.108.0
einops
vllm==0.5.0