Created August 1, 2024 16:23
vLLM engine deployment
import ray
from ray import serve

from typing import List, Union, AsyncGenerator

from vllm import EngineArgs, LLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.protocol import (
    ChatCompletionRequest,
    ChatCompletionResponse,
    ChatCompletionToolsParam,
    FunctionDefinition,
    ResponseFormat,
    ErrorResponse
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from openai.types.chat import (
    ChatCompletionMessageParam,
    ChatCompletionUserMessageParam,
    ChatCompletionSystemMessageParam
)
from transformers import AutoTokenizer
from fastapi.responses import JSONResponse
from fastapi import Request

HF_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"

SYSTEM_PROMPT = """
You're provided with a list of external functions that may be called to complete certain tasks.
Whenever the user asks you something, you can either respond directly or invoke a function.
The decision to invoke a function is yours; only invoke functions when it makes sense to do so.
Please pay close attention to the function descriptions.
When you use a relevant function, please fill in all of its required parameters.
"""


# Ray Serve deployment wrapping a vLLM AsyncLLMEngine behind an HTTP endpoint.
@serve.deployment(
    autoscaling_config={"min_replicas": 0, "max_replicas": 2},
    ray_actor_options={"num_cpus": 1, "num_gpus": 1},
)
class AsyncEngine:
    def __init__(self):
        print("Setting up tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)

        print("Setting engine args...")
        self.engine_args = AsyncEngineArgs(
            model=HF_MODEL,
            max_num_seqs=256,
            device="cuda",  # "auto"
            engine_use_ray=True,
            worker_use_ray=True,
            distributed_executor_backend="ray",
            dtype="float16",
            trust_remote_code=True
        )

        print("Initializing an engine...")
        self.engine = AsyncLLMEngine.from_engine_args(self.engine_args)
        print("__init__ done")

    async def generate(self, user_prompt: str) -> Union[ErrorResponse, AsyncGenerator[str, None], ChatCompletionResponse]:
        conversation = [
            ChatCompletionSystemMessageParam(role="system", content=SYSTEM_PROMPT),
            ChatCompletionUserMessageParam(role="user", content=user_prompt),
        ]

        # Tool definitions exposed to the model (OpenAI function-calling schema).
        tool_dicts = [
            ChatCompletionToolsParam(
                type="function",
                function=FunctionDefinition(
                    name="get_current_temperature",
                    description="Get the current temperature for a specific location",
                    parameters={
                        "type": "object",
                        "properties": {
                            "location": {
                                "type": "string",
                                "description": "The city and state, e.g., San Francisco, CA"
                            },
                            "unit": {
                                "type": "string",
                                "enum": ["Celsius", "Fahrenheit"],
                                "description": "The temperature unit to use. Infer this from the user's location."
                            }
                        },
                        "required": ["location", "unit"]
                    }
                )
            ),
            ChatCompletionToolsParam(
                type="function",
                function=FunctionDefinition(
                    name="get_rain_probability",
                    description="Get the probability of rain for a specific location",
                    parameters={
                        "type": "object",
                        "properties": {
                            "location": {
                                "type": "string",
                                "description": "The city and state, e.g., San Francisco, CA"
                            }
                        },
                        "required": ["location"]
                    }
                )
            )
        ]

        request = ChatCompletionRequest(
            seed=42,
            messages=conversation,
            model=HF_MODEL,
            tools=tool_dicts,
            # tool_choice="auto" doesn't work with 0.5.0
            response_format=ResponseFormat(type="json_object"),
            guided_decoding_backend="outlines"
        )

        model_config = await self.engine.get_model_config()
        openai_serving_chat = OpenAIServingChat(
            engine=self.engine,
            model_config=model_config,
            served_model_names=[HF_MODEL],
            response_role="assistant"
        )
        generator = await openai_serving_chat.create_chat_completion(request)
        return generator

    # HTTP entrypoint: expects a JSON body like {"prompt": "..."}.
    async def __call__(self, http_request: Request) -> JSONResponse:
        body = await http_request.json()
        generator = await self.generate(body["prompt"])
        if isinstance(generator, ErrorResponse):
            return JSONResponse(content=generator.model_dump(), status_code=generator.code)
        else:
            assert isinstance(generator, ChatCompletionResponse)
            return JSONResponse(content=generator.model_dump())


async_engine_app = AsyncEngine.bind()
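
For quick local testing, the same application object can also be started without the config file, using Ray Serve's Python API. This is a minimal sketch, not part of the original gist; the module name engine_deploy is inferred from the import_path in the config below, and it assumes a GPU is available and HF_TOKEN is already exported in the environment.

# Local-test sketch (assumption): run the app directly with serve.run instead
# of deploying the YAML config. Assumes the file above is saved as
# engine_deploy.py, matching the config's import_path.
from ray import serve
from engine_deploy import async_engine_app

serve.run(async_engine_app, name="async_engine_app", route_prefix="/engine")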
# This file was generated using the `serve build` command on Ray v2.32.0.

proxy_location: EveryNode

http_options:
  host: 0.0.0.0
  port: 8000

grpc_options:
  port: 9000
  grpc_servicer_functions: []

logging_config:
  encoding: TEXT
  log_level: INFO
  logs_dir: null
  enable_access_log: true

applications:
- name: async_engine_app
  route_prefix: /engine
  import_path: engine_deploy:async_engine_app
  runtime_env: {
    "env_vars": {
      "HF_TOKEN": {HF_TOKEN}
    }
  }
  deployments:
  - name: AsyncEngine
    autoscaling_config:
      min_replicas: 0
      initial_replicas: 1
      max_replicas: 2
      target_num_ongoing_requests_per_replica: 1.0
      target_ongoing_requests: 1
      max_ongoing_requests: 2
      metrics_interval_s: 10.0
      look_back_period_s: 30.0
      smoothing_factor: 1.0
      upscaling_factor: 1.0
      downscaling_factor: 1.0
      downscale_delay_s: 120.0
      upscale_delay_s: 10.0
    ray_actor_options:
      num_cpus: 1.0
      num_gpus: 1.0
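
Assuming the deployment file above is saved as engine_deploy.py (matching import_path) and this config as, say, serve_config.yaml, the application can be deployed onto a running Ray cluster with the Ray Serve CLI, i.e. serve deploy serve_config.yaml. The {HF_TOKEN} placeholder must be replaced with a real Hugging Face access token, since Meta-Llama-3-8B-Instruct is a gated model.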
import requests

response = requests.post(
    "http://localhost:8000/engine",
    json={"prompt": "What is the current weather in New York?"},
)
response.text
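
Because the request inside the deployment is built with response_format of type json_object, the model's reply (including any function call it chooses to describe) comes back as a JSON string inside the message content of an OpenAI-style chat completion. The following is a rough sketch of unpacking that, assuming the deployment is reachable on localhost; the keys inside the parsed payload depend entirely on how the model formats its answer.

import json
import requests

response = requests.post(
    "http://localhost:8000/engine",
    json={"prompt": "What is the current weather in New York?"},
)
completion = response.json()

# Standard OpenAI chat-completion shape: choices[0].message holds the reply.
message = completion["choices"][0]["message"]

# response_format={"type": "json_object"} forces valid JSON in the content,
# so it can be parsed directly; the field names are model-dependent.
payload = json.loads(message["content"])
print(payload)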
firecrawl-py>=0.0.5
html2text==2020.1.16
langgraph==0.0.53
langsmith==0.1.60
langchain==0.2.0
langchain-community==0.2.0
langchain-openai==0.1.7
llama-index==0.10.37
llama-index-llms-openai>=0.1.15
llama-index-packs-snowflake-query-engine>=0.1.3
llama-index-postprocessor-cohere-rerank>=0.1.4
llama-index-program-openai>=0.1.5
llama-index-readers-web>=0.1.9
llama-index-vector-stores-chroma==0.1.8
llama-index-vector-stores-qdrant>=0.2.0
openai==1.30.1
psycopg[binary,pool]>=3.1.17
python-dateutil>=2.8.2
pytz>=2023.3.post1
pyvis>=0.3.2
pydantic>=2.7.1
qdrant-client>=1.8.2
s3fs>=2023.12.2
sqlalchemy>=2.0.25
sqlalchemy-utils>=0.41.2
transformers
gradio>=4.31.3
async_timeout==4.0.3
starlette<=0.34.0
sentry_sdk
ipywidgets
huggingface_hub
outlines
fastapi==0.108.0
einops
vllm==0.5.0