Deploy LLMs with full OpenAI API compatibility using Ray Serve primitives.
- Automatic scaling and load balancing
- Tensor parallelism for large models
- JSON mode and function calling
- Multi-LoRA adapter support
- Custom model composition
- Out-of-the-box support for TRT-LLM and vLLM
Get started with the LLM features using the VLLMDeployment and LLMConfig APIs.
from ray import serve
from ray.serve.llm import VLLMDeployment, LLMConfig

# Create a basic LLM deployment
@serve.deployment
class MyLLMApp:
    def __init__(self):
        self.llm_config = LLMConfig(
            model_id="meta-llama/Llama-3.1-8B-Instruct",
            accelerator_type="A100_80G",
            tensor_parallelism=2,
        )
        self.deployment = VLLMDeployment.build(self.llm_config)
        # Equivalent lower-level form:
        # deployment_params = self.llm_config.get_deployment_args()
        # self.deployment = VLLMDeployment.options(**deployment_params) \
        #     .bind(base_config=self.llm_config)

    async def __call__(self, prompt: str):
        return await self.deployment.predict(prompt)

# Deploy the app
app = MyLLMApp.bind()
Save this code in a file named `app.py`, then run it with:

serve run app:app
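Alternatively, you can smoke-test the deployment from Python with a Serve deployment handle instead of the CLI. The snippet below is a minimal sketch run in the same process where `app` is defined, and it assumes recent Ray Serve handle semantics (`serve.run` returning a handle whose responses expose `.result()`):

from ray import serve

# Start the application and get a handle to it
handle = serve.run(app)

# Send a prompt through the handle and block on the response
result = handle.remote("What is Ray Serve?").result()
print(result)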
We also offer a configuration generation script:
# Generate a base configuration file
python generate-config.py --model meta-llama/Llama-2-7b-chat-hf > serve-config.yaml
The generated YAML is simple to understand and modify:
# serve-config.yaml
model_id: "meta-llama/Llama-2-7b-chat-hf"
name: "llama-2-chat"
accelerator_type: "A100_80G"
tensor_parallelism: 2
max_batch_size: 64
# Optional LoRA support
lora_config:
  weights_path: "s3://my-bucket/loras/"
  max_lora_rank: 8
You can then load this YAML file programmatically:
from ray import serve
from ray.serve.llm import VLLMDeployment, LLMConfig
# Load from config file
config = LLMConfig.from_yaml("serve-config.yaml")
deployment = VLLMDeployment.build(config)
...
# Deploy with OpenAI API compatibility using Ray Serve
from ray import serve
from ray.serve.llm import OpenAICompatRouter, LLMConfig, VLLMDeployment

llm_config = LLMConfig(
    model_id="meta-llama/Llama-2-7b-chat-hf",
    name="local-llama",  # Name shown in /v1/models
    accelerator_type="A100_80G",
    tensor_parallelism=2,
)

deployment = OpenAICompatRouter.bind(
    llm_deployments=[VLLMDeployment.build(llm_config)]
)
To serve the application, save the above code in a file named `app.py` and run the following command in your terminal:

serve run app:deployment
The following endpoints are supported out of the box:
- /v1/chat/completions: Chat interface (ChatGPT-style)
- /v1/completions: Text completion
- /v1/models: List available models
- /v1/models/{model}: Model information
You can then call these endpoints with the standard OpenAI client:
from openai import OpenAI

# Initialize client (the router exposes an OpenAI-compatible API under /v1)
client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

# Basic completion
response = client.chat.completions.create(
    model="local-llama",
    messages=[{"role": "user", "content": "Hello!"}],
)
# Using LoRA (the adapter ID is forwarded to the deployment via the request body)
response = client.chat.completions.create(
    model="local-llama",
    messages=[{"role": "user", "content": "Hello!"}],
    extra_body={"lora_id": "my-custom-lora"},
)
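To check which models the router exposes, you can also use the same client to query /v1/models; `client.models.list()` is the standard OpenAI client call for this, and the returned IDs correspond to the name fields in your LLMConfig:

# List the models registered with the router (backed by /v1/models)
for model in client.models.list():
    print(model.id)  # e.g. "local-llama"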
You can also deploy multiple models behind a single OpenAICompatRouter:
# Deploy with OpenAI API compatibility using Ray Serve
from ray import serve
from ray.serve.llm import OpenAICompatRouter, LLMConfig, VLLMDeployment

# Define configurations for different models
llm_7b = LLMConfig(
    model_id="meta-llama/Llama-2-7b-chat-hf",
    name="llama-7b",
    accelerator_type="A100_80G",
    tensor_parallelism=2,
)
llm_13b = LLMConfig(
    model_id="meta-llama/Llama-2-13b-chat-hf",
    name="llama-13b",
    accelerator_type="A100_80G",
    tensor_parallelism=4,
)
llm_70b = LLMConfig(
    model_id="meta-llama/Llama-2-70b-chat-hf",
    name="llama-70b",
    accelerator_type="A100_80G",
    tensor_parallelism=8,
)

# Create separate deployments for each model
deployment_7b = VLLMDeployment.build(llm_7b)
# Equivalent lower-level form:
# deployment_7b = VLLMDeployment.options(**llm_7b.get_deployment_params()).bind(llm_7b)
deployment_13b = VLLMDeployment.build(llm_13b)
deployment_70b = VLLMDeployment.build(llm_70b)

# Create a router that manages all deployments
deployment = OpenAICompatRouter.bind(
    llm_deployments=[deployment_7b, deployment_13b, deployment_70b]
)
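Once the router is running, clients select a model by the name set in its LLMConfig. For example, using the same OpenAI client setup as above (the base URL and API key are placeholders for the default local Serve address):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

# Send the same prompt to two of the deployed models and compare the answers
for model_name in ["llama-7b", "llama-70b"]:
    response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": "Summarize Ray Serve in one sentence."}],
    )
    print(model_name, "->", response.choices[0].message.content)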
Our LLM deployment APIs come with multi-LoRA support:
from typing import Optional

from ray import serve
from ray.serve.llm import VLLMDeployment, LLMConfig, LoRAConfig

@serve.deployment
class MyLLMApp:
    def __init__(self):
        # Add LoRA configuration to the LLMConfig
        self.llm_config = LLMConfig(
            model_id="meta-llama/Llama-3.1-8B-Instruct",
            accelerator_type="A100_80G",
            tensor_parallelism=2,
            lora_config=LoRAConfig(
                weights_path="s3://my-bucket/loras/",
                max_lora_rank=8,
                max_loras_per_replica=4,
            ),
        )
        self.deployment = VLLMDeployment.build(self.llm_config)

    async def __call__(self, prompt: str, lora_id: Optional[str] = None):
        return await self.deployment.predict(
            prompt,
            lora_id=lora_id,  # Pass through the LoRA ID if specified
        )

# Deploy the app
app = MyLLMApp.bind()
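As a quick local test, you can call this deployment through a Serve handle and pass a LoRA adapter ID. This is a sketch that assumes an adapter named "my-custom-lora" exists under the configured weights_path and that recent Ray Serve handle semantics apply:

from ray import serve

# Run the app and grab a handle to it
handle = serve.run(app)

# Call the base model, then the same model with a LoRA adapter applied
base_answer = handle.remote("Hello!").result()
lora_answer = handle.remote("Hello!", lora_id="my-custom-lora").result()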
JSON Mode ensures structured outputs from your LLM deployments. Here is an example:
from ray import serve
from ray.serve.llm import VLLMDeployment, LLMConfig, JSONMode

# Define a model with a JSON schema for its output
llm_config = LLMConfig(
    model_id="meta-llama/Llama-2-7b-chat-hf",
    name="llama-2-chat",
    accelerator_type="A100_80G",
    tensor_parallelism=2,
    json_mode=JSONMode(
        schema={
            "type": "object",
            "properties": {
                "sentiment": {"type": "string", "enum": ["positive", "negative", "neutral"]},
                "score": {"type": "number", "minimum": 0, "maximum": 1}
            },
            "required": ["sentiment", "score"]
        }
    )
)

@serve.deployment
class SentimentAnalyzer:
    def __init__(self):
        self.deployment = VLLMDeployment.build(llm_config)

    async def __call__(self, text: str):
        return await self.deployment.predict(
            f"Analyze: '{text}'",
            response_format={"type": "json"},
        )

# Deploy
app = SentimentAnalyzer.bind()

# Example usage:
# response = await app.remote("This product exceeded my expectations!")
# Returns: {"sentiment": "positive", "score": 0.95}
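Because the output is constrained to the schema above, downstream code can parse and validate it directly. A minimal sketch, assuming the deployment returns the JSON as a string:

import json

# Parse a JSON-mode response such as the one shown above
raw = '{"sentiment": "positive", "score": 0.95}'
result = json.loads(raw)
assert result["sentiment"] in {"positive", "negative", "neutral"}
assert 0 <= result["score"] <= 1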