Deploy LLMs with full OpenAI API compatibility using Ray Serve primitives.
- Automatic scaling and load balancing
- Tensor parallelism for large models
- JSON mode and function calling
- LoRA adapter support
- Custom model composition
```python
from ray import serve
from ray.serve.llm import VLLMDeployment, LLMConfig


# Create a basic LLM deployment
@serve.deployment
class MyLLMApp:
    def __init__(self):
        self.llm_config = LLMConfig(
            model_id="meta-llama/Llama-3.1-8b-instruct",
            accelerator_type="A100_80G",
            tensor_parallelism=2,
        )
        deployment_params = self.llm_config.get_deployment_args()
        self.deployment = (
            VLLMDeployment
            .options(**deployment_params)
            .bind(base_config=self.llm_config)
        )

    async def __call__(self, prompt: str):
        return await self.deployment.predict(prompt)


# Deploy the app
app = MyLLMApp.bind()
```
Then you can save this as `app.py` and run it with:

```bash
serve run app:app
```
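If you prefer to stay in Python, you can also start the application with `serve.run` and query it through the returned handle. A minimal sketch, assuming a recent Ray version where `serve.run` returns a `DeploymentHandle`:

```python
# Alternative to the CLI: start the app from Python and query it via a handle.
# (Appended to the same script that defines `app = MyLLMApp.bind()` above.)
from ray import serve

handle = serve.run(app)  # deploys the application and returns a DeploymentHandle

# __call__ above accepts a plain prompt string
response = handle.remote("Write a haiku about Ray Serve.")
print(response.result())  # block until the generation finishes
```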
```python
# Deploy with OpenAI API compatibility using Ray Serve
from ray import serve
from ray.serve.llm import OpenAICompatibleDeployment, VLLMDeployment, LLMConfig

llm_config = LLMConfig(
    model_id="meta-llama/Llama-2-7b-chat-hf",
    name="local-llama",  # Name shown in /v1/models
    accelerator_type="A100_80G",
    tensor_parallel_degree=2,
)

deployment = OpenAICompatibleDeployment.bind(
    model_configs=[llm_config],
    llm_deployment=VLLMDeployment.bind(base_config=llm_config),
)
```
```bash
# Save the above code as `app.py` and run:
serve run app:deployment
```
The following endpoints are supported out of the box:
- /v1/chat/completions: Chat interface (ChatGPT-style)
- /v1/completions: Text completion
- /v1/models: List available models
- /v1/models/{model}: Model information
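For a quick sanity check you can hit these endpoints with plain HTTP before wiring up a client SDK. A minimal sketch using `requests`, assuming the server runs locally on the default port 8000 and the responses follow the OpenAI schema:

```python
import requests

BASE_URL = "http://localhost:8000"

# List the models registered with the router
print(requests.get(f"{BASE_URL}/v1/models").json())

# Send a chat completion request in the OpenAI format
resp = requests.post(
    f"{BASE_URL}/v1/chat/completions",
    json={
        "model": "local-llama",
        "messages": [{"role": "user", "content": "Hello!"}],
    },
)
print(resp.json()["choices"][0]["message"]["content"])
```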
You can then call these endpoints with the OpenAI client:
```python
from openai import OpenAI

# Initialize client pointed at the local server
# (the OpenAI SDK requires an API key, so pass a placeholder)
client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

# Basic completion
response = client.chat.completions.create(
    model="local-llama",
    messages=[{"role": "user", "content": "Hello!"}],
)

# Using LoRA: non-standard parameters are passed through `extra_body`
response = client.chat.completions.create(
    model="local-llama",
    messages=[{"role": "user", "content": "Hello!"}],
    extra_body={"lora_id": "my-custom-lora"},
)
```
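JSON mode and function calling, listed among the features above, are typically requested through the standard `response_format` and `tools` fields of the chat completions API. The sketch below reuses the `client` from the previous snippet and assumes the deployment forwards these fields to the underlying engine:

```python
# JSON mode: ask the model to return a JSON object
response = client.chat.completions.create(
    model="local-llama",
    messages=[{"role": "user", "content": "List three Ray libraries as JSON."}],
    response_format={"type": "json_object"},
)

# Function calling: describe a tool the model may choose to call
response = client.chat.completions.create(
    model="local-llama",
    messages=[{"role": "user", "content": "What is the weather in Paris?"}],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Look up the current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ],
)
print(response.choices[0].message.tool_calls)
```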
Our LLM deployment APIs come with LoRA support:
```python
from typing import Optional

from ray import serve
from ray.serve.llm import VLLMDeployment, LLMConfig, LoRAConfig


@serve.deployment
class MyLLMApp:
    def __init__(self):
        # Add LoRA configuration to the LLMConfig
        self.llm_config = LLMConfig(
            model_id="meta-llama/Llama-3.1-8b-instruct",
            accelerator_type="A100_80G",
            tensor_parallelism=2,
            lora_config=LoRAConfig(
                weights_path="s3://my-bucket/loras/",
                max_lora_rank=8,
                max_loras_per_replica=4,
            ),
        )
        deployment_params = self.llm_config.get_deployment_args()
        self.deployment = (
            VLLMDeployment
            .options(**deployment_params)
            .bind(base_config=self.llm_config)
        )

    async def __call__(self, prompt: str, lora_id: Optional[str] = None):
        return await self.deployment.predict(
            prompt,
            lora_id=lora_id,  # Pass through LoRA ID if specified
        )


# Deploy the app
app = MyLLMApp.bind()
```
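As with the basic app, you can exercise the LoRA-enabled deployment from Python through a handle. A minimal sketch, where `"my-custom-lora"` is a placeholder adapter name assumed to exist under the configured `weights_path`:

```python
from ray import serve

handle = serve.run(app)

# Query the base model
print(handle.remote("Summarize Ray Serve in one sentence.").result())

# Query with a LoRA adapter ("my-custom-lora" is a placeholder adapter name)
print(
    handle.remote(
        "Summarize Ray Serve in one sentence.",
        lora_id="my-custom-lora",
    ).result()
)
```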