Deploy LLMs with full OpenAI API compatibility using Ray Serve primitives.
- Automatic scaling and load balancing
- Tensor parallelism for large models
- JSON mode and function calling
- LoRA adapter support
- Custom model composition
```python
from ray import serve
from ray.serve.llm import VLLMDeployment, LLMConfig


# Create a basic LLM deployment
@serve.deployment
class MyLLMApp:
    def __init__(self):
        self.llm_config = LLMConfig(
            model_id="meta-llama/Llama-3.1-8b-instruct",
            accelerator_type="A100_80G",
            tensor_parallelism=2,
        )
        deployment_params = self.llm_config.get_deployment_args()
        self.deployment = (
            VLLMDeployment
            .options(**deployment_params)
            .bind(base_config=self.llm_config)
        )

    async def __call__(self, prompt: str):
        return await self.deployment.predict(prompt)


# Deploy the app
app = MyLLMApp.bind()
```
Then you can save this as `app.py` and run it with:

```bash
serve run app:app
```
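If you prefer to stay in Python, you can also start the application with `serve.run` and query it through the returned handle. A minimal sketch, assuming a recent Ray version where `serve.run` returns a `DeploymentHandle`:

```python
# Alternative to the CLI: start the app from Python and query it via a handle.
# (Appended to the same script that defines `app = MyLLMApp.bind()` above.)
from ray import serve

handle = serve.run(app)  # deploys the application and returns a DeploymentHandle

# __call__ above accepts a plain prompt string
response = handle.remote("Write a haiku about Ray Serve.")
print(response.result())  # block until the generation finishes
```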
```python
# Deploy with OpenAI API compatibility using Ray Serve
from ray import serve
from ray.serve.llm import OpenAICompatibleDeployment, VLLMDeployment, LLMConfig

llm_config = LLMConfig(
    model_id="meta-llama/Llama-2-7b-chat-hf",
    name="local-llama",  # Name shown in /v1/models
    accelerator_type="A100_80G",
    tensor_parallel_degree=2,
)

deployment = OpenAICompatibleDeployment.bind(
    model_configs=[llm_config],
    llm_deployment=VLLMDeployment.bind(base_config=llm_config),
)
```
```bash
# Save the above code as `app.py` and run:
serve run app:deployment
```
The following endpoints are supported out of the box:
- /v1/chat/completions: Chat interface (ChatGPT-style)
- /v1/completions: Text completion
- /v1/models: List available models
- /v1/models/{model}: Model information
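For a quick sanity check you can hit these endpoints with plain HTTP before wiring up a client SDK. A minimal sketch using `requests`, assuming the server runs locally on the default port 8000 and the responses follow the OpenAI schema:

```python
import requests

BASE_URL = "http://localhost:8000"

# List the models registered with the router
print(requests.get(f"{BASE_URL}/v1/models").json())

# Send a chat completion request in the OpenAI format
resp = requests.post(
    f"{BASE_URL}/v1/chat/completions",
    json={
        "model": "local-llama",
        "messages": [{"role": "user", "content": "Hello!"}],
    },
)
print(resp.json()["choices"][0]["message"]["content"])
```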
You can then call these endpoints with the OpenAI client:
```python
from openai import OpenAI

# Initialize client pointed at the local server
# (the OpenAI SDK requires an API key, so pass a placeholder)
client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

# Basic completion
response = client.chat.completions.create(
    model="local-llama",
    messages=[{"role": "user", "content": "Hello!"}],
)

# Using LoRA: non-standard parameters are passed through `extra_body`
response = client.chat.completions.create(
    model="local-llama",
    messages=[{"role": "user", "content": "Hello!"}],
    extra_body={"lora_id": "my-custom-lora"},
)
```
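JSON mode and function calling, listed among the features above, are typically requested through the standard `response_format` and `tools` fields of the chat completions API. The sketch below reuses the `client` from the previous snippet and assumes the deployment forwards these fields to the underlying engine:

```python
# JSON mode: ask the model to return a JSON object
response = client.chat.completions.create(
    model="local-llama",
    messages=[{"role": "user", "content": "List three Ray libraries as JSON."}],
    response_format={"type": "json_object"},
)

# Function calling: describe a tool the model may choose to call
response = client.chat.completions.create(
    model="local-llama",
    messages=[{"role": "user", "content": "What is the weather in Paris?"}],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Look up the current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ],
)
print(response.choices[0].message.tool_calls)
```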
Our LLM deployment APIs come with LoRA support:
```python
from typing import Optional

from ray import serve
from ray.serve.llm import VLLMDeployment, LLMConfig, LoRAConfig


@serve.deployment
class MyLLMApp:
    def __init__(self):
        # Add LoRA configuration to the LLMConfig
        self.llm_config = LLMConfig(
            model_id="meta-llama/Llama-3.1-8b-instruct",
            accelerator_type="A100_80G",
            tensor_parallelism=2,
            lora_config=LoRAConfig(
                weights_path="s3://my-bucket/loras/",
                max_lora_rank=8,
                max_loras_per_replica=4,
            ),
        )
        deployment_params = self.llm_config.get_deployment_args()
        self.deployment = (
            VLLMDeployment
            .options(**deployment_params)
            .bind(base_config=self.llm_config)
        )

    async def __call__(self, prompt: str, lora_id: Optional[str] = None):
        return await self.deployment.predict(
            prompt,
            lora_id=lora_id,  # Pass through LoRA ID if specified
        )


# Deploy the app
app = MyLLMApp.bind()
```
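As with the basic app, you can exercise the LoRA-enabled deployment from Python through a handle. A minimal sketch, where `"my-custom-lora"` is a placeholder adapter name assumed to exist under the configured `weights_path`:

```python
from ray import serve

handle = serve.run(app)

# Query the base model
print(handle.remote("Summarize Ray Serve in one sentence.").result())

# Query with a LoRA adapter ("my-custom-lora" is a placeholder adapter name)
print(
    handle.remote(
        "Summarize Ray Serve in one sentence.",
        lora_id="my-custom-lora",
    ).result()
)
```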