Deploy LLMs with full OpenAI API compatibility using Ray Serve primitives.
- 🚀 Automatic scaling and load balancing
- 🔄 Tensor parallelism for large models
- 🎯 JSON mode and function calling
- 📦 Multi-LoRA adapter support
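Once deployed, the app speaks the standard OpenAI protocol, so any OpenAI client can query it. A minimal client-side sketch, assuming a deployment is already serving at `http://localhost:8000/v1`; the model id `my-llm` is a placeholder:

```python
# Query a Ray Serve LLM deployment through the stock OpenAI client.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",  # Ray Serve route (placeholder)
    api_key="not-needed",  # unused unless you add auth in front of the app
)
response = client.chat.completions.create(
    model="my-llm",  # placeholder model id
    messages=[{"role": "user", "content": "What is Ray Serve?"}],
)
print(response.choices[0].message.content)
```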
A small context-manager utility for timing a block of code:

```python
from contextlib import contextmanager
from time import perf_counter, sleep
from typing import Callable, Iterator

@contextmanager
def catchtime() -> Iterator[Callable[[], float]]:
    # Yield a closure that reports the elapsed time once the block exits.
    t1 = t2 = perf_counter()
    yield lambda: t2 - t1
    t2 = perf_counter()
```
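Because the yielded lambda closes over `t2`, it returns the final elapsed time when called after the block exits:

```python
with catchtime() as elapsed:
    sleep(0.5)  # stand-in for real work
print(f"Took {elapsed():.3f}s")
```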
A callable class that runs a GPU model over permuted variants of each input batch; `Model` and the `permute` helper are assumed to be defined elsewhere in the project:

```python
import torch
import ray.data

class DataGenerator:
    def __init__(self, permute_config):
        # Load the model onto the GPU once per worker (Model is assumed
        # to be defined elsewhere).
        device = torch.device("cuda")
        self.model = Model().to(device)
        self.config = permute_config

    def __call__(self, input):
        # permute is assumed to yield one test input per configured variant.
        for test_input in self.permute(self.config, input):
            yield self.model(test_input)
```
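A sketch of how such a class could be wired into a Ray Data pipeline; the data source, config values, and resource numbers are placeholders, and the exact output handling depends on what the model returns:

```python
# Hypothetical wiring: run DataGenerator as a pool of GPU actors.
ds = ray.data.read_parquet("s3://my-bucket/inputs/")  # placeholder source

results = ds.map_batches(
    DataGenerator,
    fn_constructor_kwargs={"permute_config": {"n_variants": 4}},  # placeholder
    concurrency=2,  # two actor replicas
    num_gpus=1,     # one GPU per replica
)
print(results.take(1))
```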
A helper that pages through a repository's open GitHub issues and keeps those with "ray" in the title:

```python
import requests

def get_issues_with_ray_in_title(repo_name):
    issues = []
    page = 1
    # headers = {'Authorization': 'token YOUR_GITHUB_TOKEN'}
    while True:
        issues_url = f"https://api.github.com/repos/{repo_name}/issues?page={page}&per_page=100&state=open"
        response = requests.get(issues_url)  # , headers=headers
        if response.status_code == 200:
            page_issues = response.json()
            if not page_issues:  # an empty page means we've seen everything
                break
            issues.extend(i for i in page_issues if "ray" in i["title"].lower())
            page += 1
        else:
            break  # stop on errors (e.g., rate limiting)
    return issues
```
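Example call; the repo name is illustrative, and unauthenticated GitHub API requests are rate-limited to 60 per hour:

```python
ray_issues = get_issues_with_ray_in_title("ray-project/ray")
print(f"Found {len(ray_issues)} open issues with 'ray' in the title")
```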