Deploy LLMs with full OpenAI API compatibility using Ray Serve primitives.
- 🚀 Automatic scaling and load balancing
- 🔄 Tensor parallelism for large models
- 🎯 JSON mode and function calling
- 📦 Multi-LoRA adapter support
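Once deployed, the app speaks the standard OpenAI protocol, so any OpenAI client can query it. A minimal client-side sketch, assuming a deployment is already serving at `http://localhost:8000/v1`; the model id `my-llm` is a placeholder:

```python
# Query a Ray Serve LLM deployment through the stock OpenAI client.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",  # Ray Serve route (placeholder)
    api_key="not-needed",  # unused unless you add auth in front of the app
)
response = client.chat.completions.create(
    model="my-llm",  # placeholder model id
    messages=[{"role": "user", "content": "What is Ray Serve?"}],
)
print(response.choices[0].message.content)
```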
A small context-manager utility for timing a block of code:

```python
from contextlib import contextmanager
from time import perf_counter, sleep
from typing import Callable, Iterator

@contextmanager
def catchtime() -> Iterator[Callable[[], float]]:
    # Yield a closure that reports the elapsed time once the block exits.
    t1 = t2 = perf_counter()
    yield lambda: t2 - t1
    t2 = perf_counter()
```
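Because the yielded lambda closes over `t2`, it returns the final elapsed time when called after the block exits:

```python
with catchtime() as elapsed:
    sleep(0.5)  # stand-in for real work
print(f"Took {elapsed():.3f}s")
```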
A callable class that runs a GPU model over permuted variants of each input batch; `Model` and the `permute` helper are assumed to be defined elsewhere in the project:

```python
import torch
import ray.data

class DataGenerator:
    def __init__(self, permute_config):
        # Load the model onto the GPU once per worker (Model is assumed
        # to be defined elsewhere).
        device = torch.device("cuda")
        self.model = Model().to(device)
        self.config = permute_config

    def __call__(self, input):
        # permute is assumed to yield one test input per configured variant.
        for test_input in self.permute(self.config, input):
            yield self.model(test_input)
```
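A sketch of how such a class could be wired into a Ray Data pipeline; the data source, config values, and resource numbers are placeholders, and the exact output handling depends on what the model returns:

```python
# Hypothetical wiring: run DataGenerator as a pool of GPU actors.
ds = ray.data.read_parquet("s3://my-bucket/inputs/")  # placeholder source

results = ds.map_batches(
    DataGenerator,
    fn_constructor_kwargs={"permute_config": {"n_variants": 4}},  # placeholder
    concurrency=2,  # two actor replicas
    num_gpus=1,     # one GPU per replica
)
print(results.take(1))
```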
A helper that pages through a repository's open GitHub issues and keeps those with "ray" in the title:

```python
import requests

def get_issues_with_ray_in_title(repo_name):
    issues = []
    page = 1
    # headers = {'Authorization': 'token YOUR_GITHUB_TOKEN'}
    while True:
        issues_url = f"https://api.github.com/repos/{repo_name}/issues?page={page}&per_page=100&state=open"
        response = requests.get(issues_url)  # , headers=headers
        if response.status_code == 200:
            page_issues = response.json()
            if not page_issues:  # an empty page means we've seen everything
                break
            issues.extend(i for i in page_issues if "ray" in i["title"].lower())
            page += 1
        else:
            break  # stop on errors (e.g., rate limiting)
    return issues
```
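Example call; the repo name is illustrative, and unauthenticated GitHub API requests are rate-limited to 60 per hour:

```python
ray_issues = get_issues_with_ray_in_title("ray-project/ray")
print(f"Found {len(ray_issues)} open issues with 'ray' in the title")
```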