Here are the full supported engine configurations:
model_id: <HF model ID or local model path>
llm_engine: vllm
accelerator_type: <GPU type>
from time import perf_counter
from time import sleep
from contextlib import contextmanager
from typing import Callable, Iterator


@contextmanager
def catchtime() -> Iterator[Callable[[], float]]:
    """Context manager for timing a block of code.

    Yields a zero-argument callable that returns the elapsed time in
    seconds. While the ``with`` block is still running the callable
    returns 0.0 (t1 == t2 at that point); after the block exits, t2 is
    rebound to the current clock, and the yielded lambda — which closes
    over t2 by name, not by value — returns the block's duration.
    """
    t1 = t2 = perf_counter()
    yield lambda: t2 - t1
    # Rebinding t2 here updates what the already-yielded lambda sees.
    t2 = perf_counter()
import torch
import ray.data

class DataGenerator:
    def __init__(self, permute_config):
        device = torch.device("cuda")
        self.model = Model().to(device)
        self.config = permute_config

    def __call__(self, input):
import ray.data

class DataGenerator:
    def __init__(self, permute_config):
        device = torch.device("cuda")
        self.model = Model().to(device)
        self.config = permute_config

    def __call__(self, input):
        for test_input in self.permute(self.config, input):
import ray.data

class DataGenerator:
    def __init__(self, permute_config):
        device = torch.device("cuda")
        self.model = Model().to(device)
        self.config = permute_config

    def __call__(self, input):
        for test_input in self.permute(self.config, input):