Evaluation on Modal
# put your wandb api key in the modal secrets
# >modal run eval_latency_modal.py
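# (Note: one way to create that secret is with the Modal CLI, e.g.
#   modal secret create wandb-api-key WANDB_API_KEY=<your key>
# or via the Modal dashboard; the name must match the "wandb-api-key" secret
# referenced in modal.Secret.from_name() below. The exact CLI syntax may vary
# with your Modal version.)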
import time
import logging

import numpy as np
from rich.console import Console
from rich.table import Table

import modal
from modal import Image
from weave.scorers import (
    Scorer,
    ToxicityScorer,
    HallucinationScorer,
    ContextRelevanceScorer,
    RobustnessScorer,
    BiasScorer,
    CoherenceScorer,
)
logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)

GPU_TYPE = "L4"
SCORERS = [
    ToxicityScorer,
    HallucinationScorer,
    ContextRelevanceScorer,
    RobustnessScorer,
    BiasScorer,
    CoherenceScorer,
]

# Create Modal app
app = modal.App("latency-evaluation")

HF_CACHE_DIR = "/hf-cache"
WANDB_CACHE_DIR = "/wandb-cache"

# Create Modal image with required dependencies
image = (
    Image.debian_slim()
    .apt_install("git")
    .pip_install_from_requirements("eval_requirements.txt")
    .env({"HF_HUB_CACHE": HF_CACHE_DIR, "WANDB_CACHE_DIR": WANDB_CACHE_DIR})
)

# Persistent volumes for the Hugging Face and W&B caches
cache_volume = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)
wandb_volume = modal.Volume.from_name("wandb-cache", create_if_missing=True)
# around 100 tokens
base_piece = """
The quick brown fox jumps over the lazy dog.
This is a longer sentence to increase the token count.
We need to add more text here, so let's continue with some more words and phrases.
The sun shines brightly, illuminating the forest floor.
Birds sing melodies in the trees, a gentle breeze rustles the leaves.
A small stream meanders through the landscape, reflecting the blue sky above.
Another sentence to increase the token count a little bit more here and there.
"""
base_piece_tokens = 99  # Known token count of base_piece

INPUTS = [
    {"input": base_piece, "output": base_piece, "query": "This is a query", "context": base_piece, "length": base_piece_tokens},  # ~100 tokens
    {"input": base_piece, "output": base_piece * 5, "query": "This is a query", "context": base_piece * 5, "length": base_piece_tokens * 5},  # ~500 tokens
    {"input": base_piece, "output": base_piece * 10, "query": "This is a query", "context": base_piece * 10, "length": base_piece_tokens * 10},  # ~1000 tokens
    {"input": base_piece, "output": base_piece * 40, "query": "This is a query", "context": base_piece * 40, "length": base_piece_tokens * 40},  # ~4000 tokens
    {"input": base_piece, "output": base_piece * 72, "query": "This is a query", "context": base_piece * 72, "length": base_piece_tokens * 72},  # ~7000 tokens
]
def eval_speed(scorer: Scorer, inputs: dict, trials: int = 10):
    """
    Evaluate the speed of a scorer by averaging the time it takes to score the input `trials` times.
    """
    times = []
    for _ in range(trials):
        start_time = time.time()
        # filter inputs to only include the inputs that the scorer.score method accepts
        filtered_inputs = {k: v for k, v in inputs.items() if k in scorer.score.__annotations__}
        if "context" in filtered_inputs:
            filtered_inputs["output"] = "Dummy output"
        _ = scorer.score(**filtered_inputs)
        end_time = time.time()
        elapsed_time = (end_time - start_time) * 1000  # Convert to milliseconds
        times.append(elapsed_time)
    return np.mean(times)
@app.function(
    image=image,
    gpu=GPU_TYPE,
    secrets=[modal.Secret.from_name("wandb-api-key")],
    timeout=1800,
    volumes={HF_CACHE_DIR: cache_volume, WANDB_CACHE_DIR: wandb_volume},
)
def run_evaluation(scorer_class: type[Scorer], trials: int = 10, device: str = "cuda"):
    results = []
    # only pass `device` to scorers whose score method is device-aware
    if "device" in scorer_class.score.__annotations__:
        scorer = scorer_class(device=device)
    else:
        scorer = scorer_class()
    for single_input in INPUTS:
        # dummy pass to warm up the model
        _ = eval_speed(scorer, single_input, trials=1)
        elapsed_time = eval_speed(scorer, single_input, trials=trials)
        results.append({
            "scorer": scorer_class.__name__,
            "tokens": single_input["length"],
            "elapsed_time": elapsed_time,
        })
    return results
@app.local_entrypoint()
def main(trials: int = 20, device: str = "cuda"):
    results = []
    for result in run_evaluation.map(SCORERS, kwargs={"trials": trials, "device": device}):
        results.extend(result)

    # Create table with dynamic columns based on token counts
    table = Table(title=f"Speed Evaluation Results: {GPU_TYPE}")
    table.add_column("Scorer", style="cyan", no_wrap=True)
    token_counts = sorted(set(result["tokens"] for result in results))
    for count in token_counts:
        table.add_column(f"{count} tok(s)", justify="right", style="green")

    # Group results by scorer and add rows directly
    for scorer in set(result["scorer"] for result in results):
        scorer_times = {
            result["tokens"]: f"{result['elapsed_time']:.2f} ms"
            for result in results
            if result["scorer"] == scorer
        }
        table.add_row(scorer, *(scorer_times[count] for count in token_counts))

    with open(f"./latency_results/{GPU_TYPE}.txt", "w") as file:
        Console(file=file).print(table)
    print(f"Results saved to ./latency_results/{GPU_TYPE}.txt")