@tcapelle
Last active January 13, 2025 20:41
Evaluation on Modal
# Put your Weights & Biases API key in a Modal secret named "wandb-api-key".
# Run with: modal run eval_latency_modal.py
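# Example setup (a sketch; flag syntax may differ across Modal CLI versions):
#   modal secret create wandb-api-key WANDB_API_KEY=<your-key>
#   modal run eval_latency_modal.py --trials 20 --device cuda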
import os
import time
import logging
import numpy as np
from rich.console import Console
from rich.table import Table
import modal
from modal import Image
from weave.scorers import (
    Scorer,
    ToxicityScorer,
    HallucinationScorer,
    ContextRelevanceScorer,
    RobustnessScorer,
    BiasScorer,
    CoherenceScorer,
)
logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)
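# GPU to benchmark on; other Modal GPU types (e.g. "T4", "A10G", "A100") can be substituted.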
GPU_TYPE = "L4"
SCORERS = [
    ToxicityScorer,
    HallucinationScorer,
    ContextRelevanceScorer,
    RobustnessScorer,
    BiasScorer,
    CoherenceScorer,
]
# Create Modal app
app = modal.App("latency-evaluation")
HF_CACHE_DIR = "/hf-cache"
WANDB_CACHE_DIR = "/wandb-cache"
# Create Modal image with required dependencies
image = (
    Image.debian_slim()
    .apt_install("git")
    .pip_install_from_requirements("eval_requirements.txt")
    .env({"HF_HUB_CACHE": HF_CACHE_DIR, "WANDB_CACHE_DIR": WANDB_CACHE_DIR})
)
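# Persistent volumes so Hugging Face model weights and the W&B cache survive between runs
# instead of being re-downloaded in every container.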
cache_volume = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)
wandb_volume = modal.Volume.from_name("wandb-cache", create_if_missing=True)
# around 100 tokens
base_piece = """
The quick brown fox jumps over the lazy dog.
This is a longer sentence to increase the token count.
We need to add more text here, so let's continue with some more words and phrases.
The sun shines brightly, illuminating the forest floor.
Birds sing melodies in the trees, a gentle breeze rustles the leaves.
A small stream meanders through the landscape, reflecting the blue sky above.
Another sentence to increase the token count a little bit more here and there.
"""
base_piece_tokens = 99 # Known token count
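# Each entry repeats base_piece to scale the output/context length; "length" records the
# approximate token count used as a column header in the results table.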
INPUTS = [
    {"input": base_piece, "output": base_piece, "query": "This is a query", "context": base_piece, "length": base_piece_tokens},  # ~100 tokens
    {"input": base_piece, "output": base_piece * 5, "query": "This is a query", "context": base_piece * 5, "length": base_piece_tokens * 5},  # ~500 tokens
    {"input": base_piece, "output": base_piece * 10, "query": "This is a query", "context": base_piece * 10, "length": base_piece_tokens * 10},  # ~1000 tokens
    {"input": base_piece, "output": base_piece * 40, "query": "This is a query", "context": base_piece * 40, "length": base_piece_tokens * 40},  # ~4000 tokens
    {"input": base_piece, "output": base_piece * 72, "query": "This is a query", "context": base_piece * 72, "length": base_piece_tokens * 72},  # ~7100 tokens
]
def eval_speed(scorer: Scorer, inputs: dict, trials: int = 10):
    """
    Evaluate the speed of a scorer by averaging the time it takes to score the input `trials` times.
    """
    times = []
    for _ in range(trials):
        start_time = time.time()
        # filter inputs to only include the arguments that the scorer.score method accepts
        filtered_inputs = {k: v for k, v in inputs.items() if k in scorer.score.__annotations__}
        if "context" in filtered_inputs:
            filtered_inputs["output"] = "Dummy output"
        _ = scorer.score(**filtered_inputs)
        end_time = time.time()
        elapsed_time = (end_time - start_time) * 1000  # Convert to milliseconds
        times.append(elapsed_time)
    return np.mean(times)
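# Usage sketch: given an instantiated scorer,
#   mean_ms = eval_speed(scorer, INPUTS[0], trials=5)
# returns the mean latency in milliseconds across the trials.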
@app.function(
    image=image,
    gpu=GPU_TYPE,
    secrets=[modal.Secret.from_name("wandb-api-key")],
    timeout=1800,
    volumes={HF_CACHE_DIR: cache_volume, WANDB_CACHE_DIR: wandb_volume},
)
def run_evaluation(scorer_class: Scorer, trials: int = 10, device: str = "cuda"):
    results = []
    if "device" in scorer_class.score.__annotations__:
        scorer = scorer_class(device=device)
    else:
        scorer = scorer_class()
    for single_input in INPUTS:
        # dummy pass to warm up the model
        _ = eval_speed(scorer, single_input, trials=1)
        elapsed_time = eval_speed(scorer, single_input, trials=trials)
        results.append({
            "scorer": scorer_class.__name__,
            "tokens": single_input["length"],
            "elapsed_time": elapsed_time,
        })
    return results
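# main() fans out one Modal container per scorer class via run_evaluation.map(), then collects
# the per-scorer latencies locally and renders them as a rich table.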
@app.local_entrypoint()
def main(trials: int = 20, device: str = "cuda"):
    results = []
    for result in run_evaluation.map(SCORERS, kwargs={"trials": trials, "device": device}):
        results.extend(result)

    # Create table with dynamic columns based on token counts
    table = Table(title=f"Speed Evaluation Results: {GPU_TYPE}")
    table.add_column("Scorer", style="cyan", no_wrap=True)
    token_counts = sorted(set(result["tokens"] for result in results))
    for count in token_counts:
        table.add_column(f"{count} tok(s)", justify="right", style="green")

    # Group results by scorer and add rows directly
    for scorer in set(result["scorer"] for result in results):
        scorer_times = {
            result["tokens"]: f"{result['elapsed_time']:.2f} ms"
            for result in results
            if result["scorer"] == scorer
        }
        table.add_row(scorer, *(scorer_times[count] for count in token_counts))

    # make sure the output directory exists before writing
    os.makedirs("./latency_results", exist_ok=True)
    with open(f"./latency_results/{GPU_TYPE}.txt", "w") as file:
        Console(file=file).print(table)
    print(f"Results saved to ./latency_results/{GPU_TYPE}.txt")