Evaluation on Modal
# put your wandb api key in the modal secrets
# >modal run eval_latency_modal.py
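# (Note: one way to create that secret is with the Modal CLI, e.g.
#   modal secret create wandb-api-key WANDB_API_KEY=<your key>
# or via the Modal dashboard; the name must match the "wandb-api-key" secret
# referenced in modal.Secret.from_name() below. The exact CLI syntax may vary
# with your Modal version.)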
import time
import logging

import numpy as np
from rich.console import Console
from rich.table import Table

import modal
from modal import Image
from weave.scorers import (
    Scorer,
    ToxicityScorer,
    HallucinationScorer,
    ContextRelevanceScorer,
    RobustnessScorer,
    BiasScorer,
    CoherenceScorer,
)
logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)

GPU_TYPE = "L4"
SCORERS = [
    ToxicityScorer,
    HallucinationScorer,
    ContextRelevanceScorer,
    RobustnessScorer,
    BiasScorer,
    CoherenceScorer,
]

# Create Modal app
app = modal.App("latency-evaluation")

HF_CACHE_DIR = "/hf-cache"
WANDB_CACHE_DIR = "/wandb-cache"

# Create Modal image with required dependencies
image = (
    Image.debian_slim()
    .apt_install("git")
    .pip_install_from_requirements("eval_requirements.txt")
    .env({"HF_HUB_CACHE": HF_CACHE_DIR, "WANDB_CACHE_DIR": WANDB_CACHE_DIR})
)

# Persistent volumes for the Hugging Face and W&B caches
cache_volume = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)
wandb_volume = modal.Volume.from_name("wandb-cache", create_if_missing=True)
# around 100 tokens
base_piece = """
The quick brown fox jumps over the lazy dog.
This is a longer sentence to increase the token count.
We need to add more text here, so let's continue with some more words and phrases.
The sun shines brightly, illuminating the forest floor.
Birds sing melodies in the trees, a gentle breeze rustles the leaves.
A small stream meanders through the landscape, reflecting the blue sky above.
Another sentence to increase the token count a little bit more here and there.
"""
base_piece_tokens = 99  # Known token count of base_piece

INPUTS = [
    {"input": base_piece, "output": base_piece, "query": "This is a query", "context": base_piece, "length": base_piece_tokens},  # ~100 tokens
    {"input": base_piece, "output": base_piece * 5, "query": "This is a query", "context": base_piece * 5, "length": base_piece_tokens * 5},  # ~500 tokens
    {"input": base_piece, "output": base_piece * 10, "query": "This is a query", "context": base_piece * 10, "length": base_piece_tokens * 10},  # ~1000 tokens
    {"input": base_piece, "output": base_piece * 40, "query": "This is a query", "context": base_piece * 40, "length": base_piece_tokens * 40},  # ~4000 tokens
    {"input": base_piece, "output": base_piece * 72, "query": "This is a query", "context": base_piece * 72, "length": base_piece_tokens * 72},  # ~7000 tokens
]
def eval_speed(scorer: Scorer, inputs: dict, trials: int = 10):
    """
    Evaluate the speed of a scorer by averaging the time it takes to score the input `trials` times.
    """
    times = []
    for _ in range(trials):
        start_time = time.time()
        # filter inputs to only include the inputs that the scorer.score method accepts
        filtered_inputs = {k: v for k, v in inputs.items() if k in scorer.score.__annotations__}
        if "context" in filtered_inputs:
            filtered_inputs["output"] = "Dummy output"
        _ = scorer.score(**filtered_inputs)
        end_time = time.time()
        elapsed_time = (end_time - start_time) * 1000  # Convert to milliseconds
        times.append(elapsed_time)
    return np.mean(times)
@app.function(
    image=image,
    gpu=GPU_TYPE,
    secrets=[modal.Secret.from_name("wandb-api-key")],
    timeout=1800,
    volumes={HF_CACHE_DIR: cache_volume, WANDB_CACHE_DIR: wandb_volume},
)
def run_evaluation(scorer_class: type[Scorer], trials: int = 10, device: str = "cuda"):
    results = []
    # only pass `device` to scorers whose score method is device-aware
    if "device" in scorer_class.score.__annotations__:
        scorer = scorer_class(device=device)
    else:
        scorer = scorer_class()
    for single_input in INPUTS:
        # dummy pass to warm up the model
        _ = eval_speed(scorer, single_input, trials=1)
        elapsed_time = eval_speed(scorer, single_input, trials=trials)
        results.append({
            "scorer": scorer_class.__name__,
            "tokens": single_input["length"],
            "elapsed_time": elapsed_time,
        })
    return results
@app.local_entrypoint()
def main(trials: int = 20, device: str = "cuda"):
    results = []
    for result in run_evaluation.map(SCORERS, kwargs={"trials": trials, "device": device}):
        results.extend(result)

    # Create table with dynamic columns based on token counts
    table = Table(title=f"Speed Evaluation Results: {GPU_TYPE}")
    table.add_column("Scorer", style="cyan", no_wrap=True)
    token_counts = sorted(set(result["tokens"] for result in results))
    for count in token_counts:
        table.add_column(f"{count} tok(s)", justify="right", style="green")

    # Group results by scorer and add rows directly
    for scorer in set(result["scorer"] for result in results):
        scorer_times = {
            result["tokens"]: f"{result['elapsed_time']:.2f} ms"
            for result in results
            if result["scorer"] == scorer
        }
        table.add_row(scorer, *(scorer_times[count] for count in token_counts))

    with open(f"./latency_results/{GPU_TYPE}.txt", "w") as file:
        Console(file=file).print(table)
    print(f"Results saved to ./latency_results/{GPU_TYPE}.txt")