@egorsmkv
Created July 16, 2025 17:55
Vibe-coded evaluation: batched English→Ukrainian translation of the FLORES devtest split with a local causal LM, scored with SacreBLEU and timed for throughput.
import evaluate
import polars as pl
import time
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
# --- 1. SETUP ---
model_id = '/home/smlkw/en-uk-t/final-checkpoints/kulyk-en-uk'
print(f"Model ID: {model_id}")
hf_dataset = load_dataset("facebook/flores", 'all')
df_all = hf_dataset['devtest'].to_polars()
source_sentences = df_all['sentence_eng_Latn'].to_list()
references = [[row['sentence_ukr_Cyrl']] for row in df_all.iter_rows(named=True)]
print(f"Found {len(references)} sentences for translation.")
sacrebleu = evaluate.load("sacrebleu")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda:0",
    torch_dtype="bfloat16",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# --- 2. CONFIGURE TOKENIZER FOR BATCHING ---
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Left padding keeps the newly generated tokens aligned at the end of each sequence,
# which is what batched generation with decoder-only models requires
tokenizer.padding_side = "left"
# --- 3. BATCHED INFERENCE WITH SPEED MEASUREMENT --- 🚀
print("Starting batched inference...")
batch_size = 64
predictions = []
total_generated_tokens = 0
# Record start time
start_time = time.perf_counter()
for i in tqdm(range(0, len(source_sentences), batch_size), desc="Translating Batches"):
    batch_sentences = source_sentences[i:i + batch_size]
    prompts = [f"Translate the text to Ukrainian:\n{s}" for s in batch_sentences]
    messages = [[{"role": "user", "content": p}] for p in prompts]
    formatted_prompts = [
        tokenizer.apply_chat_template(m, tokenize=False, add_generation_prompt=True)
        for m in messages
    ]
    inputs = tokenizer(
        formatted_prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=2048,
    ).to(model.device)
    # Some tokenizers return token_type_ids, which generate() does not accept
    inputs.pop('token_type_ids', None)
    # Use torch.no_grad() for inference to save memory and computation
    with torch.no_grad():
        output = model.generate(
            **inputs,
            do_sample=False,
            repetition_penalty=1.05,
            max_new_tokens=2048,
        )
    # With left padding, every prompt occupies the first prompt_len positions,
    # so everything after that column is newly generated
    prompt_len = inputs['input_ids'].shape[1]
    generated_tokens = output[:, prompt_len:]
    # Count only non-padding tokens so the throughput figure is not inflated
    # by the padding that batched generation adds to shorter sequences
    total_generated_tokens += (generated_tokens != tokenizer.pad_token_id).sum().item()
    assistant_responses = tokenizer.batch_decode(
        generated_tokens, skip_special_tokens=True
    )
    predictions.extend(assistant_responses)
# Record end time and calculate final metrics
end_time = time.perf_counter()
total_time = end_time - start_time
num_sentences = len(source_sentences)
sentences_per_second = num_sentences / total_time
tokens_per_second = total_generated_tokens / total_time
print("\nInference complete.")
# --- 4. EVALUATION & RESULTS ---
print("\n--- Example Translations ---")
for i in range(min(5, len(predictions))):
    print(f"English: {source_sentences[i]}")
    print(f"Reference: {references[i][0]}")
    print(f"Prediction: {predictions[i].strip()}")
    print("-" * 20)
print("\nCalculating SacreBLEU score...")
results = sacrebleu.compute(predictions=predictions, references=references)
print("\n--- Final Score ---")
print(results)
# --- 5. PERFORMANCE METRICS ---
print("\n--- Performance Metrics ---")
print(f"Total Sentences: {num_sentences}")
print(f"Total Generated Tokens: {total_generated_tokens}")
print(f"Batch Size: {batch_size}")
print(f"Total Time: {total_time:.2f} seconds")
print(f"Throughput (sentences/sec): {sentences_per_second:.2f}")
print(f"Throughput (tokens/sec): {tokens_per_second:.2f}")