@HDCharles · Created April 23, 2026
vLLM full-vocab logprob timing benchmark (Llama-3-8B)
import time
model_path = "/mnt/data/engine/HDCharles/hf_hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/8afb486c1db24fe5011ec46dfbe5b5dccdb575c2"
from vllm import LLM, SamplingParams
print("Loading model...")
t0 = time.time()
llm = LLM(model=model_path, gpu_memory_utilization=0.5, max_logprobs=-1)
tokenizer = llm.get_tokenizer()
vocab_size = len(tokenizer)
print(f"Model loaded in {time.time()-t0:.1f}s, vocab size: {vocab_size}")
prompt = "The quick brown fox jumps over the lazy dog and then sat down"
tokens = tokenizer.encode(prompt)[:12]
short_prompt = tokenizer.decode(tokens)
print(f"Prompt: {repr(short_prompt)}")
print(f"Prompt tokens: {len(tokens)}")
# Warmup
params_warmup = SamplingParams(max_tokens=1, prompt_logprobs=1)
llm.generate([short_prompt], params_warmup)
# Full vocab: prompt_logprobs=-1 returns all vocab_size logprobs
params = SamplingParams(
    max_tokens=1,
    prompt_logprobs=-1,
)
print(f"\nTiming full vocab ({vocab_size} tokens) logprob extraction for ~10 prompt tokens...")
N = 3
times = []
for i in range(N):
    t0 = time.time()
    outputs = llm.generate([short_prompt], params)
    elapsed = time.time() - t0
    times.append(elapsed)
    # the first prompt position has no logprob, so it comes back as None
    n_pos = sum(1 for x in outputs[0].prompt_logprobs if x is not None)
    print(f"  Run {i+1}: {elapsed:.3f}s, positions with logprobs: {n_pos}")
avg = sum(times) / N
tok_per_sec = 10 / avg  # ~10: the prompt yields len(tokens) - 1 positions with logprobs
print(f"\nAvg time for ~10 tokens: {avg:.3f}s")
print(f"Per-token rate: {tok_per_sec:.1f} tokens/sec")
print(f"Extrapolated for 245k tokens (WikiText-103 test): {245000 / tok_per_sec / 3600:.1f} hours")
print(f"Storage per token (float32): {vocab_size * 4 / 1024:.1f} KB")
print(f"Total storage for 245k tokens (float16): {245000 * vocab_size * 2 / 1e9:.1f} GB")
@HDCharles (Author)
  • ~1.1 tokens/sec — throughput is completely dominated by serializing the ~128k logprob floats per position and returning them through Python (see the sketch below for one way around this)
  • ~64 hours to process the WikiText-103 test set (245k tokens) on a single A100
  • ~62 GB of storage in float16

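A hedged sketch of one way to sidestep the Python serialization bottleneck; this is an assumption, not what the gist does. Running the model directly with Hugging Face transformers lets you take log-softmax on the GPU and write float16 tensors straight to disk, without materializing a per-token dict of 128k entries. The model id and output filename below are hypothetical; substitute the local snapshot path used in the script.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"  # hypothetical; use the local path above
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
model.eval()

ids = tok("The quick brown fox jumps over the lazy dog", return_tensors="pt").input_ids.to("cuda")
with torch.no_grad():
    logits = model(ids).logits                                # (1, seq_len, vocab_size), float16
logprobs = torch.log_softmax(logits.float(), dim=-1).half()   # softmax in fp32, store in fp16
torch.save(logprobs.cpu(), "wikitext_logprobs_chunk0.pt")     # hypothetical output file

Whether this actually recovers the gap would need its own timing run, but it keeps the full-vocab tensors on the tensor path end to end instead of round-tripping through Python objects.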