vLLM full-vocab logprob timing benchmark (Llama-3-8B)
import time

from vllm import LLM, SamplingParams

model_path = "/mnt/data/engine/HDCharles/hf_hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/8afb486c1db24fe5011ec46dfbe5b5dccdb575c2"

print("Loading model...")
t0 = time.time()
# max_logprobs=-1 lifts the default cap so prompt_logprobs can cover the full vocabulary
llm = LLM(model=model_path, gpu_memory_utilization=0.5, max_logprobs=-1)
tokenizer = llm.get_tokenizer()
vocab_size = len(tokenizer)
print(f"Model loaded in {time.time()-t0:.1f}s, vocab size: {vocab_size}")

prompt = "The quick brown fox jumps over the lazy dog and then sat down"
tokens = tokenizer.encode(prompt)[:12]
short_prompt = tokenizer.decode(tokens)
print(f"Prompt: {repr(short_prompt)}")
print(f"Prompt tokens: {len(tokens)}")

# Warmup request so first-run initialization overhead is not included in the timing
params_warmup = SamplingParams(max_tokens=1, prompt_logprobs=1)
llm.generate([short_prompt], params_warmup)

# Full vocab: prompt_logprobs=-1 returns all vocab_size logprobs at each prompt position
params = SamplingParams(
    max_tokens=1,
    prompt_logprobs=-1,
)

print(f"\nTiming full vocab ({vocab_size} tokens) logprob extraction for ~10 prompt tokens...")
N = 3
times = []
for i in range(N):
    t0 = time.time()
    outputs = llm.generate([short_prompt], params)
    elapsed = time.time() - t0
    times.append(elapsed)
    # The first prompt position has no logprob (None), so count only populated positions
    n_pos = sum(1 for x in outputs[0].prompt_logprobs if x is not None)
    print(f" Run {i+1}: {elapsed:.3f}s, positions with logprobs: {n_pos}")

avg = sum(times) / N
tok_per_sec = 10 / avg  # rough per-token rate, treating each run as ~10 scored prompt positions
print(f"\nAvg time for ~10 tokens: {avg:.3f}s")
print(f"Per-token rate: {tok_per_sec:.1f} tokens/sec")
print(f"Extrapolated for 245k tokens (WikiText-103 test): {245000 / tok_per_sec / 3600:.1f} hours")
print(f"Storage per token (float32): {vocab_size * 4 / 1024:.1f} KB")
print(f"Total storage for 245k tokens (float16): {245000 * vocab_size * 2 / 1e9:.1f} GB")
Author (HDCharles) commented on Apr 23, 2026:
- ~1.1 tokens/sec: runtime is completely dominated by serializing and returning ~128k floats per position back through Python
- ~64 hours to process the WikiText-103 test set (245k tokens) on a single A100
- ~62 GB of storage in float16
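As a quick sanity check of those figures (assuming the Llama-3 vocabulary size of 128,256 and the measured ~1.1 tokens/sec), the arithmetic works out as follows:

# Rough check of the numbers above. vocab_size = 128,256 is assumed from the
# Llama-3 tokenizer; 245k tokens is the WikiText-103 test set size.
vocab_size = 128_256
n_tokens = 245_000
tok_per_sec = 1.1  # measured rate from the runs above

hours = n_tokens / tok_per_sec / 3600         # ~61.9 h, in the ballpark of the ~64 h quoted
storage_gb = n_tokens * vocab_size * 2 / 1e9  # float16 = 2 bytes/value -> ~62.8 GB
print(f"{hours:.1f} hours, {storage_gb:.1f} GB")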