speed.py
import ollama
import time
import re
from transformers import AutoTokenizer

MODEL = "r1-1776-500:671b"

QUESTIONS = [
    "Explain how a rocket engine works in simple terms.",
    "Can you tell me how fuel combustion happens inside the engine?",
    "What are the major challenges with cooling rocket engines?",
    "Describe how the nozzle shape affects the rocket's performance.",
    "How do engineers test rocket engines safely on Earth?",
    "What materials are commonly used to build rocket engines and why?",
    "Explain why liquid-fuel rockets differ from solid-fuel rockets.",
    "How does the engine control thrust during a launch?",
]

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf")

def preload_model(model: str):
    # Warm up the model so the first measured round does not include load time.
    print(f"Preloading model '{model}'...")
    _ = ollama.generate(model=model, prompt="Hello", stream=False)

def count_tokens(text: str) -> int:
    return len(tokenizer.encode(text, add_special_tokens=False))

def strip_think_section(text: str) -> str:
    # Remove the model's <think>...</think> reasoning block so it is not fed back into the context.
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

def generate_and_measure_streaming(model: str, prompt: str):
    # Stream a completion and measure time to first token (TTFT) and tokens per second (TPS).
    token_count = 0
    output = ""
    start_total = time.perf_counter()
    first_token_time = None

    for chunk in ollama.generate(model=model, prompt=prompt, stream=True):
        now = time.perf_counter()
        token = chunk.get("response", "")
        if token:
            if first_token_time is None:
                first_token_time = now
            token_count += 1  # each streamed chunk is counted as one token
            output += token
            print(token, end="", flush=True)

    end_total = time.perf_counter()

    # Durations
    total_elapsed = end_total - start_total
    streaming_elapsed = end_total - first_token_time if first_token_time else 0
    first_token_latency = (first_token_time - start_total) if first_token_time else None

    # TPS calculations
    TPS_incl_TTFT = token_count / total_elapsed if total_elapsed > 0 else 0
    TPS_stream_only = token_count / streaming_elapsed if streaming_elapsed > 0 else 0

    return {
        "output": output,
        "first_token_latency": first_token_latency,
        "total_elapsed": total_elapsed,
        "streaming_elapsed": streaming_elapsed,
        "tokens_streamed": token_count,
        "TPS_incl_TTFT": TPS_incl_TTFT,
        "TPS_stream_only": TPS_stream_only,
        "output_length_chars": len(output),
    }

def run_real_context_growth():
    preload_model(MODEL)
    conversation_history = ""

    for round_num, question in enumerate(QUESTIONS, start=1):
        conversation_history += f"\n\nUser: {question}"
        context_tokens = count_tokens(conversation_history)

        print(f"\n=== Round {round_num} ===")
        print(f"Context length: {len(conversation_history)} characters")
        print(f"Context tokens (real): {context_tokens} tokens\n")

        results = generate_and_measure_streaming(MODEL, conversation_history)

        # Strip <think> section before adding to context
        clean_output = strip_think_section(results["output"])
        conversation_history += f"\n\nAssistant: {clean_output}"

        print("\n\n--- Metrics ---")
        print(f"Time to first token: {results['first_token_latency']:.2f} seconds")
        print(f"Total time taken (with TTFT): {results['total_elapsed']:.2f} seconds")
        print(f"Streaming time (after first token): {results['streaming_elapsed']:.2f} seconds")
        print(f"Tokens streamed: {results['tokens_streamed']}")
        print(f"TPS including TTFT: {results['TPS_incl_TTFT']:.2f}")
        print(f"TPS streaming only: {results['TPS_stream_only']:.2f}")
        print(f"Output length: {results['output_length_chars']} characters")

if __name__ == "__main__":
    run_real_context_growth()