speed.py
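Benchmarks streaming generation speed of an Ollama-served model as the conversation context grows across eight rounds, reporting time-to-first-token (TTFT) and tokens-per-second (TPS) for each round. Context size is counted with a real tokenizer rather than estimated from characters.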
import ollama
import time
import re

from transformers import AutoTokenizer

MODEL = "r1-1776-500:671b"

QUESTIONS = [
    "Explain how a rocket engine works in simple terms.",
    "Can you tell me how fuel combustion happens inside the engine?",
    "What are the major challenges with cooling rocket engines?",
    "Describe how the nozzle shape affects the rocket's performance.",
    "How do engineers test rocket engines safely on Earth?",
    "What materials are commonly used to build rocket engines and why?",
    "Explain why liquid-fuel rockets differ from solid-fuel rockets.",
    "How does the engine control thrust during a launch?",
]

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf")


def preload_model(model: str):
    """Issue a tiny generation so Ollama loads the model before timing starts."""
    print(f"Preloading model '{model}'...")
    _ = ollama.generate(model=model, prompt="Hello", stream=False)


def count_tokens(text: str) -> int:
    """Count tokens with the tokenizer instead of estimating from characters."""
    return len(tokenizer.encode(text, add_special_tokens=False))


def strip_think_section(text: str) -> str:
    """Drop the model's <think>...</think> block before reusing output as context."""
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()


def generate_and_measure_streaming(model: str, prompt: str):
    """Stream one generation and record latency and throughput metrics."""
    token_count = 0
    output = ""
    start_total = time.perf_counter()
    first_token_time = None

    for chunk in ollama.generate(model=model, prompt=prompt, stream=True):
        now = time.perf_counter()
        token = chunk.get("response", "")
        if token:
            if first_token_time is None:
                first_token_time = now
            token_count += 1  # each streamed chunk is treated as one token
            output += token
            print(token, end="", flush=True)

    end_total = time.perf_counter()

    # Durations
    total_elapsed = end_total - start_total
    streaming_elapsed = end_total - first_token_time if first_token_time else 0
    first_token_latency = (first_token_time - start_total) if first_token_time else None

    # TPS calculations
    TPS_incl_TTFT = token_count / total_elapsed if total_elapsed > 0 else 0
    TPS_stream_only = token_count / streaming_elapsed if streaming_elapsed > 0 else 0

    return {
        "output": output,
        "first_token_latency": first_token_latency,
        "total_elapsed": total_elapsed,
        "streaming_elapsed": streaming_elapsed,
        "tokens_streamed": token_count,
        "TPS_incl_TTFT": TPS_incl_TTFT,
        "TPS_stream_only": TPS_stream_only,
        "output_length_chars": len(output),
    }


def run_real_context_growth():
    """Replay the question list, growing the context each round, and report metrics."""
    preload_model(MODEL)
    conversation_history = ""

    for round_num, question in enumerate(QUESTIONS, start=1):
        conversation_history += f"\n\nUser: {question}"
        context_tokens = count_tokens(conversation_history)

        print(f"\n=== Round {round_num} ===")
        print(f"Context length: {len(conversation_history)} characters")
        print(f"Context tokens (real): {context_tokens} tokens\n")

        results = generate_and_measure_streaming(MODEL, conversation_history)

        # Strip <think> section before adding to context
        clean_output = strip_think_section(results["output"])
        conversation_history += f"\n\nAssistant: {clean_output}"

        print("\n\n--- Metrics ---")
        print(f"Time to first token: {results['first_token_latency']:.2f} seconds")
        print(f"Total time taken (with TTFT): {results['total_elapsed']:.2f} seconds")
        print(f"Streaming time (after first token): {results['streaming_elapsed']:.2f} seconds")
        print(f"Tokens streamed: {results['tokens_streamed']}")
        print(f"TPS including TTFT: {results['TPS_incl_TTFT']:.2f}")
        print(f"TPS streaming only: {results['TPS_stream_only']:.2f}")
        print(f"Output length: {results['output_length_chars']} characters")


if __name__ == "__main__":
    run_real_context_growth()
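A quick smoke test before committing to the full 671B run: a minimal sketch, assuming the ollama and transformers packages are installed, the script is saved as speed.py, and a lightweight model (here "llama3.2", which is an assumption, not part of the gist) has already been pulled locally. Swap in whatever `ollama list` shows on your machine.

# smoke_test.py -- minimal sketch; the model name below is a placeholder
from speed import generate_and_measure_streaming, preload_model

SMALL_MODEL = "llama3.2"  # hypothetical lightweight model; replace with one you have pulled
preload_model(SMALL_MODEL)
results = generate_and_measure_streaming(SMALL_MODEL, "Why is the sky blue?")
print(f"\nTTFT: {results['first_token_latency']:.2f}s, "
      f"TPS (stream only): {results['TPS_stream_only']:.2f}")

This exercises the same measurement path as the full benchmark, so any environment problem (missing model, tokenizer download, Ollama not running) surfaces in seconds rather than after a long generation.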