speed.py
import ollama
import time
import re
from transformers import AutoTokenizer

MODEL = "r1-1776-500:671b"

QUESTIONS = [
    "Explain how a rocket engine works in simple terms.",
    "Can you tell me how fuel combustion happens inside the engine?",
    "What are the major challenges with cooling rocket engines?",
    "Describe how the nozzle shape affects the rocket's performance.",
    "How do engineers test rocket engines safely on Earth?",
    "What materials are commonly used to build rocket engines and why?",
    "Explain why liquid-fuel rockets differ from solid-fuel rockets.",
    "How does the engine control thrust during a launch?",
]

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf")

def preload_model(model: str):
    # Warm up the model so the first measured round does not include load time.
    print(f"Preloading model '{model}'...")
    _ = ollama.generate(model=model, prompt="Hello", stream=False)

def count_tokens(text: str) -> int:
    return len(tokenizer.encode(text, add_special_tokens=False))

def strip_think_section(text: str) -> str:
    # Remove the model's <think>...</think> reasoning block so it is not fed back into the context.
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

def generate_and_measure_streaming(model: str, prompt: str):
    # Stream a completion and measure time to first token (TTFT) and tokens per second (TPS).
    token_count = 0
    output = ""
    start_total = time.perf_counter()
    first_token_time = None

    for chunk in ollama.generate(model=model, prompt=prompt, stream=True):
        now = time.perf_counter()
        token = chunk.get("response", "")
        if token:
            if first_token_time is None:
                first_token_time = now
            token_count += 1  # each streamed chunk is counted as one token
            output += token
            print(token, end="", flush=True)

    end_total = time.perf_counter()

    # Durations
    total_elapsed = end_total - start_total
    streaming_elapsed = end_total - first_token_time if first_token_time else 0
    first_token_latency = (first_token_time - start_total) if first_token_time else None

    # TPS calculations
    TPS_incl_TTFT = token_count / total_elapsed if total_elapsed > 0 else 0
    TPS_stream_only = token_count / streaming_elapsed if streaming_elapsed > 0 else 0

    return {
        "output": output,
        "first_token_latency": first_token_latency,
        "total_elapsed": total_elapsed,
        "streaming_elapsed": streaming_elapsed,
        "tokens_streamed": token_count,
        "TPS_incl_TTFT": TPS_incl_TTFT,
        "TPS_stream_only": TPS_stream_only,
        "output_length_chars": len(output),
    }

def run_real_context_growth():
    preload_model(MODEL)
    conversation_history = ""

    for round_num, question in enumerate(QUESTIONS, start=1):
        conversation_history += f"\n\nUser: {question}"
        context_tokens = count_tokens(conversation_history)

        print(f"\n=== Round {round_num} ===")
        print(f"Context length: {len(conversation_history)} characters")
        print(f"Context tokens (real): {context_tokens} tokens\n")

        results = generate_and_measure_streaming(MODEL, conversation_history)

        # Strip <think> section before adding to context
        clean_output = strip_think_section(results["output"])
        conversation_history += f"\n\nAssistant: {clean_output}"

        print("\n\n--- Metrics ---")
        print(f"Time to first token: {results['first_token_latency']:.2f} seconds")
        print(f"Total time taken (with TTFT): {results['total_elapsed']:.2f} seconds")
        print(f"Streaming time (after first token): {results['streaming_elapsed']:.2f} seconds")
        print(f"Tokens streamed: {results['tokens_streamed']}")
        print(f"TPS including TTFT: {results['TPS_incl_TTFT']:.2f}")
        print(f"TPS streaming only: {results['TPS_stream_only']:.2f}")
        print(f"Output length: {results['output_length_chars']} characters")

if __name__ == "__main__":
    run_real_context_growth()