Skip to content

Instantly share code, notes, and snippets.

@sammcj
Last active April 4, 2026 09:11
Show Gist options
  • Select an option

  • Save sammcj/f5f016b942a5e51fde0454b90ab4ec2b to your computer and use it in GitHub Desktop.

Select an option

Save sammcj/f5f016b942a5e51fde0454b90ab4ec2b to your computer and use it in GitHub Desktop.
MLX MTP Benchmark
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "mlx>=0.31.1",
# "mlx-lm @ git+https://github.com/AirRunner/mlx-lm.git@feat/mtp-native",
# ]
# ///
"""MTP vs non-MTP benchmark: measure prefill and generation separately.
Usage:
uv run mtp_bench.py <model_path>
"""
import sys
import time
import mlx.core as mx
from mlx_lm import load
from mlx_lm.generate import stream_generate
from mlx_lm.sample_utils import make_sampler
# Path to the MLX model (local directory or hub repo), from the first CLI arg.
MODEL_PATH: str = sys.argv[1]
NUM_RUNS: int = 4 # 1 warmup + 3 measured
# Cap on generated tokens per run so every run does comparable work.
MAX_GEN_TOKENS: int = 100
# Long prompt for measurable prefill (~600 tokens)
LONG_PROMPT: str = """You are a helpful assistant. Please analyze the following passage and provide a brief summary.
The history of artificial intelligence began in antiquity, with myths, stories and rumors of artificial beings endowed with intelligence or consciousness by master craftsmen. The seeds of modern AI were planted by philosophers who attempted to describe the process of human thinking as the mechanical manipulation of symbols. This work culminated in the invention of the programmable digital computer in the 1940s, a machine based on the abstract essence of mathematical reasoning. This device and the ideas behind it inspired a handful of scientists to begin seriously discussing the possibility of building an electronic brain.
The field of AI research was born at a workshop at Dartmouth College in 1956. The attendees became the founders and leaders of AI research. They and their students produced programs that the press described as astonishing: computers were learning checkers strategies, solving word problems in algebra, proving logical theorems and speaking English. By the middle of the 1960s, research in the U.S. was heavily funded by the Department of Defense and laboratories had been established around the world.
AI's founders were optimistic about the future: Herbert Simon predicted, "machines will be capable, within twenty years, of doing any work a man can do". Marvin Minsky agreed, writing, "within a generation the problem of creating artificial intelligence will substantially be solved". They failed to recognize the difficulty of some of the remaining tasks. Progress slowed and in 1974, in response to the criticism of Sir James Lighthill and ongoing pressure from the US Congress to fund more productive projects, both the U.S. and British governments cut off exploratory research in AI. The next few years would later be called an AI winter, a period when obtaining funding for AI projects was difficult.
In the early 1980s, AI research was revived by the commercial success of expert systems, a form of AI program that simulated the knowledge and analytical skills of human experts. By 1985, the market for AI had reached over a billion dollars. At the same time, Japan's fifth generation computer project inspired the U.S and British governments to restore funding for academic research. However, beginning with the collapse of the Lisp Machine market in 1987, AI once again fell into disrepute, and a second, longer-lasting winter began.
AI revived again in the late 1990s and early 2000s by finding specific solutions to specific problems. The narrow focus allowed researchers to produce verifiable results, exploit more mathematical methods, and collaborate with other fields. By 2000, solutions developed by AI researchers were being widely used in technology. In the 2010s, advances in deep learning, enabled by the availability of large datasets, powerful GPUs, and new algorithms, led to breakthroughs in areas such as image recognition, natural language processing, and game playing. These successes renewed interest in AI and led to significant investments by major technology companies.
Please provide a concise 2-sentence summary."""
# Short prompt kept for quick sanity checks (currently unused by main()).
SHORT_PROMPT: str = "What is 15 * 23?"
def run_benchmark(model, tokenizer, prompt_text, use_mtp, temp=0.0):
    """Benchmark one configuration over NUM_RUNS runs, reporting averages.

    Each run streams up to MAX_GEN_TOKENS tokens and times two phases
    separately: prefill (prompt submission -> first token) and generation
    (first token -> last token). The first run is a warmup and is excluded
    from the averages.

    Args:
        model: Loaded mlx-lm model.
        tokenizer: Matching tokenizer; must provide ``encode`` and
            ``eos_token_ids``.
        prompt_text: Prompt string fed to the model each run.
        use_mtp: Whether to enable the MTP (multi-token prediction) path.
            NOTE(review): the ``mtp=`` kwarg comes from the mtp-native fork
            of mlx-lm pinned in the script header.
        temp: Sampling temperature; at 0 no sampler is built (greedy).

    Returns:
        Tuple ``(avg_prefill_tps, avg_gen_tps, avg_accept_pct)`` averaged
        over the measured (non-warmup) runs; all zeros if no run was measured.
    """
    # A sampler is only needed for temp > 0; greedy decoding passes None.
    sampler_fn = make_sampler(temp=temp) if temp > 0 else None
    results = []
    for i in range(NUM_RUNS):
        tokens = tokenizer.encode(prompt_text)
        prompt = mx.array(tokens)
        prompt_len = len(tokens)
        gen_tokens = 0
        draft_tokens = 0
        first_token_time = None
        start = time.perf_counter()
        for resp in stream_generate(
            model=model,
            tokenizer=tokenizer,
            prompt=prompt,
            max_tokens=MAX_GEN_TOKENS,
            mtp=use_mtp,
            sampler=sampler_fn,
        ):
            gen_tokens += 1
            if resp.from_draft:
                draft_tokens += 1
            if gen_tokens == 1:
                first_token_time = time.perf_counter()
            if resp.token in tokenizer.eos_token_ids:
                break
        end = time.perf_counter()
        # Fix: compare against None explicitly. first_token_time is a float
        # timestamp, so a truthiness test would misclassify a value of 0.0.
        if first_token_time is not None:
            prefill_time = first_token_time - start
            gen_time = end - first_token_time
        else:
            prefill_time = 0
            gen_time = 0
        prompt_tps = prompt_len / prefill_time if prefill_time > 0 else 0
        # The first token is attributed to prefill, so generation throughput
        # counts gen_tokens - 1 tokens over the post-first-token interval.
        gen_tps = (gen_tokens - 1) / gen_time if gen_time > 0 and gen_tokens > 1 else 0
        accept_rate = (draft_tokens / gen_tokens * 100) if gen_tokens > 0 and use_mtp else 0
        tag = "WARMUP" if i == 0 else f"run {i}"
        accept_str = f", accept={accept_rate:.1f}%" if use_mtp else ""
        print(f" [{tag}] prefill={prompt_tps:.1f} tok/s ({prompt_len} tok in {prefill_time*1000:.0f}ms), "
              f"gen={gen_tps:.1f} tok/s ({gen_tokens} tok in {gen_time*1000:.0f}ms){accept_str}")
        if i > 0:  # skip warmup
            results.append((prompt_tps, gen_tps, accept_rate))
        mx.clear_cache()
    # Fix: guard against NUM_RUNS <= 1, which would leave results empty and
    # make the averages below raise ZeroDivisionError.
    if not results:
        return 0.0, 0.0, 0.0
    avg_prefill = sum(r[0] for r in results) / len(results)
    avg_gen = sum(r[1] for r in results) / len(results)
    avg_accept = sum(r[2] for r in results) / len(results)
    accept_str = f", accept={avg_accept:.1f}%" if use_mtp else ""
    print(f" AVG: prefill={avg_prefill:.1f} tok/s, gen={avg_gen:.1f} tok/s{accept_str}")
    return avg_prefill, avg_gen, avg_accept
def main():
    """Load the model, then benchmark the long prompt with and without MTP
    at two temperatures, printing per-run numbers and a comparison summary."""
    print(f"Loading model: {MODEL_PATH}")
    loaded = load(MODEL_PATH)
    model, tokenizer = loaded[0], loaded[1]
    print(f"Model loaded.\n")
    # MTP support is detected by the presence of an mtp_forward attribute.
    has_mtp = hasattr(model, "mtp_forward")
    print(f"Model has MTP head: {has_mtp}\n")
    banner = "=" * 60
    for temp in (0.0, 0.6):
        print(banner)
        print(f"LONG PROMPT BENCHMARK temp={temp}")
        print(banner)
        print(f"\n--- Without MTP (temp={temp}) ---")
        base_prefill, base_gen, _ = run_benchmark(model, tokenizer, LONG_PROMPT, False, temp=temp)
        if not has_mtp:
            print("\nModel has no MTP head, skipping MTP benchmark.")
        else:
            print(f"\n--- With MTP (temp={temp}) ---")
            mtp_prefill, mtp_gen, mtp_accept = run_benchmark(model, tokenizer, LONG_PROMPT, True, temp=temp)
            print(f"\n--- RESULTS (temp={temp}) ---")
            print(f"Prefill: no-MTP={base_prefill:.1f} MTP={mtp_prefill:.1f} delta={((mtp_prefill/base_prefill)-1)*100:+.1f}%")
            print(f"Generation: no-MTP={base_gen:.1f} MTP={mtp_gen:.1f} delta={((mtp_gen/base_gen)-1)*100:+.1f}%")
            print(f"Acceptance: {mtp_accept:.1f}%")
        print()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment