Qwen3-TTS Batch Generation Benchmark Results (4-bit, 6-bit, 8-bit, bf16)

Voice: serena · Device: Apple Silicon (MLX) · Date: 2025-02-28


Cross-Model Comparison (batch=4, short prompt)

| Model | TPS  | Throughput | Avg TTFB | Memory |
|-------|------|------------|----------|--------|
| 4-bit | 46.5 | 3.72x      | 110.9ms  | 3.60GB |
| 6-bit | 46.7 | 3.74x      | 112.6ms  | 3.98GB |
| 8-bit | 46.4 | 3.71x      | 110.6ms  | 4.37GB |
| bf16  | 41.6 | 3.33x      | 119.7ms  | 5.81GB |

4-bit (mlx-community/Qwen3-TTS-12Hz-1.7B-CustomVoice-4bit)

Sequential Baseline

| Prompt           | TTFB   | Inter-Chunk | TPS  | RTF   | Memory |
|------------------|--------|-------------|------|-------|--------|
| short (25 chars) | 57.2ms | 165.4ms     | 21.2 | 1.69x | 3.51GB |

Batch Size Comparison (short prompt)

| Batch | Total Time | TPS  | Avg TTFB | Throughput | Memory |
|-------|------------|------|----------|------------|--------|
| 1     | 1241.2ms   | 19.3 | 68.4ms   | 1.55x      | 3.51GB |
| 2     | 1757.2ms   | 30.9 | 85.0ms   | 2.47x      | 3.54GB |
| 4     | 2108.1ms   | 46.5 | 110.9ms  | 3.72x      | 3.60GB |

Saved Audio (audio_out_4bit/)

| File                  | Size | Source               |
|-----------------------|------|----------------------|
| sequential_short.wav  | 75K  | Sequential, "short"  |
| sequential_medium.wav | 608K | Sequential, "medium" |
| batch1_short_seq0.wav | 86K  | Batch=1, seq 0       |
| batch2_short_seq0.wav | 83K  | Batch=2, seq 0       |
| batch2_short_seq1.wav | 94K  | Batch=2, seq 1       |
| batch4_short_seq0.wav | 98K  | Batch=4, seq 0       |
| batch4_short_seq1.wav | 98K  | Batch=4, seq 1       |
| batch4_short_seq2.wav | 105K | Batch=4, seq 2       |
| batch4_short_seq3.wav | 75K  | Batch=4, seq 3       |

6-bit (mlx-community/Qwen3-TTS-12Hz-1.7B-CustomVoice-6bit)

Sequential Baseline

| Prompt           | TTFB   | Inter-Chunk | TPS  | RTF   | Memory |
|------------------|--------|-------------|------|-------|--------|
| short (25 chars) | 64.0ms | 174.1ms     | 21.0 | 1.68x | 3.89GB |

Batch Size Comparison (short prompt)

| Batch | Total Time | TPS  | Avg TTFB | Throughput | Memory |
|-------|------------|------|----------|------------|--------|
| 1     | 1223.4ms   | 20.4 | 69.2ms   | 1.63x      | 3.89GB |
| 2     | 1344.0ms   | 33.1 | 86.2ms   | 2.65x      | 3.92GB |
| 4     | 1926.6ms   | 46.7 | 112.6ms  | 3.74x      | 3.98GB |

Saved Audio (audio_out/)

| File                  | Size | Source               |
|-----------------------|------|----------------------|
| sequential_short.wav  | 86K  | Sequential, "short"  |
| sequential_medium.wav | 300K | Sequential, "medium" |
| batch1_short_seq0.wav | 83K  | Batch=1, seq 0       |
| batch2_short_seq0.wav | 86K  | Batch=2, seq 0       |
| batch2_short_seq1.wav | 83K  | Batch=2, seq 1       |
| batch4_short_seq0.wav | 71K  | Batch=4, seq 0       |
| batch4_short_seq1.wav | 98K  | Batch=4, seq 1       |
| batch4_short_seq2.wav | 79K  | Batch=4, seq 2       |
| batch4_short_seq3.wav | 98K  | Batch=4, seq 3       |

8-bit (mlx-community/Qwen3-TTS-12Hz-1.7B-CustomVoice-8bit)

Sequential Baseline

| Prompt           | TTFB   | Inter-Chunk | TPS  | RTF   | Memory |
|------------------|--------|-------------|------|-------|--------|
| short (25 chars) | 60.9ms | 168.3ms     | 20.5 | 1.64x | 4.28GB |

Batch Size Comparison (short prompt)

| Batch | Total Time | TPS  | Avg TTFB | Throughput | Memory |
|-------|------------|------|----------|------------|--------|
| 1     | 1156.7ms   | 19.9 | 67.2ms   | 1.59x      | 4.28GB |
| 2     | 1462.3ms   | 32.5 | 86.7ms   | 2.60x      | 4.31GB |
| 4     | 2047.7ms   | 46.4 | 110.6ms  | 3.71x      | 4.37GB |

Saved Audio (audio_out_8bit/)

| File                  | Size | Source               |
|-----------------------|------|----------------------|
| sequential_short.wav  | 86K  | Sequential, "short"  |
| sequential_medium.wav | 300K | Sequential, "medium" |
| batch1_short_seq0.wav | 86K  | Batch=1, seq 0       |
| batch2_short_seq0.wav | 83K  | Batch=2, seq 0       |
| batch2_short_seq1.wav | 94K  | Batch=2, seq 1       |
| batch4_short_seq0.wav | 98K  | Batch=4, seq 0       |
| batch4_short_seq1.wav | 98K  | Batch=4, seq 1       |
| batch4_short_seq2.wav | 105K | Batch=4, seq 2       |
| batch4_short_seq3.wav | 75K  | Batch=4, seq 3       |

bf16 (mlx-community/Qwen3-TTS-12Hz-1.7B-CustomVoice-bf16)

Sequential Baseline

| Prompt           | TTFB   | Inter-Chunk | TPS  | RTF   | Memory |
|------------------|--------|-------------|------|-------|--------|
| short (25 chars) | 62.0ms | 188.2ms     | 20.2 | 1.62x | 5.72GB |

Batch Size Comparison (short prompt)

| Batch | Total Time | TPS  | Avg TTFB | Throughput | Memory |
|-------|------------|------|----------|------------|--------|
| 1     | 1218.1ms   | 19.3 | 68.6ms   | 1.54x      | 5.72GB |
| 2     | 1576.5ms   | 28.9 | 95.7ms   | 2.31x      | 5.75GB |
| 4     | 2162.7ms   | 41.6 | 119.7ms  | 3.33x      | 5.81GB |

Saved Audio (audio_out_bf16/)

| File                  | Size | Source               |
|-----------------------|------|----------------------|
| sequential_short.wav  | 86K  | Sequential, "short"  |
| sequential_medium.wav | 300K | Sequential, "medium" |
| batch1_short_seq0.wav | 86K  | Batch=1, seq 0       |
| batch2_short_seq0.wav | 86K  | Batch=2, seq 0       |
| batch2_short_seq1.wav | 83K  | Batch=2, seq 1       |
| batch4_short_seq0.wav | 71K  | Batch=4, seq 0       |
| batch4_short_seq1.wav | 98K  | Batch=4, seq 1       |
| batch4_short_seq2.wav | 79K  | Batch=4, seq 2       |
| batch4_short_seq3.wav | 98K  | Batch=4, seq 3       |

Key Takeaways

  • Batching lifts throughput substantially across all quantizations: batch=4 delivers 3.3-3.7x real-time throughput vs ~1.6x at batch=1, i.e. roughly 2.2-2.4x the single-sequence rate (about 54-60% of ideal linear scaling).
  • Quantized models (4/6/8-bit) outperform bf16 in batched TPS: ~46 vs ~42 tokens/sec at batch=4, thanks to lower memory bandwidth pressure.
  • 4-bit is the sweet spot for memory: 3.60GB at batch=4 vs 5.81GB for bf16, 38% less memory with slightly higher throughput (3.72x vs 3.33x).
  • Memory overhead from batching is tiny: +100MB or less going from batch=1 to batch=4 across all models.
  • TTFB stays under 120ms even at batch=4 for all quantizations.
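The scaling claim above can be sanity-checked directly from the throughput tables. A minimal sketch (the dictionaries simply restate the reported batch=1 and batch=4 real-time throughput numbers):

```python
# Real-time throughput (audio duration / wall time), short prompt,
# copied from the per-model batch tables above.
batch1 = {"4-bit": 1.55, "6-bit": 1.63, "8-bit": 1.59, "bf16": 1.54}
batch4 = {"4-bit": 3.72, "6-bit": 3.74, "8-bit": 3.71, "bf16": 3.33}

for model in batch1:
    speedup = batch4[model] / batch1[model]  # gain from batching 4 sequences
    efficiency = speedup / 4                 # fraction of ideal 4x scaling
    print(f"{model}: {speedup:.2f}x speedup, {efficiency:.0%} of ideal")
```

Every model lands between roughly 54% (bf16) and 60% (4-bit) of ideal linear scaling at batch=4, which is where the 2.2-2.4x figure in the takeaways comes from.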

Reproduce

# 4-bit
python benchmarks/qwen3_tts_benchmark.py \
  --model mlx-community/Qwen3-TTS-12Hz-1.7B-CustomVoice-4bit \
  --batch-size 1 2 4 --prompts short medium -n 2 \
  --streaming-interval 0.32 --save-audio ./benchmarks/audio_out_4bit

# 6-bit
python benchmarks/qwen3_tts_benchmark.py \
  --model mlx-community/Qwen3-TTS-12Hz-1.7B-CustomVoice-6bit \
  --batch-size 1 2 4 --prompts short medium -n 2 \
  --streaming-interval 0.32 --save-audio ./benchmarks/audio_out

# 8-bit
python benchmarks/qwen3_tts_benchmark.py \
  --model mlx-community/Qwen3-TTS-12Hz-1.7B-CustomVoice-8bit \
  --batch-size 1 2 4 --prompts short medium -n 2 \
  --streaming-interval 0.32 --save-audio ./benchmarks/audio_out_8bit

# bf16
python benchmarks/qwen3_tts_benchmark.py \
  --model mlx-community/Qwen3-TTS-12Hz-1.7B-CustomVoice-bf16 \
  --batch-size 1 2 4 --prompts short medium -n 2 \
  --streaming-interval 0.32 --save-audio ./benchmarks/audio_out_bf16
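The four commands above differ only in the model suffix and the output directory, so they can be driven from one loop. A hypothetical sketch (the flags are copied verbatim from the commands above; the `benchmark_cmd` helper and `QUANT_OUT_DIRS` mapping are made up for illustration):

```python
# Map each quantization suffix to its --save-audio directory, matching
# the four commands above.
QUANT_OUT_DIRS = {
    "4bit": "audio_out_4bit",
    "6bit": "audio_out",
    "8bit": "audio_out_8bit",
    "bf16": "audio_out_bf16",
}

def benchmark_cmd(quant: str) -> list[str]:
    """Build the argv for one quantization's benchmark run."""
    return [
        "python", "benchmarks/qwen3_tts_benchmark.py",
        "--model", f"mlx-community/Qwen3-TTS-12Hz-1.7B-CustomVoice-{quant}",
        "--batch-size", "1", "2", "4",
        "--prompts", "short", "medium",
        "-n", "2",
        "--streaming-interval", "0.32",
        "--save-audio", f"./benchmarks/{QUANT_OUT_DIRS[quant]}",
    ]

for quant in QUANT_OUT_DIRS:
    print(" ".join(benchmark_cmd(quant)))
    # To actually run: subprocess.run(benchmark_cmd(quant), check=True)
```

Printing the commands rather than executing them keeps the sketch side-effect free; swap in `subprocess.run` to launch the real benchmark sweep.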
#!/usr/bin/env python3
"""Generate benchmark plots from Qwen3-TTS batch generation results."""
import matplotlib.pyplot as plt
import numpy as np
import os
# Results data
MODELS = ["4-bit", "6-bit", "8-bit", "bf16"]
# --- Sequential (single generate()) baseline — short prompt ---
SEQ_TPS = {"4-bit": 21.2, "6-bit": 21.0, "8-bit": 20.5, "bf16": 20.2}
SEQ_RTF = {"4-bit": 1.69, "6-bit": 1.68, "8-bit": 1.64, "bf16": 1.62}
SEQ_TTFB = {"4-bit": 57.2, "6-bit": 64.0, "8-bit": 60.9, "bf16": 62.0}
SEQ_MEMORY = {"4-bit": 3.51, "6-bit": 3.89, "8-bit": 4.28, "bf16": 5.72}
# --- Batch generate() results — short prompt ---
BATCH_SIZES = [1, 2, 4]
BATCH_TPS = {
    "4-bit": [19.3, 30.9, 46.5],
    "6-bit": [20.4, 33.1, 46.7],
    "8-bit": [19.9, 32.5, 46.4],
    "bf16": [19.3, 28.9, 41.6],
}
BATCH_THROUGHPUT = {
    "4-bit": [1.55, 2.47, 3.72],
    "6-bit": [1.63, 2.65, 3.74],
    "8-bit": [1.59, 2.60, 3.71],
    "bf16": [1.54, 2.31, 3.33],
}
BATCH_TTFB = {
    "4-bit": [68.4, 85.0, 110.9],
    "6-bit": [69.2, 86.2, 112.6],
    "8-bit": [67.2, 86.7, 110.6],
    "bf16": [68.6, 95.7, 119.7],
}
BATCH_MEMORY = {
    "4-bit": [3.51, 3.54, 3.60],
    "6-bit": [3.89, 3.92, 3.98],
    "8-bit": [4.28, 4.31, 4.37],
    "bf16": [5.72, 5.75, 5.81],
}
BATCH_TOTAL_TIME = {
    "4-bit": [1241.2, 1757.2, 2108.1],
    "6-bit": [1223.4, 1344.0, 1926.6],
    "8-bit": [1156.7, 1462.3, 2047.7],
    "bf16": [1218.1, 1576.5, 2162.7],
}
COLORS = {
    "4-bit": "#2196F3",
    "6-bit": "#4CAF50",
    "8-bit": "#FF9800",
    "bf16": "#F44336",
}
MARKERS = {"4-bit": "o", "6-bit": "s", "8-bit": "D", "bf16": "^"}
OUT_DIR = "benchmarks/plots"
os.makedirs(OUT_DIR, exist_ok=True)
plt.rcParams.update({
    "figure.facecolor": "white",
    "axes.grid": True,
    "grid.alpha": 0.3,
    "font.size": 11,
})
# X-axis labels: Sequential, then batch sizes
X_LABELS = ["Seq (1)", "Batch 1", "Batch 2", "Batch 4"]
X_POS = [0, 1, 2, 3]
def _full_tps(model):
    return [SEQ_TPS[model]] + BATCH_TPS[model]

def _full_throughput(model):
    return [SEQ_RTF[model]] + BATCH_THROUGHPUT[model]

def _full_ttfb(model):
    return [SEQ_TTFB[model]] + BATCH_TTFB[model]

def _full_memory(model):
    return [SEQ_MEMORY[model]] + BATCH_MEMORY[model]
def plot_tps():
    fig, ax = plt.subplots(figsize=(9, 5.5))
    for model in MODELS:
        vals = _full_tps(model)
        ax.plot(
            X_POS, vals,
            marker=MARKERS[model], color=COLORS[model],
            linewidth=2, markersize=8, label=model,
        )
    # Vertical separator between sequential and batch
    ax.axvline(x=0.5, color="gray", linestyle=":", alpha=0.4)
    ax.text(0.0, ax.get_ylim()[1] * 0.97, "Sequential", ha="center", fontsize=8, color="gray")
    ax.text(2.0, ax.get_ylim()[1] * 0.97, "Batched", ha="center", fontsize=8, color="gray")
    ax.set_xlabel("Generation Mode")
    ax.set_ylabel("Tokens per Second")
    ax.set_title("Tokens per Second — Sequential vs Batched")
    ax.set_xticks(X_POS)
    ax.set_xticklabels(X_LABELS)
    ax.legend()
    fig.tight_layout()
    path = os.path.join(OUT_DIR, "tps_vs_batch.png")
    fig.savefig(path, dpi=150)
    plt.close(fig)
    print(f"Saved: {path}")
def plot_throughput():
    fig, ax = plt.subplots(figsize=(9, 5.5))
    for model in MODELS:
        vals = _full_throughput(model)
        ax.plot(
            X_POS, vals,
            marker=MARKERS[model], color=COLORS[model],
            linewidth=2, markersize=8, label=model,
        )
    # Ideal linear scaling reference (from batch 1 onward)
    ax.plot(
        X_POS[1:], BATCH_SIZES,
        linestyle="--", color="gray", alpha=0.5, label="Ideal linear",
    )
    ax.axvline(x=0.5, color="gray", linestyle=":", alpha=0.4)
    ax.text(0.0, ax.get_ylim()[1] * 0.97, "Sequential", ha="center", fontsize=8, color="gray")
    ax.text(2.0, ax.get_ylim()[1] * 0.97, "Batched", ha="center", fontsize=8, color="gray")
    ax.set_xlabel("Generation Mode")
    ax.set_ylabel("Throughput (audio duration / wall time)")
    ax.set_title("Throughput Scaling — Sequential vs Batched")
    ax.set_xticks(X_POS)
    ax.set_xticklabels(X_LABELS)
    ax.legend()
    fig.tight_layout()
    path = os.path.join(OUT_DIR, "throughput_vs_batch.png")
    fig.savefig(path, dpi=150)
    plt.close(fig)
    print(f"Saved: {path}")
def plot_ttfb():
    fig, ax = plt.subplots(figsize=(9, 5.5))
    for model in MODELS:
        vals = _full_ttfb(model)
        ax.plot(
            X_POS, vals,
            marker=MARKERS[model], color=COLORS[model],
            linewidth=2, markersize=8, label=model,
        )
    ax.axvline(x=0.5, color="gray", linestyle=":", alpha=0.4)
    ax.text(0.0, ax.get_ylim()[1] * 0.97, "Sequential", ha="center", fontsize=8, color="gray")
    ax.text(2.0, ax.get_ylim()[1] * 0.97, "Batched", ha="center", fontsize=8, color="gray")
    ax.set_xlabel("Generation Mode")
    ax.set_ylabel("Average TTFB (ms)")
    ax.set_title("Time to First Byte — Sequential vs Batched")
    ax.set_xticks(X_POS)
    ax.set_xticklabels(X_LABELS)
    ax.legend()
    fig.tight_layout()
    path = os.path.join(OUT_DIR, "ttfb_vs_batch.png")
    fig.savefig(path, dpi=150)
    plt.close(fig)
    print(f"Saved: {path}")
def plot_memory():
    fig, ax = plt.subplots(figsize=(9, 5.5))
    x = np.arange(len(X_LABELS))
    width = 0.18
    for i, model in enumerate(MODELS):
        offset = (i - 1.5) * width
        vals = _full_memory(model)
        bars = ax.bar(
            x + offset, vals, width,
            label=model, color=COLORS[model], alpha=0.85,
        )
        for bar, val in zip(bars, vals):
            ax.text(
                bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.05,
                f"{val:.1f}", ha="center", va="bottom", fontsize=7,
            )
    ax.axvline(x=0.5, color="gray", linestyle=":", alpha=0.4)
    ax.set_xlabel("Generation Mode")
    ax.set_ylabel("Peak Memory (GB)")
    ax.set_title("Peak Memory Usage — Sequential vs Batched")
    ax.set_xticks(x)
    ax.set_xticklabels(X_LABELS)
    ax.legend()
    ax.set_ylim(0, 7.5)
    fig.tight_layout()
    path = os.path.join(OUT_DIR, "memory_vs_batch.png")
    fig.savefig(path, dpi=150)
    plt.close(fig)
    print(f"Saved: {path}")
def plot_efficiency():
    """Throughput per GB of memory: sequential vs batch=4."""
    fig, ax = plt.subplots(figsize=(9, 5.5))
    x = np.arange(len(MODELS))
    width = 0.3
    seq_eff = [SEQ_RTF[m] / SEQ_MEMORY[m] for m in MODELS]
    batch4_eff = [BATCH_THROUGHPUT[m][2] / BATCH_MEMORY[m][2] for m in MODELS]
    bars1 = ax.bar(
        x - width / 2, seq_eff, width,
        label="Sequential", color=[COLORS[m] for m in MODELS], alpha=0.45,
        edgecolor=[COLORS[m] for m in MODELS], linewidth=1.5,
    )
    bars2 = ax.bar(
        x + width / 2, batch4_eff, width,
        label="Batch 4", color=[COLORS[m] for m in MODELS], alpha=0.85,
    )
    for bar, val in zip(bars1, seq_eff):
        ax.text(
            bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
            f"{val:.2f}", ha="center", va="bottom", fontsize=9,
        )
    for bar, val in zip(bars2, batch4_eff):
        ax.text(
            bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
            f"{val:.2f}", ha="center", va="bottom", fontsize=9,
        )
    ax.set_xticks(x)
    ax.set_xticklabels(MODELS)
    ax.set_ylabel("Throughput per GB (x / GB)")
    ax.set_title("Memory Efficiency — Sequential vs Batch 4")
    ax.legend()
    fig.tight_layout()
    path = os.path.join(OUT_DIR, "efficiency_batch4.png")
    fig.savefig(path, dpi=150)
    plt.close(fig)
    print(f"Saved: {path}")
def plot_combined_summary():
    """2x2 summary figure."""
    fig, axes = plt.subplots(2, 2, figsize=(15, 11))

    # TPS
    ax = axes[0, 0]
    for model in MODELS:
        ax.plot(
            X_POS, _full_tps(model),
            marker=MARKERS[model], color=COLORS[model],
            linewidth=2, markersize=7, label=model,
        )
    ax.axvline(x=0.5, color="gray", linestyle=":", alpha=0.4)
    ax.set_xlabel("Generation Mode")
    ax.set_ylabel("Tokens / sec")
    ax.set_title("Tokens per Second")
    ax.set_xticks(X_POS)
    ax.set_xticklabels(X_LABELS, fontsize=9)
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)

    # Throughput
    ax = axes[0, 1]
    for model in MODELS:
        ax.plot(
            X_POS, _full_throughput(model),
            marker=MARKERS[model], color=COLORS[model],
            linewidth=2, markersize=7, label=model,
        )
    ax.plot(X_POS[1:], BATCH_SIZES, "--", color="gray", alpha=0.5, label="Ideal")
    ax.axvline(x=0.5, color="gray", linestyle=":", alpha=0.4)
    ax.set_xlabel("Generation Mode")
    ax.set_ylabel("Audio duration / wall time")
    ax.set_title("Throughput Scaling")
    ax.set_xticks(X_POS)
    ax.set_xticklabels(X_LABELS, fontsize=9)
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)

    # TTFB
    ax = axes[1, 0]
    for model in MODELS:
        ax.plot(
            X_POS, _full_ttfb(model),
            marker=MARKERS[model], color=COLORS[model],
            linewidth=2, markersize=7, label=model,
        )
    ax.axvline(x=0.5, color="gray", linestyle=":", alpha=0.4)
    ax.set_xlabel("Generation Mode")
    ax.set_ylabel("Avg TTFB (ms)")
    ax.set_title("Time to First Byte")
    ax.set_xticks(X_POS)
    ax.set_xticklabels(X_LABELS, fontsize=9)
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)

    # Memory
    ax = axes[1, 1]
    x = np.arange(len(X_LABELS))
    width = 0.18
    for i, model in enumerate(MODELS):
        offset = (i - 1.5) * width
        ax.bar(
            x + offset, _full_memory(model), width,
            label=model, color=COLORS[model], alpha=0.85,
        )
    ax.axvline(x=0.5, color="gray", linestyle=":", alpha=0.4)
    ax.set_xlabel("Generation Mode")
    ax.set_ylabel("Peak Memory (GB)")
    ax.set_title("Memory Usage")
    ax.set_xticks(x)
    ax.set_xticklabels(X_LABELS, fontsize=9)
    ax.legend(fontsize=9)
    ax.set_ylim(0, 7.5)
    ax.grid(True, alpha=0.3)

    fig.suptitle(
        "Qwen3-TTS Batch Generation — CustomVoice 1.7B (short prompt)",
        fontsize=14, fontweight="bold", y=0.98,
    )
    fig.tight_layout(rect=[0, 0, 1, 0.96])
    path = os.path.join(OUT_DIR, "summary.png")
    fig.savefig(path, dpi=150)
    plt.close(fig)
    print(f"Saved: {path}")
if __name__ == "__main__":
    plot_tps()
    plot_throughput()
    plot_ttfb()
    plot_memory()
    plot_efficiency()
    plot_combined_summary()
    print(f"\nAll plots saved to {OUT_DIR}/")