Prince Canuma (Blaizzy)
"""
Benchmark TriAttention on MATH 500 — matching the paper's evaluation protocol.
Paper settings: max_tokens=32768, temp=0.6, top_p=0.95, budget=512/1024/2048
We use max_tokens=4096 for practical runtime on Apple Silicon.
USAGE
python bench_triattention_math.py \
--model /tmp/gemma-4-26b-a4b-it-5bit \
--calib /tmp/gemma4_26b_5bit_calib.safetensors \
"""
Benchmark TurboQuant (TBQ) vs baseline on MM-NIAH (Multimodal Needle-in-a-Haystack).
INSTALL
pip install -U mlx-vlm
# or
uv pip install -U mlx-vlm
SETUP — Extract images (one-time)
huggingface-cli download OpenGVLab/MM-NIAH mm_niah_val/images.tar.gz --repo-type dataset
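After the download, the archive still needs to be unpacked. A stdlib sketch of that one-time step (the destination path and helper name are our assumptions):

```python
import tarfile
from pathlib import Path

def extract_archive(archive: str, dest: str) -> list[str]:
    """Extract a .tar.gz into dest and return the extracted member names."""
    Path(dest).mkdir(parents=True, exist_ok=True)
    with tarfile.open(archive, "r:gz") as tf:
        members = tf.getnames()
        # only extract archives from sources you trust (path traversal risk)
        tf.extractall(dest)
    return members
```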
"""Benchmark TurboQuant vs baseline on LongBench-v2.
Usage:
python scripts/bench_longbench_v2.py --model google/gemma-4-e4b-it --num-samples 10 --max-tokens-ctx 260000
python scripts/bench_longbench_v2.py --model google/gemma-4-26b-a4b-it --num-samples 5 --max-tokens-ctx 128000 --kv-bits 4
"""
import argparse
import importlib
import time
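The flags in the usage lines imply a parser roughly like the following (a sketch; the defaults and help strings are assumptions, not copied from the script):

```python
import argparse

def build_parser() -> argparse.ArgumentParser:
    p = argparse.ArgumentParser(
        description="Benchmark KV-cache quantization on LongBench-v2")
    p.add_argument("--model", required=True, help="HF repo id or local path")
    p.add_argument("--num-samples", type=int, default=10,
                   help="number of LongBench-v2 samples to evaluate")
    p.add_argument("--max-tokens-ctx", type=int, default=128000,
                   help="cap on context length in tokens")
    p.add_argument("--kv-bits", type=int, default=None,
                   help="KV-cache quantization bits (omit for baseline)")
    return p
```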
@Blaizzy
Blaizzy / qwen3_tts_benchmark.py
Last active March 2, 2026 21:23
Qwen3-TTS Benchmark: TTFB, inter-chunk latency, throughput, and batch generation metrics for mlx-audio
#!/usr/bin/env python3
"""
Benchmark for Qwen3-TTS: measures TTFB, inter-chunk latency, and throughput.
Usage:
# Sequential only (short/medium/long)
python qwen3_tts_benchmark.py --model mlx-community/Qwen3-TTS-12Hz-0.6B-Base-bf16
# Sequential + batched (1,2,3,4,8)
python qwen3_tts_benchmark.py --batch-size 1 2 3 4 8
"""
#!/usr/bin/env python3
"""
Benchmark for Qwen3-TTS: measures TTFB, inter-chunk latency, and throughput.
Usage:
python benchmarks/qwen3_tts_benchmark.py
python benchmarks/qwen3_tts_benchmark.py --model mlx-community/Qwen3-TTS-0.6B-bf16
python benchmarks/qwen3_tts_benchmark.py --num-trials 3 --streaming-interval 1.0
python benchmarks/qwen3_tts_benchmark.py --prompts short medium long
"""
@Blaizzy
Blaizzy / BENCHMARK_RESULTS.md
Last active February 28, 2026 14:00
Qwen3-TTS Batch Generation Benchmark Results (4-bit, 6-bit, 8-bit, bf16)

Qwen3-TTS Batch Generation Benchmark Results

Voice: serena · Device: Apple Silicon (MLX) · Date: 2025-02-28


Cross-Model Comparison (batch=4, short prompt)

@Blaizzy
Blaizzy / BaseConfiguration.swift
Created February 23, 2026 21:36
Get started with MLX-Swift using a Qwen3 port
//
// BaseConfiguration.swift
// mlx-test
//
// Created by Prince Canuma on 29/12/25.
//
import Foundation
import MLX
mlx_audio.tts.generate \
--model mlx-community/chatterbox-turbo-fp16 \
--text 'Abstract
The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine translation tasks show these models to
be superior in quality while being more parallelizable and requiring significantly
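Passing an entire paper abstract as `--text` works, but long inputs are often split into sentence-sized chunks before synthesis to keep latency per utterance low. A naive sketch of such a splitter (ours, not part of mlx-audio):

```python
import re

def split_sentences(text: str, max_chars: int = 300) -> list[str]:
    """Split on sentence boundaries, then greedily pack sentences
    into chunks no longer than max_chars."""
    sentences = re.split(r"(?<=[.!?])\s+", " ".join(text.split()))
    chunks, cur = [], ""
    for s in sentences:
        if cur and len(cur) + 1 + len(s) > max_chars:
            chunks.append(cur)
            cur = s
        else:
            cur = f"{cur} {s}".strip()
    if cur:
        chunks.append(cur)
    return chunks
```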
@Blaizzy
Blaizzy / convert_weights.py
Created December 18, 2025 10:49
Chatterbox Turbo MLX port
#!/usr/bin/env python3
# Copyright (c) 2025 Resemble AI
# MIT License
# Weight conversion script: PyTorch -> MLX
"""
Converts Chatterbox Turbo weights from PyTorch to MLX format.
Usage:
python convert_weights.py --output model.safetensors
"""
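The core of a PyTorch-to-MLX conversion is usually an ordered key-renaming pass over the state dict. A framework-agnostic sketch; the rename rules below are illustrative, not Chatterbox Turbo's actual mapping:

```python
# illustrative rules: (substring to find, replacement), applied in order
RENAME_RULES = [
    ("model.", ""),          # drop a wrapper prefix
    (".gamma", ".weight"),   # legacy LayerNorm parameter naming
    (".beta", ".bias"),
]

def convert_keys(state_dict: dict) -> dict:
    """Apply the ordered substring rename rules to every weight key."""
    out = {}
    for key, value in state_dict.items():
        for old, new in RENAME_RULES:
            key = key.replace(old, new)
        out[key] = value
    return out
```

Real converters also transpose or reshape certain tensors (e.g. conv weights) between framework layouts; the key mapping is just the first step.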
@Blaizzy
Blaizzy / tokenizers_utils.py
Last active January 13, 2026 17:15
Decode Stream
import json
from functools import partial
from json import JSONDecodeError
from typing import List
from transformers import AutoTokenizer
import tokenizers
REPLACEMENT_CHAR = "\ufffd"
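U+FFFD is the usual signal that a decoded byte sequence ends mid-way through a multi-byte UTF-8 character, so a streaming detokenizer holds text back until the replacement character disappears. A sketch of that pattern against a generic `decode(ids) -> str` callable (the class is ours; pass e.g. `tokenizer.decode` in practice):

```python
REPLACEMENT_CHAR = "\ufffd"

class DecodeStream:
    """Incrementally turn token ids into text, deferring output while
    the tail of the decoded string is an incomplete character."""

    def __init__(self, decode):
        self.decode = decode  # callable: list[int] -> str
        self.ids = []
        self.offset = 0       # number of characters already emitted

    def step(self, token_id: int) -> str:
        self.ids.append(token_id)
        text = self.decode(self.ids)
        if text.endswith(REPLACEMENT_CHAR):
            return ""         # wait for the rest of the character
        new = text[self.offset:]
        self.offset = len(text)
        return new
```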