- use the kyuz0/vllm-therock-gfx1151 docker image
docker run -it --network=host --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device /dev/kfd --device /dev/dri -v /home/$USER/.cache:/root/.cache --entrypoint bash kyuz0/vllm-therock-gfx1151
- install the server
pip install omnivoice-server
- patch
venv/lib/python3.12/site-packages/omnivoice_server/utils/audio.py with this for omnivoice-server==0.1.0
--- venv/lib/python3.12/site-packages/omnivoice_server/utils/audio.py	2026-04-11 03:36:18.700855894 +0000
+++ venv/lib/python3.12/site-packages/omnivoice_server/utils/audio.py_mod	2026-04-11 03:41:34.286731496 +0000
@@ -7,6 +7,7 @@
 # FIX: io and torchaudio were imported a second time in the middle of the file,
 # after validate_audio_bytes. Moved all imports to top — single import block.
 import io
+import wave
 import torch
 import torchaudio
 
@@ -18,21 +19,34 @@
     """
     Convert (1, T) float32 tensor to 16-bit PCM WAV bytes.
     """
-    cpu_tensor = tensor.cpu()
-    if cpu_tensor.dim() == 1:
-        cpu_tensor = cpu_tensor.unsqueeze(0)
+    cpu_tensor = tensor.detach().to("cpu")
 
-    buf = io.BytesIO()
-    torchaudio.save(
-        buf,
-        cpu_tensor,
-        SAMPLE_RATE,
-        format="wav",
-        encoding="PCM_S",
-        bits_per_sample=16,
+    if cpu_tensor.dim() == 2:
+        if cpu_tensor.shape[0] == 1:
+            cpu_tensor = cpu_tensor.squeeze(0)
+        else:
+            raise ValueError("Only mono audio is supported")
+
+    if cpu_tensor.dim() != 1:
+        raise ValueError(
+            f"Expected 1D or (1, T) tensor, got shape {tuple(cpu_tensor.shape)}"
+        )
+
+    pcm16 = (
+        cpu_tensor.clamp(-1.0, 1.0)
+        .mul(32767.0)
+        .to(torch.int16)
+        .numpy()
+        .tobytes()
     )
-    buf.seek(0)
-    return buf.read()
+
+    buf = io.BytesIO()
+    with wave.open(buf, "wb") as wf:
+        wf.setnchannels(1)
+        wf.setsampwidth(2)
+        wf.setframerate(SAMPLE_RATE)
+        wf.writeframes(pcm16)
+    return buf.getvalue()
 
 
 def tensors_to_wav_bytes(tensors: list[torch.Tensor]) -> bytes:
- run the server with
MIOPEN_FIND_MODE=2 omnivoice-server --host 0.0.0.0 --port 8880 --device cuda
- test with
curl http://localhost:8880/v1/audio/speech -H 'content-type: application/json' -d '{"model": "tts-1", "input": "This is a BF16 conversion of OmniVoice — a state-of-the-art zero-shot multilingual TTS model supporting 600+ languages, built on a diffusion language model architecture. Converting from FP32 to BF16 halves the on-disk size and VRAM usage with negligible quality loss, making it the recommended variant for most users.", "response_format": "wav"}' -o /tmp/output-1.wav