- use the kyuz0/vllm-therock-gfx1151 docker image
docker run -it --network=host --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device /dev/kfd --device /dev/dri -v /home/$USER/.cache:/root/.cache --entrypoint bash kyuz0/vllm-therock-gfx1151
- install the server
pip install omnivoice-server
- patch
venv/lib/python3.12/site-packages/omnivoice_server/utils/audio.py with this for omnivoice-server==0.1.0
--- venv/lib/python3.12/site-packages/omnivoice_server/utils/audio.py	2026-04-11 03:36:18.700855894 +0000
+++ venv/lib/python3.12/site-packages/omnivoice_server/utils/audio.py_mod	2026-04-11 03:41:34.286731496 +0000
@@ -7,6 +7,7 @@
 # FIX: io and torchaudio were imported a second time in the middle of the file,
 # after validate_audio_bytes. Moved all imports to top — single import block.
 import io
+import wave
 import torch
 import torchaudio
 
@@ -18,21 +19,34 @@
     """
     Convert (1, T) float32 tensor to 16-bit PCM WAV bytes.
     """
-    cpu_tensor = tensor.cpu()
-    if cpu_tensor.dim() == 1:
-        cpu_tensor = cpu_tensor.unsqueeze(0)
+    cpu_tensor = tensor.detach().to("cpu")
 
-    buf = io.BytesIO()
-    torchaudio.save(
-        buf,
-        cpu_tensor,
-        SAMPLE_RATE,
-        format="wav",
-        encoding="PCM_S",
-        bits_per_sample=16,
+    if cpu_tensor.dim() == 2:
+        if cpu_tensor.shape[0] == 1:
+            cpu_tensor = cpu_tensor.squeeze(0)
+        else:
+            raise ValueError("Only mono audio is supported")
+
+    if cpu_tensor.dim() != 1:
+        raise ValueError(
+            f"Expected 1D or (1, T) tensor, got shape {tuple(cpu_tensor.shape)}"
+        )
+
+    pcm16 = (
+        cpu_tensor.clamp(-1.0, 1.0)
+        .mul(32767.0)
+        .to(torch.int16)
+        .numpy()
+        .tobytes()
     )
-    buf.seek(0)
-    return buf.read()
+
+    buf = io.BytesIO()
+    with wave.open(buf, "wb") as wf:
+        wf.setnchannels(1)
+        wf.setsampwidth(2)
+        wf.setframerate(SAMPLE_RATE)
+        wf.writeframes(pcm16)
+    return buf.getvalue()
 
 
 def tensors_to_wav_bytes(tensors: list[torch.Tensor]) -> bytes:
- run the server with
MIOPEN_FIND_MODE=2 omnivoice-server --host 0.0.0.0 --port 8880 --device cuda
- test with
curl http://localhost:8880/v1/audio/speech -H 'content-type: application/json' -d '{"model": "tts-1", "input": "This is a BF16 conversion of OmniVoice — a state-of-the-art zero-shot multilingual TTS model supporting 600+ languages, built on a diffusion language model architecture. Converting from FP32 to BF16 halves the on-disk size and VRAM usage with negligible quality loss, making it the recommended variant for most users.", "response_format": "wav"}' -o /tmp/output-1.wav