Skip to content

Instantly share code, notes, and snippets.

@h4rm0n1c
Last active September 2, 2025 13:50
Show Gist options
  • Save h4rm0n1c/5ab5c305a4fbbddfda587fc88a259437 to your computer and use it in GitHub Desktop.
Save h4rm0n1c/5ab5c305a4fbbddfda587fc88a259437 to your computer and use it in GitHub Desktop.
Sapi 4.0 32 Bit Python Wrapper - Works with FlexSpeak (Halflife Vox) on Windows 10!

You will need win32com (provided by the pywin32 package); install it via pip into a 32-bit (win32) Python instance. Python 3.11-32 is good.

you will have to use a win32 python, testing was done on python 3.11-32

Download this

https://archive.org/details/flextalk

I've tested on Win10 64

get a python 3.11-32 env

use pip to install win32com into 3.11-32

py -3.11-32 ./speak.py --say "test" - Demo of including and using it in a command line client, this is an advanced python cleanroom rewrite of speak.exe from the microsoft speech SDK 4.0

py -3.11-32 ./sapi4wrap.py - The actual wrapper class itself

This could be used in a wine wrapper on linux to provide the halflife vox voice, winpython in a wine wrapper is silly but flextalk on its own is kinda old and garbo.

No, WAV writing will never be supported; for that, a properly functioning SAPI 4.0 install would be needed. As it is, "Speech.VoiceText" just barely works on Windows 10.

I have no idea if/how the FlexTalk engine can be integrated with the regular Microsoft Speech API from that era, DirectSS, etc.

I am planning on adding wav support by installing voicemeeter and using one of the devices to provide loopback to a python script which will push audio straight to ffmpeg

import time
import win32com.client
class VoiceText:
"""
Minimal, safe wrapper for SAPI 4 'Speech.VoiceText' as seen on your system.
Observed behavior:
- Register(site, app): must be called once per COM instance.
- Speak(text, flags): queues if already speaking. Use flags=0.
- Enabled=0 causes Speak() to raise; use 'mute' by simply not calling Speak.
- Speed is clamped by the engine: effective range ~50..300 on your box.
- Transport: Pause/Resume/Stop work; FastForward/Rewind hang (do not use).
"""
def __init__(self, site="pyvoice", app="pyapp", default_speed=100, auto_enable=True):
self._create(site, app, default_speed, auto_enable)
# --- lifecycle ---
def _create(self, site, app, default_speed, auto_enable):
self.vt = win32com.client.Dispatch("Speech.VoiceText")
self.vt.Register(site, app) # one-time; re-calling on same instance may throw
if auto_enable:
self.vt.Enabled = 1
self.vt.Speed = int(default_speed)
def recreate(self, site="pyvoice", app="pyapp", default_speed=100, auto_enable=True):
"""Dispose current COM object and recreate (use to change site/app)."""
try:
del self.vt
except Exception:
pass
self._create(site, app, default_speed, auto_enable)
# --- properties ---
@property
def enabled(self) -> bool:
try:
return bool(self.vt.Enabled)
except Exception:
return False
@enabled.setter
def enabled(self, on: bool):
self.vt.Enabled = 1 if on else 0
@property
def speed(self) -> int:
return int(self.vt.Speed)
@speed.setter
def speed(self, val: int):
# clamp to observed effective range
v = max(50, min(int(val), 300))
self.vt.Speed = v
def is_speaking(self) -> bool:
try:
return bool(self.vt.IsSpeaking)
except Exception:
return False
# --- core ops ---
def speak_async(self, text: str, flags: int = 0):
"""Queue text and return immediately."""
if not self.enabled:
# mirror engine behavior with clearer message
raise RuntimeError("VoiceText.Enabled==0: engine will reject Speak(). Enable first.")
self.vt.Speak(text, flags)
def speak_and_wait(self, text: str, flags: int = 0, poll: float = 0.02):
"""Queue text and block until all speech finishes."""
self.speak_async(text, flags)
self.wait_finish(poll=poll)
def stop(self):
"""Stop current and flush any queued utterances."""
try:
self.vt.StopSpeaking()
finally:
time.sleep(0.12) # allow buffer to settle
def pause(self): self.vt.AudioPause()
def resume(self): self.vt.AudioResume()
# --- helpers ---
def purge(self):
"""Hard flush of current+queued audio; use before 'exclusive' speaks."""
self.stop()
def speak_exclusive(self, text: str, flags: int = 0, poll: float = 0.02):
"""Flush queue, speak text, wait for completion (single-flight)."""
if not self.enabled:
self.enabled = 1
self.purge()
self.speak_async(text, flags)
self.wait_finish(poll=poll)
def speak_batch(self, lines, inter_gap: float = 0.0, flags: int = 0):
"""Queue multiple lines in order; optional small gap between enqueues."""
if not self.enabled:
self.enabled = 1
for line in lines:
self.vt.Speak(line, flags)
if inter_gap:
time.sleep(inter_gap)
def wait_start(self, timeout: float = 3.0, poll: float = 0.01) -> bool:
t0 = time.monotonic()
while time.monotonic() - t0 < timeout:
if self.is_speaking():
return True
time.sleep(poll)
return False
def wait_finish(self, poll: float = 0.02):
while self.is_speaking():
time.sleep(poll)
# Quick demo: one blocking utterance, then two queued ones, then wait for the queue.
if __name__ == "__main__":
    demo_voice = VoiceText()
    demo_voice.speak_and_wait("Hello from SAPI four via Python.")
    for queued_line in (
        "This will queue.",
        "And play after the previous sentence.",
    ):
        demo_voice.speak_async(queued_line)
    demo_voice.wait_finish()
# sapi4_cli.py
# Simple command-line driver for SAPI 4 VoiceText via sapi4wrap.VoiceText
# Works on your Python 3.11 32-bit setup.
import argparse, sys, time
from pathlib import Path
from sapi4wrap import VoiceText # <-- your file
def read_text_file(path: Path, encoding: str | None):
encs = [encoding] if encoding else ["utf-8-sig", "utf-16", "mbcs"]
last_err = None
for enc in encs:
try:
return path.read_text(encoding=enc, errors="replace")
except Exception as e:
last_err = e
raise last_err
def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the sapi4_cli command line."""
    p = argparse.ArgumentParser(
        prog="sapi4_cli",
        description="Speak text using SAPI 4 VoiceText (Speech.VoiceText)."
    )
    # Input sources (at most one may be given)
    g = p.add_mutually_exclusive_group()
    g.add_argument("--say", help="Speak the given text string.")
    g.add_argument("--file", type=Path, help="Speak text loaded from file.")
    g.add_argument("--stdin", action="store_true",
                   help="Read text from stdin and speak it.")
    # Behavior
    beh = p.add_argument_group("Behavior")
    beh.add_argument("--exclusive", action="store_true",
                     help="Flush any queued audio before speaking.")
    beh.add_argument("--batch-lines", action="store_true",
                     help="For --file: speak each line as its own utterance (queued).")
    beh.add_argument("--gap", type=float, default=0.0,
                     help="Inter-line delay for --batch-lines (seconds).")
    beh.add_argument("--flags", type=int, default=0,
                     help="Raw Speak() flags (observed 0/1/2/4/8 behave the same).")
    # Default to waiting; allow opt-out
    beh.add_argument("--no-wait", dest="wait", action="store_false",
                     help="Don't block. NOTE: process exit will cut speech.")
    # Engine setup
    p.add_argument("--speed", type=int, default=100,
                   help="Engine speed (engine clamps to ~50..300 on your box).")
    p.add_argument("--site", default="pyvoice", help="Register site string.")
    p.add_argument("--app", default="pyapp", help="Register app string.")
    p.add_argument("--encoding", default=None,
                   help="Input file/stdin encoding (default: try utf-8-sig, utf-16, mbcs).")
    p.set_defaults(wait=True)
    return p


def main():
    """Command-line entry point: speak text from --say, --file or stdin.

    When no source option is given, text is read from stdin. Exits through
    ``argparse`` error handling if --file does not exist or --encoding names
    an unknown codec for stdin.
    """
    p = _build_parser()
    args = p.parse_args()

    # If no source provided, default to --stdin (like the old speak.exe).
    # Compare against None rather than truthiness so that an explicit
    # --say "" is not mistaken for "no source" and does not block on stdin.
    if args.say is None and args.file is None and not args.stdin:
        args.stdin = True

    # Create voice
    tts = VoiceText(site=args.site, app=args.app, default_speed=args.speed, auto_enable=True)

    def settle():
        """Wait for playback, or give audio a brief chance to start under --no-wait."""
        if args.wait:
            tts.wait_finish()
        else:
            # Best-effort nudge so audio actually starts before exit
            tts.wait_start(timeout=1.0)
            time.sleep(0.25)

    def speak_text(text: str):
        if args.exclusive:
            tts.speak_exclusive(text, flags=args.flags)  # this blocks by design
            return
        tts.speak_async(text, flags=args.flags)
        settle()

    # Source: --say
    if args.say is not None:
        if args.say:  # nothing to speak for an empty string
            speak_text(args.say)
        return

    # Source: --file
    if args.file is not None:
        if not args.file.exists():
            p.error(f"File not found: {args.file}")
        text = read_text_file(args.file, args.encoding)
        if args.batch_lines:
            # One utterance per line (queue semantics); blank lines are dropped
            lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
            if args.exclusive:
                tts.purge()
            tts.speak_batch(lines, inter_gap=args.gap, flags=args.flags)
            if lines:
                # Same wait / no-wait handling as the single-utterance path.
                settle()
        else:
            # One big utterance
            speak_text(text)
        return

    # Source: --stdin
    if args.stdin:
        if args.encoding:
            # Honor --encoding for stdin as the option's help text promises.
            reconfigure = getattr(sys.stdin, "reconfigure", None)
            if reconfigure is not None:
                try:
                    reconfigure(encoding=args.encoding, errors="replace")
                except LookupError:
                    p.error(f"Unknown encoding: {args.encoding}")
        if sys.stdin.isatty():
            print("Reading from stdin (Ctrl+Z then Enter to end)...", file=sys.stderr)
        data = sys.stdin.read()
        if not data:
            return
        speak_text(data)
# Script entry point: run the CLI only when this file is executed directly.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment