Skip to content

Instantly share code, notes, and snippets.

@tbogdala
Created February 13, 2025 01:37
Show Gist options
  • Save tbogdala/d9db7e2a4ce1adb01331d9f0c8de5e99 to your computer and use it in GitHub Desktop.
Save tbogdala/d9db7e2a4ce1adb01331d9f0c8de5e99 to your computer and use it in GitHub Desktop.
Quick and dirty Kokoro invocation that plays the generated audio as well as saving it.
# 1️⃣ Install required packages
# pip install kokoro soundfile sounddevice   (argparse ships with Python's standard library)
import sys
import argparse
from kokoro import KPipeline
import soundfile as sf
import sounddevice as sd
# 🇺🇸 'a' => American English, 🇬🇧 'b' => British English
# 🇯🇵 'j' => Japanese: pip install misaki[ja]
# 🇨🇳 'z' => Mandarin Chinese: pip install misaki[zh]
# Voice names accepted by --voice, copied from
# https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
# Entries are grouped by their two-letter prefix; the order of the list is
# unchanged.
supported_voices = [
    # af_*
    "af_heart", "af_alloy", "af_aoede", "af_bella", "af_jessica", "af_kore",
    "af_nicole", "af_nova", "af_river", "af_sarah", "af_sky",
    # am_*
    "am_adam", "am_echo", "am_eric", "am_fenrir", "am_liam", "am_michael",
    "am_onyx", "am_puck", "am_santa",
    # bf_*
    "bf_alice", "bf_emma", "bf_isabella", "bf_lily",
    # bm_*
    "bm_daniel", "bm_fable", "bm_george", "bm_lewis",
    # jf_* / jm_*
    "jf_alpha", "jf_gongitsune", "jf_nezumi", "jf_tebukuro",
    "jm_kumo",
    # zf_*
    "zf_xiaobei", "zf_xiaoni", "zf_xiaoxiao", "zf_xiaoyi",
    # zm_*
    "zm_yunjian", "zm_yunxi", "zm_yunxia", "zm_yunyang",
]
# 2️⃣ Command-line interface: pick the voice and the speaking speed
parser = argparse.ArgumentParser(description='Kokoro TTS Processor')
parser.add_argument(
    '--voice',
    choices=supported_voices,  # argparse rejects any other voice name
    default='af_heart',
    type=str,
    help='Voice selection (default: af_heart)',
)
parser.add_argument(
    '--speed',
    default=1.0,
    type=float,
    help='Speed setting for the generated speech (default: 1.0)',
)
args = parser.parse_args()
# 3️⃣ Read the text to synthesize from standard input
text = sys.stdin.read()

# 4️⃣ Initialize the pipeline for the language of the selected voice.
# Every supported voice name starts with its language code
# ('a' American English, 'b' British English, 'j' Japanese, 'z' Mandarin),
# so taking it from the voice keeps the pipeline language and the voice in
# sync instead of always forcing American English.
pipeline = KPipeline(lang_code=args.voice[0])

# 5️⃣ Generate audio chunks with the selected voice and speed
generator = pipeline(
    text,
    voice=args.voice,      # voice chosen with --voice
    speed=args.speed,      # honor --speed (it used to be hard-coded to 1.0)
    split_pattern=r'\n+',  # the input is split on runs of newlines
)

# 6️⃣ Save and play each generated audio segment
sample_rate = 24000  # Hz; used for both the WAV files and playback
segment_count = 0    # stays 0 when the generator yields nothing
for i, (_gs, _ps, audio) in enumerate(generator):  # _gs/_ps are unused here
    sf.write(f'output_{i}.wav', audio, sample_rate)
    # Play audio using sounddevice (cross-platform)
    sd.play(audio, samplerate=sample_rate)
    sd.wait()  # Block until playback of this segment has finished
    segment_count = i + 1

# Report the total; a dedicated counter avoids a NameError on the loop
# variable when no segment was produced (e.g. empty input).
print(f"Processed {segment_count} audio segment(s) successfully")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment