Local version of the Hugging Face Kokoro-TTS space that uses a local GPU (or CPU) instead of Hugging Face ZeroGPU.
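To run it locally, save the script (e.g. as `app.py`) next to the `en.txt` wordlist from the original repo, install the dependencies (`pip install kokoro gradio torch` should cover it, though `kokoro` may already pull in `torch`), and run `python app.py`. Gradio prints a local URL and, since the script launches with `share=True`, a temporary public link.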
""" | |
orig: https://huggingface.co/spaces/hexgrad/Kokoro-TTS | |
Deps: | |
pip install kokoro | |
other files: see original repo | |
""" | |
import os | |
import random | |
import torch | |
import gradio as gr | |
from kokoro import KModel, KPipeline | |
# For a local app we simply set a character limit.
CHAR_LIMIT = 5000

CUDA_AVAILABLE = torch.cuda.is_available()

# Always build a CPU model; add a GPU model only when CUDA is available.
# The dict is keyed by a bool so that models[use_gpu] selects the right one.
models = {
    gpu: KModel().to("cuda" if gpu else "cpu").eval()
    for gpu in [False] + ([True] if CUDA_AVAILABLE else [])
}

# One G2P pipeline per language: "a" = American English, "b" = British English.
pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in "ab"}
pipelines["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"
pipelines["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"
# No Spaces decorator needed; simply call the GPU model directly.
def forward_gpu(ps, ref_s, speed):
    return models[True](ps, ref_s, speed)


def generate_first(text, voice="af_heart", speed=1, use_gpu=CUDA_AVAILABLE):
    # Return ((sample_rate, audio), tokens) for the first segment only.
    text = text if CHAR_LIMIT is None else text.strip()[:CHAR_LIMIT]
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    use_gpu = use_gpu and CUDA_AVAILABLE
    for _, ps, _ in pipeline(text, voice, speed):
        ref_s = pack[len(ps) - 1]
        try:
            if use_gpu:
                audio = forward_gpu(ps, ref_s, speed)
            else:
                audio = models[False](ps, ref_s, speed)
        except gr.exceptions.Error as e:
            if use_gpu:
                print("Warning:", str(e))
                print("Info: Retrying with CPU. To avoid this error, change Hardware to CPU.")
                audio = models[False](ps, ref_s, speed)
            else:
                raise gr.Error(e)
        return (24000, audio.numpy()), ps
    return None, ""
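# Minimal smoke test, an assumption rather than part of the original Space
# (run by hand from a REPL; `soundfile` is an extra dependency):
#   (sr, wav), tokens = generate_first("Hello from Kokoro!", use_gpu=False)
#   import soundfile as sf
#   sf.write("out.wav", wav, sr)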
def predict(text, voice="af_heart", speed=1):
    # API-style entry point: audio only, always on CPU.
    return generate_first(text, voice, speed, use_gpu=False)[0]


def tokenize_first(text, voice="af_heart"):
    # Return the token/phoneme string for the first segment of `text`.
    pipeline = pipelines[voice[0]]
    for _, ps, _ in pipeline(text, voice):
        return ps
    return ""
def generate_all(text, voice="af_heart", speed=1, use_gpu=CUDA_AVAILABLE):
    # Stream audio segment by segment; used by the Stream tab.
    text = text if CHAR_LIMIT is None else text.strip()[:CHAR_LIMIT]
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    use_gpu = use_gpu and CUDA_AVAILABLE
    for _, ps, _ in pipeline(text, voice, speed):
        ref_s = pack[len(ps) - 1]
        try:
            if use_gpu:
                audio = forward_gpu(ps, ref_s, speed)
            else:
                audio = models[False](ps, ref_s, speed)
        except gr.exceptions.Error as e:
            if use_gpu:
                print("Warning:", str(e))
                print("Info: Switching to CPU")
                audio = models[False](ps, ref_s, speed)
            else:
                raise gr.Error(e)
        yield 24000, audio.numpy()
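# generate_all() is a generator of (sample_rate, np.ndarray) chunks, so an
# offline caller could stitch a full waveform together (hypothetical usage):
#   import numpy as np
#   full = np.concatenate([wav for _, wav in generate_all(long_text, use_gpu=False)])
#   # `full` is 24 kHz mono audio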
# Load random example texts. The FileNotFoundError fallback is a local
# addition (not in the original Space) so the app still starts when
# "en.txt" is missing from the working directory.
random_texts = {}
for lang in ["en"]:
    try:
        with open(f"{lang}.txt", "r") as r:
            random_texts[lang] = [line.strip() for line in r]
    except FileNotFoundError:
        random_texts[lang] = ["The quick brown fox jumps over the lazy dog."]


def get_random_text(voice):
    # Both "a" (American) and "b" (British) voices draw from the English pool.
    lang = {"a": "en", "b": "en"}[voice[0]]
    return random.choice(random_texts[lang])
CHOICES = {
    "🇺🇸 🚺 Heart ❤️": "af_heart",
    "🇺🇸 🚺 Bella 🔥": "af_bella",
    "🇺🇸 🚺 Nicole 🎧": "af_nicole",
    "🇺🇸 🚺 Aoede": "af_aoede",
    "🇺🇸 🚺 Kore": "af_kore",
    "🇺🇸 🚺 Sarah": "af_sarah",
    "🇺🇸 🚺 Nova": "af_nova",
    "🇺🇸 🚺 Sky": "af_sky",
    "🇺🇸 🚺 Alloy": "af_alloy",
    "🇺🇸 🚺 Jessica": "af_jessica",
    "🇺🇸 🚺 River": "af_river",
    "🇺🇸 🚹 Michael": "am_michael",
    "🇺🇸 🚹 Fenrir": "am_fenrir",
    "🇺🇸 🚹 Puck": "am_puck",
    "🇺🇸 🚹 Echo": "am_echo",
    "🇺🇸 🚹 Eric": "am_eric",
    "🇺🇸 🚹 Liam": "am_liam",
    "🇺🇸 🚹 Onyx": "am_onyx",
    "🇺🇸 🚹 Santa": "am_santa",
    "🇺🇸 🚹 Adam": "am_adam",
    "🇬🇧 🚺 Emma": "bf_emma",
    "🇬🇧 🚺 Isabella": "bf_isabella",
    "🇬🇧 🚺 Alice": "bf_alice",
    "🇬🇧 🚺 Lily": "bf_lily",
    "🇬🇧 🚹 George": "bm_george",
    "🇬🇧 🚹 Fable": "bm_fable",
    "🇬🇧 🚹 Lewis": "bm_lewis",
    "🇬🇧 🚹 Daniel": "bm_daniel",
}

# Pre-load every voice so the first synthesis request is not slowed by I/O.
for v in CHOICES.values():
    pipelines[v[0]].load_voice(v)
TOKEN_NOTE = """
💡 Customize pronunciation with Markdown link syntax and /slashes/ like `[Kokoro](/kˈOkəɹO/)`

💬 To adjust intonation, try punctuation `;:,.!?—…"()""`

⬇️ Lower stress `[1 level](-1)` or `[2 levels](-2)`

⬆️ Raise stress 1 level `[or](+2)` 2 levels (only works on less stressed, usually short words)
"""
with gr.Blocks() as generate_tab:
    out_audio = gr.Audio(
        label="Output Audio", interactive=False, streaming=False, autoplay=True
    )
    generate_btn = gr.Button("Generate", variant="primary")
    with gr.Accordion("Output Tokens", open=True):
        out_ps = gr.Textbox(
            interactive=False,
            show_label=False,
            info="Tokens used to generate the audio, up to 510 context length.",
        )
        tokenize_btn = gr.Button("Tokenize", variant="secondary")
        gr.Markdown(TOKEN_NOTE)
    # The Predict button is kept hidden.
    predict_btn = gr.Button("Predict", variant="secondary", visible=False)
STREAM_NOTE = [
    "⚠️ There is an unknown Gradio bug that might yield no audio the first time you click `Stream`."
]
if CHAR_LIMIT is not None:
    STREAM_NOTE.append(f"✂️ Each stream is capped at {CHAR_LIMIT} characters.")
    STREAM_NOTE.append("🚀 Want more characters? You can use Kokoro directly or duplicate this app.")
STREAM_NOTE = "\n\n".join(STREAM_NOTE)
with gr.Blocks() as stream_tab:
    out_stream = gr.Audio(
        label="Output Audio Stream", interactive=False, streaming=True, autoplay=True
    )
    with gr.Row():
        stream_btn = gr.Button("Stream", variant="primary")
        stop_btn = gr.Button("Stop", variant="stop")
    with gr.Accordion("Note", open=True):
        gr.Markdown(STREAM_NOTE)
BANNER_TEXT = """
[***Kokoro*** **is an open-weight TTS model with 82 million parameters.**](https://huggingface.co/hexgrad/Kokoro-82M)

As of January 31st, 2025, Kokoro was the most-liked [**TTS model**](https://huggingface.co/models?pipeline_tag=text-to-speech&sort=likes) and the most-liked [**TTS app**](https://huggingface.co/spaces?sort=likes&search=tts) on Hugging Face.

This demo only showcases English, but you can use the model directly to access other languages.
"""
with gr.Blocks() as app:
    with gr.Row():
        gr.Markdown(BANNER_TEXT, container=True)
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(
                label="Input Text",
                info=f"Up to ~500 characters per Generate, or {CHAR_LIMIT if CHAR_LIMIT is not None else '∞'} characters per Stream",
            )
            with gr.Row():
                voice = gr.Dropdown(
                    list(CHOICES.items()),
                    value="af_heart",
                    label="Voice",
                    info="Quality and availability vary by language",
                )
                # Relabeled for the local version: "ZeroGPU" and its usage
                # quota only apply on Hugging Face Spaces.
                use_gpu = gr.Dropdown(
                    [("GPU 🚀", True), ("CPU 🐌", False)],
                    value=CUDA_AVAILABLE,
                    label="Hardware",
                    info="GPU is usually faster than CPU",
                    interactive=CUDA_AVAILABLE,
                )
            speed = gr.Slider(
                minimum=0.5, maximum=2, value=1, step=0.1, label="Speed"
            )
            random_btn = gr.Button("Random Text", variant="secondary")
        with gr.Column():
            gr.TabbedInterface([generate_tab, stream_tab], ["Generate", "Stream"])

    random_btn.click(fn=get_random_text, inputs=[voice], outputs=[text])
    generate_btn.click(
        fn=generate_first, inputs=[text, voice, speed, use_gpu], outputs=[out_audio, out_ps]
    )
    tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
    stream_event = stream_btn.click(
        fn=generate_all, inputs=[text, voice, speed, use_gpu], outputs=[out_stream]
    )
    stop_btn.click(fn=None, cancels=stream_event)
    predict_btn.click(fn=predict, inputs=[text, voice, speed], outputs=[out_audio])
if __name__ == "__main__":
    app.queue().launch(share=True)