Skip to content

Instantly share code, notes, and snippets.

@efemaer
Last active March 20, 2025 15:22
Show Gist options
  • Save efemaer/cedbc629d2cf5bfe1eb35276e5531c4b to your computer and use it in GitHub Desktop.
Save efemaer/cedbc629d2cf5bfe1eb35276e5531c4b to your computer and use it in GitHub Desktop.
# 1. https://docs.beam.cloud/v2/getting-started/quickstart#installation
# 2. beam deploy kokoro_beam.py:generate_speech

from beam import endpoint, env, Image, Output


if env.is_remote():
    from kokoro import KPipeline
    import subprocess
    import uuid


def load_model():
    """Build the Kokoro TTS pipeline on the GPU and pre-load the endpoint's voices.

    Runs once per worker via the Beam ``on_start`` hook; the returned pipeline
    is handed to the endpoint through ``context.on_start_value``.
    """
    pipeline = KPipeline("a", device="cuda:0")
    # Warm the voices we expose so the first request pays no load latency.
    for voice_name in ("af_alloy", "am_onyx", "af_heart"):
        pipeline.load_single_voice(voice_name)
    return pipeline


# Container image for the endpoint: Python 3.11 with the `kokoro` package plus
# the espeak-ng (phonemizer backend) and ffmpeg (MP3 encoding) system packages.
kokoro_image = Image(python_version="python3.11")
kokoro_image = kokoro_image.add_python_packages(["kokoro"])
kokoro_image = kokoro_image.add_commands(["apt update && apt install espeak-ng ffmpeg -y"])


@endpoint(
    name="kokoro-tts",
    on_start=load_model,
    #####################################################################
    # 1 CPU, 1 worker, 4GB RAM for minimum resource required for kokoro
    #####################################################################
    # cpu=1,
    # workers=1,
    # memory="4Gi",
    #####################################################################
    cpu=10,
    workers=10,
    memory="24Gi",
    #####################################################################
    gpu=["RTX4090", "A10G", "A100-40"],
    gpu_count=1,
    image=kokoro_image,
)
def generate_speech(context, **inputs):
    """Synthesize speech with Kokoro and return a public URL to the MP3.

    Request inputs:
        text (str): required text to synthesize.
        voice (str): Kokoro voice id; defaults to "af_alloy".

    Returns:
        dict: {"output_url": <signed URL valid for 1 hour>} on success,
        or {"error": <message>} on missing text or encoding failure.
    """
    pipeline = context.on_start_value

    text = inputs.pop("text", None)
    voice = inputs.pop("voice", "af_alloy")

    if not text:
        return {"error": "Please provide text to generate speech"}

    generator = pipeline(text, voice=voice)

    mp3_file = f"/tmp/kokoro_tts_out_{uuid.uuid4()}.mp3"

    # Use ffmpeg as a pipe (no intermediate WAV files)
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",  # Overwrite if exists
        "-f",
        "s16le",  # Raw PCM 16-bit little-endian
        "-ar",
        "24000",  # Sample rate
        "-ac",
        "1",  # Mono audio
        "-i",
        "pipe:0",  # Read from stdin (raw audio)
        "-codec:a",
        "libmp3lame",  # MP3 codec
        "-b:a",
        "48k",  # Bitrate
        "-write_xing",
        "0",  # Disable Xing header (which contains duration info)
        "-write_id3v2",
        "1",  # Add ID3v2 header for file recognition
        mp3_file,
    ]

    try:
        with subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE) as ffmpeg_proc:
            for result in generator:
                # Scale float audio (assumed in [-1, 1]) to 16-bit PCM bytes.
                audio_bytes = (
                    (result.audio.cpu().numpy() * 32767)
                    .clip(-32768, 32767)
                    .astype("int16")
                    .tobytes()
                )
                ffmpeg_proc.stdin.write(audio_bytes)

            ffmpeg_proc.stdin.close()
            ffmpeg_proc.wait()  # Ensure ffmpeg finishes encoding

        # BUG FIX: Popen()/wait() never raise CalledProcessError (only
        # check_call/run(check=True) do), so the old handler was dead code and
        # ffmpeg failures were silently ignored. Check the exit code instead.
        if ffmpeg_proc.returncode != 0:
            return {"error": "Failed to convert audio to MP3"}

    except (BrokenPipeError, OSError):
        # ffmpeg exited mid-stream (broken stdin pipe) or could not start.
        return {"error": "Failed to convert audio to MP3"}

    output_file = Output(path=mp3_file)
    output_file.save()
    public_url = output_file.public_url(expires=3600)

    # Drop the local reference; the model stays cached via the on_start hook.
    del pipeline

    return {"output_url": public_url}
@efemaer
Copy link
Author

efemaer commented Feb 19, 2025

Yes, tested just now works as expected

@chandradeepc
Copy link

chandradeepc commented Feb 20, 2025

Fixed a bug with MP3 metadata encoding, and set the Beam config for the maximum concurrency possible on a 24GB GPU. This config costs $3/hr. Reduce to 1 CPU, 1 worker, and 4GB RAM for the minimum resources required for Kokoro; this minimum config costs $0.8/hr.



from beam import endpoint, env, Image, Output

if env.is_remote():
    from kokoro import KPipeline
    import subprocess
    import uuid
    import os


def load_model():
    """Create the CUDA-backed Kokoro pipeline and warm the voices this endpoint serves.

    Invoked once per worker by Beam's ``on_start``; the result is delivered to
    the handler as ``context.on_start_value``.
    """
    pipeline = KPipeline("a", device="cuda:0")
    for voice_name in ("af_alloy", "am_onyx", "af_heart"):
        pipeline.load_single_voice(voice_name)
    return pipeline


# Endpoint container image: Python 3.11, the `kokoro` package, and the
# espeak-ng + ffmpeg system dependencies.
# NOTE(review): the name is misspelled ("kokor_image") but is referenced by
# the @endpoint decorator below, so it is kept as-is.
kokor_image = Image(python_version="python3.11")
kokor_image = kokor_image.add_python_packages(["kokoro"])
kokor_image = kokor_image.add_commands(["apt update && apt install espeak-ng ffmpeg -y"])


@endpoint(
    name="kokoro-tts",
    on_start=load_model,
    cpu=10,
    workers=10,
    memory="24Gi",
    gpu=["RTX4090", "A10G", "A100-40"],
    gpu_count=1,
    image=kokor_image,
)
def generate_speech(context, **inputs):
    """Synthesize speech with Kokoro and return a public URL to the MP3.

    Request inputs:
        text (str): required text to synthesize.
        voice (str): Kokoro voice id; defaults to "af_alloy".

    Returns:
        dict: {"output_url": <signed URL valid for 1 hour>} on success,
        or {"error": <message>} on missing text or encoding failure.
    """
    pipeline = context.on_start_value

    text = inputs.pop("text", None)
    voice = inputs.pop("voice", "af_alloy")

    if not text:
        return {"error": "Please provide text to generate speech"}

    generator = pipeline(text, voice=voice)

    mp3_file = f"/tmp/kokoro_tts_out_{uuid.uuid4()}.mp3"

    # Use ffmpeg as a pipe (no intermediate WAV files)
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",  # Overwrite if exists
        "-f",
        "s16le",  # Raw PCM 16-bit little-endian
        "-ar",
        "24000",  # Sample rate
        "-ac",
        "1",  # Mono audio
        "-i",
        "pipe:0",  # Read from stdin (raw audio)
        "-codec:a",
        "libmp3lame",  # MP3 codec
        "-b:a",
        "48k",  # Bitrate
        "-write_xing",
        "0",  # Disable Xing header (which contains duration info)
        "-write_id3v2",
        "1",  # Add ID3v2 header for file recognition
        mp3_file,
    ]

    try:
        with subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE) as ffmpeg_proc:
            for result in generator:
                # Scale float audio (assumed in [-1, 1]) to 16-bit PCM bytes.
                audio_bytes = (
                    (result.audio.cpu().numpy() * 32767)
                    .clip(-32768, 32767)
                    .astype("int16")
                    .tobytes()
                )
                ffmpeg_proc.stdin.write(audio_bytes)

            ffmpeg_proc.stdin.close()
            ffmpeg_proc.wait()  # Ensure ffmpeg finishes encoding

        # BUG FIX: Popen()/wait() never raise CalledProcessError (only
        # check_call/run(check=True) do), so the old handler was dead code and
        # ffmpeg failures were silently ignored. Check the exit code instead.
        if ffmpeg_proc.returncode != 0:
            return {"error": "Failed to convert audio to MP3"}

    except (BrokenPipeError, OSError):
        # ffmpeg exited mid-stream (broken stdin pipe) or could not start.
        return {"error": "Failed to convert audio to MP3"}

    output_file = Output(path=mp3_file)
    output_file.save()
    public_url = output_file.public_url(expires=3600)

    #  Cleanup pipeline memory (optional)
    del pipeline

    return {"output_url": public_url}

@efemaer
Copy link
Author

efemaer commented Feb 20, 2025

Thanks for the input, @chandradeepc! Updated the gist to reflect that as an option.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment