Skip to content

Instantly share code, notes, and snippets.

@efemaer
Last active March 20, 2025 15:22
Show Gist options
  • Save efemaer/cedbc629d2cf5bfe1eb35276e5531c4b to your computer and use it in GitHub Desktop.
Save efemaer/cedbc629d2cf5bfe1eb35276e5531c4b to your computer and use it in GitHub Desktop.
# 1. https://docs.beam.cloud/v2/getting-started/quickstart#installation
# 2. beam deploy kokoro_beam.py:generate_speech

from beam import endpoint, env, Image, Output


if env.is_remote():
    from kokoro import KPipeline
    import subprocess
    import uuid


def load_model():
    """Build the Kokoro TTS pipeline on the GPU and pre-load the endpoint's voices.

    Runs once per worker via the Beam ``on_start`` hook; the returned pipeline
    is handed to the endpoint through ``context.on_start_value``.
    """
    pipeline = KPipeline("a", device="cuda:0")
    # Warm the voices we expose so the first request pays no load latency.
    for voice_name in ("af_alloy", "am_onyx", "af_heart"):
        pipeline.load_single_voice(voice_name)
    return pipeline


# Container image for the endpoint: Python 3.11 with the `kokoro` package plus
# the espeak-ng (phonemizer backend) and ffmpeg (MP3 encoding) system packages.
kokoro_image = Image(python_version="python3.11")
kokoro_image = kokoro_image.add_python_packages(["kokoro"])
kokoro_image = kokoro_image.add_commands(["apt update && apt install espeak-ng ffmpeg -y"])


@endpoint(
    name="kokoro-tts",
    on_start=load_model,
    #####################################################################
    # 1 CPU, 1 worker, 4GB RAM for minimum resource required for kokoro
    #####################################################################
    # cpu=1,
    # workers=1,
    # memory="4Gi",
    #####################################################################
    cpu=10,
    workers=10,
    memory="24Gi",
    #####################################################################
    gpu=["RTX4090", "A10G", "A100-40"],
    gpu_count=1,
    image=kokoro_image,
)
def generate_speech(context, **inputs):
    """Synthesize speech with Kokoro and return a public URL to the MP3.

    Request inputs:
        text (str): required text to synthesize.
        voice (str): Kokoro voice id; defaults to "af_alloy".

    Returns:
        dict: {"output_url": <signed URL valid for 1 hour>} on success,
        or {"error": <message>} on missing text or encoding failure.
    """
    pipeline = context.on_start_value

    text = inputs.pop("text", None)
    voice = inputs.pop("voice", "af_alloy")

    if not text:
        return {"error": "Please provide text to generate speech"}

    generator = pipeline(text, voice=voice)

    mp3_file = f"/tmp/kokoro_tts_out_{uuid.uuid4()}.mp3"

    # Use ffmpeg as a pipe (no intermediate WAV files)
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",  # Overwrite if exists
        "-f",
        "s16le",  # Raw PCM 16-bit little-endian
        "-ar",
        "24000",  # Sample rate
        "-ac",
        "1",  # Mono audio
        "-i",
        "pipe:0",  # Read from stdin (raw audio)
        "-codec:a",
        "libmp3lame",  # MP3 codec
        "-b:a",
        "48k",  # Bitrate
        "-write_xing",
        "0",  # Disable Xing header (which contains duration info)
        "-write_id3v2",
        "1",  # Add ID3v2 header for file recognition
        mp3_file,
    ]

    try:
        with subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE) as ffmpeg_proc:
            for result in generator:
                # Scale float audio (assumed in [-1, 1]) to 16-bit PCM bytes.
                audio_bytes = (
                    (result.audio.cpu().numpy() * 32767)
                    .clip(-32768, 32767)
                    .astype("int16")
                    .tobytes()
                )
                ffmpeg_proc.stdin.write(audio_bytes)

            ffmpeg_proc.stdin.close()
            ffmpeg_proc.wait()  # Ensure ffmpeg finishes encoding

        # BUG FIX: Popen()/wait() never raise CalledProcessError (only
        # check_call/run(check=True) do), so the old handler was dead code and
        # ffmpeg failures were silently ignored. Check the exit code instead.
        if ffmpeg_proc.returncode != 0:
            return {"error": "Failed to convert audio to MP3"}

    except (BrokenPipeError, OSError):
        # ffmpeg exited mid-stream (broken stdin pipe) or could not start.
        return {"error": "Failed to convert audio to MP3"}

    output_file = Output(path=mp3_file)
    output_file.save()
    public_url = output_file.public_url(expires=3600)

    # Drop the local reference; the model stays cached via the on_start hook.
    del pipeline

    return {"output_url": public_url}
@efemaer
Copy link
Author

efemaer commented Feb 19, 2025

Yes, tested just now works as expected

@chandradeepc
Copy link

chandradeepc commented Feb 20, 2025

Fixed a bug with MP3 metadata encoding, and set the Beam config for the maximum concurrency possible on a 24GB GPU. This config costs $3/hr. Reduce to 1 CPU, 1 worker, and 4GB RAM for the minimum resources required for Kokoro; this minimum config costs $0.8/hr.



from beam import endpoint, env, Image, Output

if env.is_remote():
    from kokoro import KPipeline
    import subprocess
    import uuid
    import os


def load_model():
    """Create the CUDA-backed Kokoro pipeline and warm the voices this endpoint serves.

    Invoked once per worker by Beam's ``on_start``; the result is delivered to
    the handler as ``context.on_start_value``.
    """
    pipeline = KPipeline("a", device="cuda:0")
    for voice_name in ("af_alloy", "am_onyx", "af_heart"):
        pipeline.load_single_voice(voice_name)
    return pipeline


# Endpoint container image: Python 3.11, the `kokoro` package, and the
# espeak-ng + ffmpeg system dependencies.
# NOTE(review): the name is misspelled ("kokor_image") but is referenced by
# the @endpoint decorator below, so it is kept as-is.
kokor_image = Image(python_version="python3.11")
kokor_image = kokor_image.add_python_packages(["kokoro"])
kokor_image = kokor_image.add_commands(["apt update && apt install espeak-ng ffmpeg -y"])


@endpoint(
    name="kokoro-tts",
    on_start=load_model,
    cpu=10,
    workers=10,
    memory="24Gi",
    gpu=["RTX4090", "A10G", "A100-40"],
    gpu_count=1,
    image=kokor_image,
)
def generate_speech(context, **inputs):
    """Synthesize speech with Kokoro and return a public URL to the MP3.

    Request inputs:
        text (str): required text to synthesize.
        voice (str): Kokoro voice id; defaults to "af_alloy".

    Returns:
        dict: {"output_url": <signed URL valid for 1 hour>} on success,
        or {"error": <message>} on missing text or encoding failure.
    """
    pipeline = context.on_start_value

    text = inputs.pop("text", None)
    voice = inputs.pop("voice", "af_alloy")

    if not text:
        return {"error": "Please provide text to generate speech"}

    generator = pipeline(text, voice=voice)

    mp3_file = f"/tmp/kokoro_tts_out_{uuid.uuid4()}.mp3"

    # Use ffmpeg as a pipe (no intermediate WAV files)
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",  # Overwrite if exists
        "-f",
        "s16le",  # Raw PCM 16-bit little-endian
        "-ar",
        "24000",  # Sample rate
        "-ac",
        "1",  # Mono audio
        "-i",
        "pipe:0",  # Read from stdin (raw audio)
        "-codec:a",
        "libmp3lame",  # MP3 codec
        "-b:a",
        "48k",  # Bitrate
        "-write_xing",
        "0",  # Disable Xing header (which contains duration info)
        "-write_id3v2",
        "1",  # Add ID3v2 header for file recognition
        mp3_file,
    ]

    try:
        with subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE) as ffmpeg_proc:
            for result in generator:
                # Scale float audio (assumed in [-1, 1]) to 16-bit PCM bytes.
                audio_bytes = (
                    (result.audio.cpu().numpy() * 32767)
                    .clip(-32768, 32767)
                    .astype("int16")
                    .tobytes()
                )
                ffmpeg_proc.stdin.write(audio_bytes)

            ffmpeg_proc.stdin.close()
            ffmpeg_proc.wait()  # Ensure ffmpeg finishes encoding

        # BUG FIX: Popen()/wait() never raise CalledProcessError (only
        # check_call/run(check=True) do), so the old handler was dead code and
        # ffmpeg failures were silently ignored. Check the exit code instead.
        if ffmpeg_proc.returncode != 0:
            return {"error": "Failed to convert audio to MP3"}

    except (BrokenPipeError, OSError):
        # ffmpeg exited mid-stream (broken stdin pipe) or could not start.
        return {"error": "Failed to convert audio to MP3"}

    output_file = Output(path=mp3_file)
    output_file.save()
    public_url = output_file.public_url(expires=3600)

    #  Cleanup pipeline memory (optional)
    del pipeline

    return {"output_url": public_url}

@efemaer
Copy link
Author

efemaer commented Feb 20, 2025

Thanks for the input, @chandradeepc! Updated the gist to reflect that as an option.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment