@xdevfaheem
Last active February 22, 2025 15:35
A powerful text-to-speech and high-fidelity voice cloning application with precise emotional control. Convert written text (practically unlimited length, thanks to chunked generation and streaming) or documents (TXT, PDF, XLSX, DOCX) into speech using any voice sample, while preserving a consistent tone.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/xdevfaheem/be48be88efd1eaf9809b0e8f8462d660/zonos.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "XmSb6NyB5O1g"
},
"source": [
"# **Zonos TTS + Voice Cloning**"
]
},
{
"cell_type": "markdown",
"source": [
"Zyphra's Zonos-v0.1 is an expressive, real-time TTS model. They have released two variants: a transformer-based one and a hybrid one (transformer + Mamba2 blocks). The hybrid model is more efficient and performs better (lower latency and memory overhead). More on that in their [**blog**](https://www.zyphra.com/post/beta-release-of-zonos-v0-1).\n",
"\n",
"Can't wait for their future release!"
],
"metadata": {
"id": "LglbLZ0f4LWA"
}
},
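{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of how either variant could be loaded and run once the setup below is done, using the same Zonos API as the `VoxCloner` class later in this notebook (the hybrid model id and the file names here are assumptions):\n",
"\n",
"```python\n",
"import torch\n",
"import torchaudio\n",
"from zonos.model import Zonos\n",
"from zonos.conditioning import make_cond_dict\n",
"from zonos.utils import DEFAULT_DEVICE as device\n",
"\n",
"# Load one of the two released variants (the hybrid id is assumed to follow the same naming scheme)\n",
"model = Zonos.from_pretrained(\"Zyphra/Zonos-v0.1-transformer\", device=device)\n",
"# model = Zonos.from_pretrained(\"Zyphra/Zonos-v0.1-hybrid\", device=device)\n",
"\n",
"# Build a speaker embedding from a short reference clip (hypothetical file name)\n",
"wav, sr = torchaudio.load(\"speaker.wav\")\n",
"speaker = model.make_speaker_embedding(wav, sr)\n",
"\n",
"# Condition on text + speaker, generate audio codes, then decode to a waveform\n",
"cond = make_cond_dict(text=\"Hello from Zonos.\", speaker=speaker, language=\"en-us\")\n",
"with torch.no_grad():\n",
"    codes = model.generate(model.prepare_conditioning(cond))\n",
"audio = model.autoencoder.decode(codes).cpu()[0]  # (channels, samples)\n",
"torchaudio.save(\"sample.wav\", audio, model.autoencoder.sampling_rate)\n",
"```"
]
},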
{
"cell_type": "markdown",
"metadata": {
"id": "SJbYr8aP31m3"
},
"source": [
"## **Installation & Setup**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "kSdOTOs7COKe"
},
"outputs": [],
"source": [
"# Install system dependencies\n",
"!apt update && apt install -y espeak-ng\n",
"\n",
"# Clone Zonos repository\n",
"!git clone https://github.com/Zyphra/Zonos.git"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_CoWe42j5l3-"
},
"outputs": [],
"source": [
"%cd Zonos\n",
"!uv pip install -e . --system\n",
"!uv pip install -e .[compile] --system\n",
"%cd ..\n",
"\n",
"!uv pip install spacy-layout --system\n",
"!uv pip install gradio --system"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "sri420KlWxk_"
},
"outputs": [],
"source": [
"!python -m spacy download en_core_web_trf"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "5dmP8omT38Zc"
},
"source": [
"## **Core**"
]
},
{
"cell_type": "markdown",
"source": [
"The core API used by the UI below: `VoxCloner` loads the model, chunks the input content, and generates speech chunk by chunk."
],
"metadata": {
"id": "LZDeCP7T6bp-"
}
},
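{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sketch of the intended usage (the class is defined in the next cell and exercised in the **Test** section; the file names here are placeholders):\n",
"\n",
"```python\n",
"cloner = VoxCloner()  # loads Zyphra/Zonos-v0.1-transformer by default\n",
"output_path = cloner.generate_speech(\n",
"    content_path=\"notes.pdf\",          # raw text or a TXT/PDF/XLSX/DOCX path\n",
"    speaker_audio_path=\"speaker.wav\",  # reference voice sample to clone\n",
")\n",
"```"
]
},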
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "GDXRDToQFRl-"
},
"outputs": [],
"source": [
"import torch\n",
"import torchaudio\n",
"import numpy as np\n",
"from zonos.model import Zonos\n",
"from zonos.conditioning import make_cond_dict\n",
"from zonos.utils import DEFAULT_DEVICE as device\n",
"import spacy\n",
"from spacy_layout import spaCyLayout\n",
"from IPython.display import Audio\n",
"import os\n",
"\n",
"class VoxCloner:\n",
"\n",
" def __init__(self, model_id=\"Zyphra/Zonos-v0.1-transformer\", seed=121, word_per_chunk_seconds=32):\n",
" self.word_limit_per_chunk = word_per_chunk_seconds\n",
" self.device = device\n",
" self.speaker_audio_path = \"\"\n",
" self.speaker_embedding = None\n",
" self.progress_callback = None\n",
"\n",
" # Force deterministic settings\n",
" torch.backends.cudnn.deterministic = True\n",
" torch.backends.cudnn.benchmark = False\n",
" torch.manual_seed(seed)\n",
" if torch.cuda.is_available():\n",
" torch.cuda.manual_seed_all(seed)\n",
"\n",
" # free up memory if model is already initialized\n",
" torch.cuda.empty_cache()\n",
" self.model = Zonos.from_pretrained(model_id, device=device)\n",
" self.model.requires_grad_(False).eval()\n",
" print(\"Model Loaded on device: \", device)\n",
"\n",
" def process(self, file: str, words_per_chunk_seconds = 32):\n",
"\n",
" nlp = spacy.load(\"en_core_web_trf\")\n",
"\n",
" # Extract content\n",
" if file.endswith(\".txt\") and os.path.isfile(file):\n",
" with open(file, \"r\") as f:\n",
" content = f.read()\n",
" elif (file.endswith(\".pdf\") or file.endswith(\".xlsx\") or file.endswith(\"docx\")) and os.path.isfile(file):\n",
" parser = spaCyLayout(nlp)\n",
" doc = parser(file)\n",
" content = doc.text\n",
" elif not os.path.isfile(file):\n",
" content = file\n",
" else:\n",
" raise Exception(\"Unsupported file type, Come on there! why not, have your content in txt or pdf? it's simple and convinient isn't it?\")\n",
"\n",
" num_words = 0\n",
" current_split = 0\n",
" chunks = []\n",
"\n",
" # split te extracted content (using on spacy pre-trained transformers model)\n",
" # later: https://github.com/segment-any-text/wtpsplit\n",
" doc = nlp(content)\n",
" splits = [sent.text.strip() for sent in doc.sents]\n",
"\n",
" # generate chunks with words less the word count limit\n",
" for i in range(len(splits)):\n",
" sent = splits[i]\n",
" sentence_per_words = len(sent.split())\n",
" if num_words+sentence_per_words >= self.word_limit_per_chunk:\n",
" text = \" \".join(splits[current_split:i])\n",
" chunks.append(text)\n",
" num_words = 0\n",
" current_split=i\n",
" num_words+=sentence_per_words\n",
"\n",
" chunks.append(\" \".join(splits[current_split:])) # flush the remaining splits\n",
"\n",
" print(chunks);return chunks\n",
"\n",
" def concatenate_audio(self, audio_segments, silence_duration=0.2):\n",
" # concat audio segments + slight silence, into a single audio waveform\n",
" silence = torch.zeros(1, int(self.model.autoencoder.sampling_rate * silence_duration))\n",
" concatenated = [] # list of 2d (1, n) tensors to be concat along the column dim for single channel audio array\n",
" for audio in audio_segments:\n",
" concatenated.append(audio)\n",
" concatenated.append(silence)\n",
" return torch.cat(concatenated, dim=1)\n",
"\n",
" def generate_speech(\n",
" self,\n",
" content_path: str,\n",
" speaker_audio_path=None,\n",
" language=\"en-us\",\n",
" word_limit=30,\n",
" cfg_scale: float = 2.0,\n",
" top_p: float = 0.0,\n",
" top_k: int = 0,\n",
" min_p: float = 0.0,\n",
" e1: float = 1.0, # Happiness\n",
" e2: float = 0.05, # Sadness\n",
" e3: float = 0.05, # Disgust\n",
" e4: float = 0.05, # Fear\n",
" e5: float = 0.05, # Surprise\n",
" e6: float = 0.05, # Anger\n",
" e7: float = 0.1, # Other\n",
" e8: float = 0.2, # Neutral\n",
" # https://github.com/Zyphra/Zonos/commit/a09ff4fa50cfa66bf79986e19c191f85f5cb53e8\n",
" linear: float = 0.5, # High values make the output less random.\n",
" confidence: float = 0.40, # Low values make random outputs more random.\n",
" quadratic: float = 0.00, # High values make low probablities much lower.\n",
" ):\n",
"\n",
" # Load reference audio\n",
" # generate speaker embedding for one audio once. TODO: could be more stricter by checking the audio arrays instead of path\n",
" if speaker_audio_path and speaker_audio_path!=self.speaker_audio_path:\n",
" wav, sampling_rate = torchaudio.load(speaker_audio_path)\n",
" self.speaker_embedding = self.model.make_speaker_embedding(wav, sampling_rate).to(self.device, dtype=torch.bfloat16)\n",
" self.speaker_audio_path = speaker_audio_path\n",
"\n",
" # Fixed emotion tensor\n",
" emotion_tensor = torch.tensor(\n",
" [e1, e2, e3, e4, e5, e6, e7, e8],\n",
" device=device,\n",
" dtype=torch.float32\n",
" )\n",
"\n",
" audio_segments = []\n",
" for chunk in text_chunks:\n",
" if not chunk.strip(): continue\n",
"\n",
" if self.progress_callback:\n",
" self.progress_callback(f\"Generating audio segment {len(audio_segments)+1}/{total_chunks}...\")\n",
"\n",
" # consistent conditioning\n",
" cond_dict = make_cond_dict(\n",
" text=chunk.strip(),\n",
" language=language,\n",
" speaker=self.speaker_embedding,\n",
" emotion=emotion_tensor,\n",
" device=self.device,\n",
" # Only disable these if your model supports them\n",
" unconditional_keys=[\"vqscore_8\", \"dnsmos_ovrl\"]\n",
" )\n",
"\n",
" conditioning = self.model.prepare_conditioning(cond_dict)\n",
"\n",
" # generation with static conditioning\n",
" with torch.no_grad():\n",
" codes = self.model.generate(\n",
" prefix_conditioning=conditioning,\n",
" max_new_tokens=86 * 30,\n",
" cfg_scale=cfg_scale,\n",
" batch_size=1,\n",
" sampling_params=dict(min_p=min_p, linear=linear, conf=confidence, quad=quadratic),\n",
" )\n",
"\n",
" # Audio processing (converting the output audio tensor to 2d (1, n) waveform tensor)\n",
" wav_out = self.model.autoencoder.decode(codes).cpu().detach()\n",
" if wav_out.dim() == 3: wav_out = wav_out.squeeze(0)\n",
" if wav_out.dim() == 1: wav_out = wav_out.unsqueeze(0)\n",
" if wav_out.dim() == 2 and wav_out.size(0) > 1:\n",
" wav_out = wav_out[0:1, :]\n",
" audio_segments.append(wav_out)\n",
"\n",
" if self.progress_callback:\n",
" self.progress_callback(\"Generation complete!\")\n",
"\n",
" generated_audio_path = \"output_audio.wav\"\n",
" final_audio = self.concatenate_audio(audio_segments)\n",
" torchaudio.save(generated_audio_path, final_audio, self.model.autoencoder.sampling_rate)\n",
" return generated_audio_path"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "XKTMtrET_UpT"
},
"source": [
"## **Test**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "fZJ75BTqTrzg"
},
"outputs": [],
"source": [
"voice_cloner = VoxCloner()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4SfexUaGJQEc"
},
"outputs": [],
"source": [
"input_voice_file = \"/content/harvard.wav\"\n",
"input_content = \"This is a example for voice cloning, How is it, huh?\"\n",
"print(\"Input sample audio\")\n",
"Audio(input_voice_file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "vBjFI2dIV4TT"
},
"outputs": [],
"source": [
"# test para from somewhere\n",
"test_text = \"\"\"\n",
" Developing writers can often benefit from examining an essay, a paragraph, or even a sentence to determine what makes it effective.\n",
" On the following pages are several paragraphs for you to evaluate on your own, along with the Writing Center's explanation.\n",
"\"\"\"\n",
"\n",
"output_audio_path = voice_cloner.generate_speech(\n",
" content=\"/content/EliteNotesPOC.pdf\",\n",
" speaker_audio_path=input_voice_file)\n",
"\n",
"print(\"Generated audio\")\n",
"Audio(output_audio_path)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Q5cIYs_03_ft"
},
"source": [
"## **UI**"
]
},
{
"cell_type": "markdown",
"source": [
"### The UI has only few advanced setting for now"
],
"metadata": {
"id": "let670fS6_d6"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1v664KHE4A1t"
},
"outputs": [],
"source": [
"import gradio as gr\n",
"import os\n",
"import tempfile\n",
"import torch\n",
"import torchaudio\n",
"import numpy as np\n",
"from zonos.model import Zonos\n",
"from zonos.conditioning import make_cond_dict\n",
"from zonos.utils import DEFAULT_DEVICE as device\n",
"import spacy\n",
"import traceback\n",
"from spacy_layout import spaCyLayout\n",
"from IPython.display import Audio\n",
"import os\n",
"\n",
"MODEL = None\n",
"SPEAKER_AUDIO_PATH = \"\"\n",
"SPEAKER_EMBEDDING = None\n",
"\n",
"# Force deterministic settings\n",
"torch.backends.cudnn.deterministic = True\n",
"torch.backends.cudnn.benchmark = False"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "z5IZ3EeP9JaT"
},
"outputs": [],
"source": [
"def save_uploaded_file(file):\n",
"\n",
" temp_dir = tempfile.gettempdir()\n",
" temp_path = os.path.join(temp_dir, \"uploaded_\" + os.path.basename(file.name))\n",
"\n",
" with open(temp_path, \"wb\") as f:\n",
" f.write(file.read())\n",
" return temp_path\n",
"\n",
"def adjust_other_emotions(changed_value, changed_name, happiness, sadness, disgust, fear, surprise, anger, other, neutral):\n",
" \"\"\"Simple function to adjust all emotions when one changes\"\"\"\n",
" # Create dictionary of current values\n",
" values = {\n",
" \"happiness\": happiness,\n",
" \"sadness\": sadness,\n",
" \"disgust\": disgust,\n",
" \"fear\": fear,\n",
" \"surprise\": surprise,\n",
" \"anger\": anger,\n",
" \"other\": other,\n",
" \"neutral\": neutral\n",
" }\n",
"\n",
" # Update the changed value\n",
" values[changed_name] = changed_value\n",
"\n",
" # Calculate how much we need to adjust others\n",
" total = sum(values.values())\n",
" if total == 0:\n",
" return [0.125] * 8 # Equal distribution if all zero\n",
"\n",
" # Adjust other values proportionally\n",
" scale = (1 - values[changed_name]) / (total - values[changed_name])\n",
" for name in values:\n",
" if name != changed_name:\n",
" values[name] = round(values[name] * scale, 2)\n",
"\n",
" # Return values in the same order\n",
" return [\n",
" values[\"happiness\"],\n",
" values[\"sadness\"],\n",
" values[\"disgust\"],\n",
" values[\"fear\"],\n",
" values[\"surprise\"],\n",
" values[\"anger\"],\n",
" values[\"other\"],\n",
" values[\"neutral\"]\n",
" ]\n",
"\n",
"def initiate_model(model_type=\"transformer\", seed=121):\n",
" global MODEL\n",
"\n",
" if MODEL is None:\n",
" if torch.cuda.is_available(): torch.cuda.empty_cache()\n",
" MODEL = Zonos.from_pretrained(f\"Zyphra/Zonos-v0.1-{model_type}\", device=device)\n",
" MODEL.requires_grad_(False).eval()\n",
"\n",
"def process_file(file: str, words_per_chunk_seconds = 32):\n",
"\n",
" nlp = spacy.load(\"en_core_web_trf\")\n",
"\n",
" # Extract content\n",
" if file.endswith(\".txt\") and os.path.isfile(file):\n",
" with open(file, \"r\") as f:\n",
" content = f.read()\n",
" elif (file.endswith(\".pdf\") or file.endswith(\".xlsx\") or file.endswith(\"docx\")) and os.path.isfile(file):\n",
" parser = spaCyLayout(nlp)\n",
" doc = parser(file)\n",
" content = doc.text\n",
" elif not os.path.isfile(file):\n",
" content = file\n",
" else:\n",
" raise Exception(\"Unsupported file type, Come on there! why not, have your content in txt or pdf? it's simple and convinient isn't it?\")\n",
"\n",
" num_words = 0\n",
" current_split = 0\n",
" chunks = []\n",
"\n",
" # split te extracted content (using on spacy pre-trained transformers model)\n",
" # later: https://github.com/segment-any-text/wtpsplit\n",
" doc = nlp(content)\n",
" splits = [sent.text.strip() for sent in doc.sents]\n",
"\n",
" # generate chunks with words less the word count limit\n",
" for i in range(len(splits)):\n",
" sent = splits[i]\n",
" sentence_per_words = len(sent.split())\n",
" if num_words+sentence_per_words >= words_per_chunk_seconds:\n",
" text = \" \".join(splits[current_split:i])\n",
" chunks.append(text)\n",
" num_words = 0\n",
" current_split=i\n",
" num_words+=sentence_per_words\n",
"\n",
" chunks.append(\" \".join(splits[current_split:])) # flush the remaining splits\n",
" return chunks\n",
"\n",
"def concatenate_audio(audio_segments):\n",
" # concat audio segments + slight silence, into a single audio waveform\n",
" silence = torch.zeros(1, int(MODEL.autoencoder.sampling_rate * 0.4))\n",
" concatenated = [] # list of 2d (1, n) tensors to be concat along the column dim for single channel audio array\n",
" for audio in audio_segments:\n",
" concatenated.append(audio)\n",
" concatenated.append(silence)\n",
" return torch.cat(concatenated, dim=1)\n",
"\n",
"def clone_voice(\n",
" input_file,\n",
" input_text,\n",
" speaker_audio_path,\n",
" language,\n",
" word_limit,\n",
" top_p,\n",
" top_k,\n",
" min_p,\n",
" happiness,\n",
" sadness,\n",
" disgust,\n",
" fear,\n",
" surprise,\n",
" anger,\n",
" other,\n",
" neutral,\n",
" cfg_scale,\n",
" linear,\n",
" confidence,\n",
" quadratic,\n",
"):\n",
" global MODEL, SPEAKER_AUDIO_PATH, SPEAKER_EMBEDDING\n",
"\n",
" seed = 1234\n",
" torch.manual_seed(seed)\n",
" if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)\n",
"\n",
" print(\"Input Audio File:\", speaker_audio)\n",
" print(\"Input Text:\", input_text)\n",
" print(\"Language:\", language)\n",
" print(\"Word Limit:\", word_limit)\n",
"\n",
" try:\n",
"\n",
" # Handle input content\n",
" content_path = None\n",
" if input_file is not None:\n",
" yield None, \"Processing input file...\"\n",
" content_path = input_file\n",
" elif input_text.strip():\n",
" yield None, \"Processing input text...\"\n",
" # Save text to temporary file\n",
" with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:\n",
" f.write(input_text)\n",
" content_path = f.name\n",
"\n",
" if content_path is None:\n",
" raise ValueError(\"Please provide either text input or upload a file\")\n",
"\n",
" # Save speaker audio and generate embedding for it once for a file\n",
" if speaker_audio_path is not None and speaker_audio_path!=SPEAKER_AUDIO_PATH:\n",
" SPEAKER_AUDIO_PATH = speaker_audio_path\n",
" wav, sampling_rate = torchaudio.load(SPEAKER_AUDIO_PATH)\n",
" SPEAKER_EMBEDDING = MODEL.make_speaker_embedding(wav, sampling_rate).to(MODEL.device, dtype=torch.bfloat16)\n",
"\n",
" # Fixed emotion tensor\n",
" emotion_tensor = torch.tensor(\n",
" [happiness, sadness, disgust, fear, surprise, anger, other, neutral],\n",
" device=MODEL.device,\n",
" dtype=torch.bfloat16\n",
" )\n",
"\n",
" text_chunks = process_file(content_path, word_limit)\n",
" total_chunks = len(text_chunks)\n",
" yield None, f\"Text split into {total_chunks} chunks. Starting synthesis...\"\n",
" audio_segments = []\n",
" for idx, chunk in enumerate(text_chunks):\n",
" if not chunk.strip(): continue\n",
"\n",
" yield None, f\"Generating chunk {idx}/{total_chunks}...\"\n",
"\n",
" # consistent conditioning\n",
" cond_dict = make_cond_dict(\n",
" text=chunk.strip(),\n",
" language=language,\n",
" speaker=SPEAKER_EMBEDDING,\n",
" emotion=emotion_tensor,\n",
" device=MODEL.device,\n",
" # Only disable these if your model supports them\n",
" unconditional_keys=[\"vqscore_8\", \"dnsmos_ovrl\"]\n",
" )\n",
"\n",
" conditioning = MODEL.prepare_conditioning(cond_dict)\n",
"\n",
" # generation with static conditioning\n",
" with torch.no_grad():\n",
" codes = MODEL.generate(\n",
" prefix_conditioning=conditioning,\n",
" max_new_tokens=86 * 30,\n",
" cfg_scale=cfg_scale,\n",
" batch_size=1,\n",
" sampling_params=dict(min_p=min_p, linear=linear, conf=confidence, quad=quadratic),\n",
" )\n",
"\n",
" # Audio processing (converting the output audio tensor to 2d (1, n) waveform tensor)\n",
" wav_out = MODEL.autoencoder.decode(codes).cpu().detach()\n",
" if wav_out.dim() == 3: wav_out = wav_out.squeeze(0)\n",
" if wav_out.dim() == 1: wav_out = wav_out.unsqueeze(0)\n",
" if wav_out.dim() == 2 and wav_out.size(0) > 1:\n",
" wav_out = wav_out[0:1, :]\n",
" silence = torch.zeros(1, int(MODEL.autoencoder.sampling_rate * 0.4))\n",
" wav_out = torch.cat([wav_out, silence], dim=1)\n",
" #audio_segments.append(wav_out) # aggregate to create a file later\n",
" # current_audio = concatenate_audio(audio_segments).squeeze().numpy()\n",
" # print(current_audio.shape)\n",
" # print(current_audio)\n",
" # print(type(current_audio))\n",
" yield (MODEL.autoencoder.sampling_rate, wav_out.squeeze().numpy()), f\"Generated chunk {idx}/{total_chunks}\"\n",
"\n",
" yield None, \"🎉 Speech generation complete!\"\n",
"\n",
" # if nt yielding, and output is final filepath\n",
" # generated_audio_path = \"output_audio.wav\"\n",
" # final_audio = concatenate_audio(audio_segments)\n",
" # torchaudio.save(generated_audio_path, final_audio, model.autoencoder.sampling_rate)\n",
" # return generated_audio_path, \"🎉 Speech generation complete!\"\n",
"\n",
" except Exception as e:\n",
" error_message = traceback.format_exc()\n",
" yield None, error_message\n",
"\n",
"# Create the Gradio interface\n",
"with gr.Blocks(title=\"Voice Cloning Studio\", theme=gr.themes.Soft()) as demo:\n",
" gr.Markdown(\n",
" \"\"\"\n",
" # 🎤 Voice Cloning Studio\n",
" Convert text or documents to speech using any voice sample, with precise emotional control and advanced generation settings.\n",
" \"\"\"\n",
" )\n",
"\n",
" with gr.Row():\n",
" with gr.Column():\n",
" # Input controls\n",
" with gr.Tab(\"Text Input\"):\n",
" text_input = gr.Textbox(\n",
" label=\"Text to Convert\",\n",
" placeholder=\"Enter the text you want to convert to speech...\",\n",
" lines=5\n",
" )\n",
"\n",
" with gr.Tab(\"File Upload\"):\n",
" file_input = gr.File(\n",
" label=\"Upload Document\",\n",
" file_types=[\".txt\", \".pdf\", \".xlsx\", \".docx\"]\n",
" )\n",
" speaker_audio = gr.Audio(\n",
" label=\"Speaker Voice Sample\",\n",
" type=\"filepath\"\n",
" )\n",
" language = gr.Dropdown(\n",
" label=\"Language Code\",\n",
" choices=[\n",
" (\"English\", \"en-us\"),\n",
" (\"Japanese\", \"ja-jp\"),\n",
" (\"Spanish\", \"es-es\"),\n",
" (\"Korean\", \"ko-kr\"),\n",
" (\"Russian\", \"ru-ru\"),\n",
" (\"French\", \"fr-fr\"),\n",
" (\"German\", \"de-de\"),\n",
" (\"Chinese\", \"zh-cn\"),\n",
" (\"Hindi\", \"hi-in\"),\n",
" (\"Arabic\", \"ar-ae\")\n",
" ],\n",
" value=\"en-us\",\n",
" info=\"Select a language code.\"\n",
" )\n",
"\n",
" with gr.Accordion(\"Advanced Sampling Settings\", open=False):\n",
" with gr.Row():\n",
" with gr.Column():\n",
" word_limit_slider = gr.Slider(\n",
" label=\"Max words/chunk\",\n",
" minimum=15,\n",
" maximum=50,\n",
" value=32,\n",
" step=1,\n",
" info=\"Long text create crappy audio. So we have to chunked generation to mitigate the issue\"\n",
" )\n",
" top_p_slider = gr.Slider(\n",
" label=\"Top P\",\n",
" minimum=0.0,\n",
" maximum=1.0,\n",
" value=0.0,\n",
" step=0.01,\n",
" info=\"Nucleus sampling threshold\"\n",
" )\n",
" top_k_slider = gr.Slider(\n",
" minimum=0,\n",
" maximum=1024,\n",
" value=0,\n",
" step=1,\n",
" label=\"Top K\",\n",
" info=\"Limits the number of tokens considered\"\n",
" )\n",
" min_p_slider = gr.Slider(\n",
" minimum=0.0,\n",
" maximum=1.0,\n",
" value=0.0,\n",
" step=0.01,\n",
" label=\"Min P\",\n",
" info=\"Minimum probability threshold\"\n",
" )\n",
" with gr.Column():\n",
" cfg_scale_slider = gr.Slider(\n",
" label=\"CFG Scale\",\n",
" minimum=1.0,\n",
" maximum=5.0,\n",
" value=2.0,\n",
" step=0.1,\n",
" )\n",
" linear_slider = gr.Slider(\n",
" label=\"Linear\",\n",
" minimum=-2.0,\n",
" maximum=2.0,\n",
" value=0.5,\n",
" step=0.01,\n",
" info=\"Higher values makes outputs less random (produce more consistent/deterministic output), 0 to disable unified sampling\"\n",
" )\n",
"\n",
" confidence_slider = gr.Slider(\n",
" label=\"Confidence\",\n",
" minimum=-2.0,\n",
" maximum=2.0,\n",
" value=0.4,\n",
" step=0.01,\n",
" info=\"Low values increase output randomness (make random outputs more random)\"\n",
" )\n",
" quadratic_slider = gr.Slider(\n",
" minimum=-2.0,\n",
" maximum=2.0,\n",
" value=0.00,\n",
" step=0.01,\n",
" label=\"Quadratic Control\",\n",
" info=\"Higher values make low probablities much lower (reduce probability of unlikely outputs)\"\n",
" )\n",
"\n",
" with gr.Accordion(\"Emotion Controls\", open=True):\n",
" gr.Markdown(\"Adjust emotion values (total will automatically sum to 1.0)\")\n",
" with gr.Row():\n",
" with gr.Column():\n",
" happiness = gr.Slider(label=\"Happiness\", minimum=0, maximum=1, value=0.2, step=0.05)\n",
" sadness = gr.Slider(label=\"Sadness\", minimum=0, maximum=1, value=0.1, step=0.05)\n",
" disgust = gr.Slider(label=\"Disgust\", minimum=0, maximum=1, value=0.1, step=0.05)\n",
" fear = gr.Slider(label=\"Fear\", minimum=0, maximum=1, value=0.1, step=0.05)\n",
" with gr.Column():\n",
" surprise = gr.Slider(label=\"Surprise\", minimum=0, maximum=1, value=0.1, step=0.05)\n",
" anger = gr.Slider(label=\"Anger\", minimum=0, maximum=1, value=0.1, step=0.05)\n",
" other = gr.Slider(label=\"Other\", minimum=0, maximum=1, value=0.1, step=0.05)\n",
" neutral = gr.Slider(label=\"Neutral\", minimum=0, maximum=1, value=0.2, step=0.05)\n",
"\n",
" # List all emotion sliders and their names\n",
" emotion_sliders = [happiness, sadness, disgust, fear, surprise, anger, other, neutral]\n",
" emotion_names = [\"happiness\", \"sadness\", \"disgust\", \"fear\", \"surprise\", \"anger\", \"other\", \"neutral\"]\n",
"\n",
" # Add change handlers for each emotion slider\n",
" for slider, name in zip(emotion_sliders, emotion_names):\n",
" slider.change(\n",
" fn=adjust_other_emotions,\n",
" inputs=[\n",
" slider, # Changed value\n",
" gr.State(name), # Name of changed emotion\n",
" *emotion_sliders # All current values\n",
" ],\n",
" outputs=emotion_sliders\n",
" )\n",
"\n",
" with gr.Column():\n",
" # Output section\n",
" output_audio = gr.Audio(label=\"Generated Speech\", type=\"numpy\", autoplay=False, streaming=True)\n",
" error_box = gr.Textbox(label=\"Status/Error Messages\", visible=True)\n",
"\n",
" # Generate button\n",
" generate_btn = gr.Button(\"Generate Speech\", variant=\"primary\")\n",
"\n",
" # Information about the process\n",
" gr.Markdown(\n",
" \"\"\"\n",
" ### Features:\n",
" - Support for text input or document upload (TXT, PDF, XLSX, DOCX)\n",
" - Custom voice cloning from audio sample\n",
" - Emotional tone control\n",
" - Advanced generation parameters\n",
"\n",
" ### Tips:\n",
" - For best results, use clear reference audio samples (5-15 secs) with minimal to no background noise.\n",
" - Adjust emotions subtly for natural speech\n",
" - Use advanced settings carefully for fine control\n",
"\n",
" ### Supported File Types:\n",
" - Text input directly or TXT, PDF, XLSX, and DOCX files\n",
" - Voice samples in WAV, MP3, WEBM and OGG\n",
"\n",
" Note: Generation may take a few moments depending on the length of the parsed text.\n",
" \"\"\"\n",
" )\n",
"\n",
" # Set up the click event\n",
" generate_btn.click(\n",
" fn=clone_voice,\n",
" inputs=[\n",
" file_input,\n",
" text_input,\n",
" speaker_audio,\n",
" language,\n",
" word_limit_slider,\n",
" top_p_slider,\n",
" top_k_slider,\n",
" min_p_slider,\n",
" happiness,\n",
" sadness,\n",
" disgust,\n",
" fear,\n",
" surprise,\n",
" anger,\n",
" other,\n",
" neutral,\n",
" cfg_scale_slider,\n",
" linear_slider,\n",
" confidence_slider,\n",
" quadratic_slider\n",
" ],\n",
" outputs=[output_audio, error_box]\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "E0x_oRFK4Mpf"
},
"outputs": [],
"source": [
"initiate_model()\n",
"demo.launch(debug=True, share=True)"
]
},
{
"cell_type": "code",
"source": [
"demo.close()"
],
"metadata": {
"id": "5Vo-6CcYtRrU"
},
"execution_count": null,
"outputs": []
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [
"5dmP8omT38Zc",
"XKTMtrET_UpT"
],
"gpuType": "T4",
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
@xdevfaheem

I have created a PR in the Zonos repo. Now we have file and unlimited-content support, along with all the advanced sampling options and the existing functionality.

To run it,

apt install -y espeak-ng
git clone https://github.com/xdevfaheem/Zonos.git -b files_plus_streaming
cd Zonos
uv sync
uv sync --extra compile # optional but needed to run the hybrid
uv pip install -e .
uv run gradio_interface/main.py

And there you have it: a powerful UI for zero-shot TTS with voice cloning, with all these features running on your own machine.

Enjoy!
