Last active
September 29, 2022 20:32
-
-
Save josemarcosrf/9d84e77fc180996198d8a93258904a9f to your computer and use it in GitHub Desktop.
stt-exploratory-telebot.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"collapsed_sections": [ | |
"2_YRR9zHcfln" | |
], | |
"toc_visible": true, | |
"authorship_tag": "ABX9TyP6rKEdw5QiT96JLOzOYXNY", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
}, | |
"accelerator": "GPU" | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/jmrf/9d84e77fc180996198d8a93258904a9f/stt-exploratory-telebot.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Off-line STT exploratory\n", | |
"\n", | |
"We explore 3 different options:\n", | |
"\n", | |
" - [pykaldi](https://github.com/pykaldi/pykaldi)\n", | |
" - Facebook's [wav2letter](https://github.com/flashlight/wav2letter/)\n", | |
" - OpenAI's [whisper](https://github.com/openai/whisper)" | |
], | |
"metadata": { | |
"id": "uT2lztpyRJSC" | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## Setup" | |
], | |
"metadata": { | |
"id": "oavaBVl3cVhG" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# System common deps\n", | |
"!apt-get install -qq \\\n", | |
" sox \\\n", | |
" mediainfo\n", | |
"\n", | |
"# Python common deps\n", | |
"!pip install -qq -U pip\n", | |
"!pip install -qq ffmpeg-python sox" | |
], | |
"metadata": { | |
"id": "m67hDDqqa5Uz", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "a01ea75c-4169-43ef-c4b9-631c29e8f60b" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n", | |
"Processing triggers for mime-support (3.60ubuntu1) ...\n", | |
"\u001b[K |ββββββββββββββββββββββββββββββββ| 2.0 MB 6.9 MB/s \n", | |
"\u001b[?25h\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", | |
"\u001b[0m" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## Helpers" | |
], | |
"metadata": { | |
"id": "EXVIr2nJ48KV" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import os\n", | |
"import glob\n", | |
"import signal\n", | |
"import tempfile\n", | |
"\n", | |
"from contextlib import contextmanager\n", | |
"from subprocess import Popen, PIPE, check_output\n", | |
"\n", | |
"\n", | |
"@contextmanager\n", | |
"def timeout(duration: int):\n", | |
" def timeout_handler(signum, frame):\n", | |
" raise Exception(f\"Block timed out after {duration} seconds\")\n", | |
"\n", | |
" signal.signal(signal.SIGALRM, timeout_handler)\n", | |
" signal.alarm(duration)\n", | |
" try:\n", | |
" yield\n", | |
" finally:\n", | |
" signal.alarm(0)\n", | |
"\n", | |
"\n", | |
"def create_process(cmd):\n", | |
" process = Popen([cmd],\n", | |
" stdin=PIPE, stdout=PIPE, stderr=PIPE,\n", | |
" shell=True, preexec_fn=os.setsid) \n", | |
" return process\n", | |
"\n", | |
"\n", | |
"def read_current_output(process):\n", | |
" stt_symbol = \"|P|:\"\n", | |
" word_separator_symbol = \"|\"\n", | |
"\n", | |
" transcripts = []\n", | |
" output = True\n", | |
" while output:\n", | |
" output = process.stdout.readline().decode()\n", | |
" stderr = process.stderr.readline().decode()\n", | |
"\n", | |
" if output.startswith(stt_symbol):\n", | |
" output = output.replace(stt_symbol, \"\").split(word_separator_symbol)\n", | |
" words = \" \".join([w.strip().replace(\" \", \"\") for w in output])\n", | |
" transcripts.append(words)\n", | |
"\n", | |
" return transcripts" | |
], | |
"metadata": { | |
"id": "NXLShV5Y4-in" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## π€ PyKaldi" | |
], | |
"metadata": { | |
"id": "vh0wQ4s4XRUz" | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Setup" | |
], | |
"metadata": { | |
"id": "2_YRR9zHcfln" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "cp_CYB4NRFvb" | |
}, | |
"outputs": [], | |
"source": [ | |
"!apt-get install -qq -y --no-install-recommends \\\n", | |
" autoconf \\\n", | |
" automake \\\n", | |
" cmake \\\n", | |
" curl \\\n", | |
" gfortran \\\n", | |
" g++\n", | |
"\n", | |
"!pip install -U -qq pip setuptools\n", | |
"!pip install -qq \\\n", | |
" 'coloredlogs==15.0.1' \\\n", | |
" 'numpy==1.21.4' \\\n", | |
" 'pyaudio==0.2.11' \\\n", | |
" 'PyYAML==6.0' \\\n", | |
" 'rich==10.15.2' \\\n", | |
" 'samplerate==0.1.0' \\\n", | |
" 'scipy==1.7.3' \\\n", | |
"'git+https://github.com/wkentaro/[email protected]#egg=gdown'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"%%bash\n", | |
"\n", | |
"cd models/\n", | |
"\n", | |
"# English model\n", | |
"MODEL_FILE=en_160k_nnet3chain_tdnn1f_2048_sp_bi.tar.bz2\n", | |
"if [ ! -f $MODEL_FILE ]; then\n", | |
" wget http://ltdata1.informatik.uni-hamburg.de/pykaldi/$MODEL_FILE\n", | |
" tar xvfj $MODEL_FILE\n", | |
" rm $MODEL_FILE\n", | |
"fi\n", | |
"\n", | |
"cd -" | |
], | |
"metadata": { | |
"id": "XsClxU_xXVT2" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## π Wav2Letter\n", | |
"\n", | |
"We use Facebook's [wav2letter](https://github.com/flashlight/wav2letter/tree/main/recipes/mling_pl) and pre-trained models. wav2letter has been consolidated into [flashlight/app/asr](https://github.com/flashlight/flashlight/tree/main/flashlight/app/asr) which requires this [flashlight commit](https://github.com/flashlight/flashlight/tree/8f7af9ec1188bfd7050c47abfac528d21650890f).\n", | |
"\n", | |
"> π€ [wav2vec-unsupervised-speech-recognition blog post](https://ai.facebook.com/blog/wav2vec-unsupervised-speech-recognition-without-supervision)\n", | |
"\n", | |
"> π‘ [Install and inference colab example](https://github.com/flashlight/wav2letter/blob/main/recipes/mling_pl/mling_model.ipynb)\n" | |
], | |
"metadata": { | |
"id": "wwfS61AVUB92" | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Setup" | |
], | |
"metadata": { | |
"id": "8kS68XLvaUmd" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@markdown First, choose backend to build with\n", | |
"MODEL = \"W2L\"\n", | |
"backend = 'CUDA' #@param [\"CPU\", \"CUDA\"]" | |
], | |
"metadata": { | |
"id": "ED0JFFkvYlu9" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"#### Compile" | |
], | |
"metadata": { | |
"id": "35ye60P0G_1E" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@markdown 1. Clone Flashlight\n", | |
"!git clone https://github.com/flashlight/flashlight.git\n", | |
"\n", | |
"#@markdown 2. install all dependencies for colab notebook\n", | |
"!source flashlight/scripts/colab/colab_install_deps.sh\n", | |
"\n", | |
"#@markdown 3. Export necessary env variables\n", | |
"%env MKLROOT=/opt/intel/mkl\n", | |
"%env ArrayFire_DIR=/opt/arrayfire/share/ArrayFire/cmake\n", | |
"%env DNNL_DIR=/opt/dnnl/dnnl_lnx_2.0.0_cpu_iomp/lib/cmake/dnnl\n", | |
"\n", | |
"#@markdown 4. Compile!\n", | |
"if backend == \"CUDA\":\n", | |
" # Total time: ~13 minutes\n", | |
" !cd flashlight && git checkout d2e1924cb2a2b32b48cc326bb7e332ca3ea54f67 && mkdir -p build && cd build && \\\n", | |
" cmake .. -DCMAKE_BUILD_TYPE=Release \\\n", | |
" -DFL_BUILD_TESTS=OFF \\\n", | |
" -DFL_BUILD_EXAMPLES=OFF \\\n", | |
" -DFL_BUILD_APP_ASR=ON && \\\n", | |
" make -j$(nproc)\n", | |
"\n", | |
"elif backend == \"CPU\":\n", | |
" # Total time: ~14 minutes\n", | |
" !cd flashlight && git checkout d2e1924cb2a2b32b48cc326bb7e332ca3ea54f67 && mkdir -p build && cd build && \\\n", | |
" cmake .. -DFL_BACKEND=CPU \\\n", | |
" -DCMAKE_BUILD_TYPE=Release \\\n", | |
" -DFL_BUILD_TESTS=OFF \\\n", | |
" -DFL_BUILD_EXAMPLES=OFF \\\n", | |
" -DFL_BUILD_APP_ASR=ON && \\\n", | |
" make -j$(nproc)\n", | |
" \n", | |
"else:\n", | |
" raise ValueError(f\"Unknown backend {backend}\")\n", | |
"\n", | |
"\n", | |
"#@markdown 5. Build flashlight\n", | |
"%cd /content/flashlight/build\n", | |
"# !wget https://raw.githubusercontent.com/flashlight/wav2letter/49087d575ddf77aa5a99a01fee980fc00e92c802/recipes/mling_pl/model_with_externally_controlled_reshaping_big_lid.cpp\n", | |
"# !mv model_with_externally_controlled_reshaping_big_lid.cpp mling.cpp\n", | |
"!wget https://raw.githubusercontent.com/flashlight/wav2letter/main/recipes/mling_pl/mling_large.cpp\n", | |
"\n", | |
"# !cmake .. -DFL_PLUGIN_MODULE_SRC_PATH=mling.cpp\n", | |
"!cmake .. -DFL_PLUGIN_MODULE_SRC_PATH=mling_large.cpp\n", | |
"!make\n", | |
"%cd -" | |
], | |
"metadata": { | |
"id": "viydJlAUZ4Ze", | |
"cellView": "form" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"#### Pre-compiled\n", | |
"\n", | |
"If we have a pre-compiled flashlight, we only need to set the env variables and install the system deps." | |
], | |
"metadata": { | |
"id": "JJY8iUbqHFOR" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# @markdown 1. Fetch a pre-compiled flashlight from GDrive\n", | |
"from google.colab import drive\n", | |
"\n", | |
"drive.mount('/gdrive')\n", | |
"\n", | |
"!cp '/gdrive/MyDrive/Colab Notebooks/STT-artifacts/$backend-flashlight.tar.gz' /content/\n", | |
"\n", | |
"#@markdown 2. extract the pre-compiled flashlight\n", | |
"!tar xzf $backend-flashlight.tar.gz\n", | |
"\n", | |
"# set env. vars\n", | |
"%env MKLROOT=/opt/intel/mkl\n", | |
"%env ArrayFire_DIR=/opt/arrayfire/share/ArrayFire/cmake\n", | |
"%env DNNL_DIR=/opt/dnnl/dnnl_lnx_2.0.0_cpu_iomp/lib/cmake/dnnl\n", | |
"\n", | |
"# install system deps\n", | |
"!source flashlight/scripts/colab/colab_install_deps.sh" | |
], | |
"metadata": { | |
"id": "9AKcubTVjj3f", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "653a6a48-ebbb-4503-bf58-cb3c745f6941", | |
"cellView": "form" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Mounted at /gdrive\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Models " | |
], | |
"metadata": { | |
"id": "hEYDiY2OaXwn" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"MODELS_DIR = \"models/wav2vec\"\n", | |
"\n", | |
"!mkdir -p $MODELS_DIR\n", | |
"\n", | |
"#@markdown 1. Download the model checkpoint\n", | |
"!wget \"https://dl.fbaipublicfiles.com/wav2letter/mling_pl/checkpoint_cv_finetune.bin\" -P $MODELS_DIR -qq\n", | |
"\n", | |
"#@markdown 2. Download the tokens\n", | |
"!wget \"https://dl.fbaipublicfiles.com/wav2letter/mling_pl/tokens-all.lst\" -P $MODELS_DIR -qq" | |
], | |
"metadata": { | |
"id": "pkXAoRJDUCFX", | |
"cellView": "form" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Helpers" | |
], | |
"metadata": { | |
"id": "i96ufI3Ucqsc" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@markdown We define the inference function as a command line \n", | |
"#@markdown call from python as an external process\n", | |
"\n", | |
"cmd = \"\"\"\n", | |
"./flashlight/build/bin/asr/fl_asr_test \\\n", | |
" --test={audio_list} \\\n", | |
" --am={audio_model} \\\n", | |
" --tokens={tokens} \\\n", | |
" --arch={arch} \\\n", | |
" --lexicon={lexicon} \\\n", | |
" --datadir='' \\\n", | |
" --emission_dir='' \\\n", | |
" --show\n", | |
"\"\"\"\n", | |
"\n", | |
"\n", | |
"def run_inference(\n", | |
" audio_fpath, \n", | |
" am_fpath=\"./models/wav2vec/checkpoint_cv_finetune.bin\",\n", | |
" tokens_fpath=\"./models/wav2vec/tokens-all.lst\", \n", | |
" arch=\"./flashlight/build/mling_large.so\",\n", | |
" lexicon_fpath=\"./lexicon.txt\"\n", | |
"):\n", | |
" with tempfile.NamedTemporaryFile(mode='w', suffix='.lst') as f:\n", | |
" duration = float(check_output(\"soxi -D \" + audio_fpath, shell=True))\n", | |
" f.write(\"%d %s %s\\n\" % (0, audio_fpath, duration))\n", | |
" f.seek(0) # π£ important so the next process can read the first line!\n", | |
"\n", | |
" _cmd = cmd.format(\n", | |
" audio_list=f.name,\n", | |
" audio_model=am_fpath,\n", | |
" tokens=tokens_fpath,\n", | |
" arch=arch,\n", | |
" lexicon=lexicon_fpath\n", | |
" )\n", | |
" proc = create_process(_cmd)\n", | |
" return read_current_output(proc)\n" | |
], | |
"metadata": { | |
"id": "u7nuxTMmb0ZO", | |
"cellView": "form" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Audio recording" | |
], | |
"metadata": { | |
"id": "l-tX8HorrEy2" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@markdown Colab provides a convenient widget to record directly within the notebook\n", | |
"\n", | |
"from flashlight.scripts.colab.record import record_audio\n", | |
"\n", | |
"audio_name = \"/content/test_audio\"\n", | |
"audio_fpath = f\"{audio_name}.wav\"\n", | |
"record_audio(audio_name)\n", | |
"\n", | |
"with open(\"audio.lst\", \"w\") as f:\n", | |
" duration = float(check_output(\"soxi -D \" + audio_fpath, shell=True))\n", | |
" f.write(\"%d %s %s\\n\" % (0, audio_fpath, duration))" | |
], | |
"metadata": { | |
"id": "yIuxRO13co43", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 152 | |
}, | |
"outputId": "a0df97c4-0a44-4608-9f09-271dd434353c", | |
"cellView": "form" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
], | |
"text/html": [ | |
"\n", | |
"<script>\n", | |
"var recordButton = document.createElement(\"BUTTON\");\n", | |
"recordButton.appendChild(\n", | |
" document.createTextNode(\"Press to start recording\")\n", | |
");\n", | |
"restyleButtonBeforeRecording();\n", | |
"\n", | |
"var my_div = document.createElement(\"DIV\");\n", | |
"my_div.appendChild(recordButton);\n", | |
"\n", | |
"document.body.appendChild(my_div);\n", | |
"\n", | |
"var base64data = 0;\n", | |
"var reader;\n", | |
"var recorder, gumStream;\n", | |
"\n", | |
"function restyleButtonBeforeRecording() {\n", | |
" recordButton.style.width = '270px';\n", | |
" recordButton.style.height = '90';\n", | |
" recordButton.style.padding = '25px';\n", | |
" recordButton.style.backgroundColor = '#4CAF50';\n", | |
" recordButton.style.fontSize = '18px';\n", | |
"}\n", | |
"\n", | |
"function restyleButtonForRecording() {\n", | |
" recordButton.style.backgroundColor = '#008CBA';\n", | |
" recordButton.innerText = \"Recording... press to stop\";\n", | |
"}\n", | |
"\n", | |
"function restyleButtonForSaving() {\n", | |
" recordButton.style.backgroundColor = '#b34d4d';\n", | |
" recordButton.innerText = \"Saving... please wait!\"\n", | |
"}\n", | |
"\n", | |
"var handleSuccess = function(stream) {\n", | |
" gumStream = stream;\n", | |
" recorder = new MediaRecorder(stream);\n", | |
" recorder.ondataavailable = function(e) {\n", | |
" var url = URL.createObjectURL(e.data);\n", | |
" var preview = document.createElement('audio');\n", | |
" preview.controls = true;\n", | |
" preview.src = url;\n", | |
" document.body.appendChild(preview);\n", | |
"\n", | |
" reader = new FileReader();\n", | |
" reader.readAsDataURL(e.data);\n", | |
" reader.onloadend = function() {\n", | |
" base64data = reader.result;\n", | |
" //console.log(\"Inside FileReader:\" + base64data);\n", | |
" }\n", | |
" };\n", | |
" recorder.start();\n", | |
" };\n", | |
"\n", | |
"\n", | |
"function toggleRecording() {\n", | |
" if (recorder && recorder.state == \"recording\") {\n", | |
" recorder.stop();\n", | |
" gumStream.getAudioTracks()[0].stop();\n", | |
" restyleButtonForSaving();\n", | |
" }\n", | |
"}\n", | |
"\n", | |
"// https://stackoverflow.com/a/951057\n", | |
"function sleep(ms) {\n", | |
" return new Promise(resolve => setTimeout(resolve, ms));\n", | |
"}\n", | |
"\n", | |
"var data = new Promise(resolve=>{\n", | |
" recordButton.onclick = () => {\n", | |
" restyleButtonForRecording();\n", | |
" recordButton.onclick = () => {\n", | |
" toggleRecording();\n", | |
" sleep(2000).then(() => {\n", | |
" // wait 2000ms for the data to be available...\n", | |
" // ideally this should use something like await...\n", | |
" // console.log(\"Inside data:\" + base64data)\n", | |
" resolve(base64data.toString());\n", | |
" });\n", | |
" };\n", | |
" navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);\n", | |
" };\n", | |
"});\n", | |
"\n", | |
"</script>\n" | |
] | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Inference" | |
], | |
"metadata": { | |
"id": "c5Qj0mIlcnpV" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Create a dummy lexicon (not used when we use greedy decoding...):\n", | |
"!echo 'a a |' > lexicon.txt" | |
], | |
"metadata": { | |
"id": "ZSGKMnpfkFyM" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Wrapped in python helpers\n", | |
"for transcript in run_inference(\"/content/test_audio.wav\"):\n", | |
" print(transcript)" | |
], | |
"metadata": { | |
"id": "7JmD2SV9eWun", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "87058466-7c32-462a-f5e2-e1781506d8c3" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"This is a ie song sanet.\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Directly from command line\n", | |
"!/content/flashlight/build/bin/asr/fl_asr_test \\\n", | |
" --test=audio.lst \\\n", | |
" --am=/content/models/wav2vec/checkpoint_cv_finetune.bin \\\n", | |
" --tokens=/content/models/wav2vec/tokens-all.lst \\\n", | |
" --arch=flashlight/build/mling_large.so \\\n", | |
" --lexicon=lexicon.txt \\\n", | |
" --datadir='' \\\n", | |
" --emission_dir='' \\\n", | |
" --show\n", | |
" # --logtostderr=1 \\\n", | |
" # --minloglevel=0" | |
], | |
"metadata": { | |
"id": "B-pAgH-Hc5y3" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## π€« OpenAI whisper\n", | |
"\n", | |
"This section uses [OpenAI's whisper](https://github.com/openai/whisper) model.\n", | |
"\n", | |
"This model presents a series of advantages compared to the previous approaches:\n", | |
"\n", | |
" - multi-language\n", | |
" - multi-task model, i.e.: detects the spoken language and direct translation\n", | |
" - no-speech detection\n", | |
"\n" | |
], | |
"metadata": { | |
"id": "HXjXNS2YxYT4" | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Setup" | |
], | |
"metadata": { | |
"id": "OTgegNvJyNLx" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@markdown We install `ffmpeg` and OpenAI's `whisper`\n", | |
"MODEL = \"whisper\"\n", | |
"!apt install -q ffmpeg\n", | |
"!pip install -q git+https://github.com/openai/whisper.git " | |
], | |
"metadata": { | |
"id": "RDFoBjqWxYjC", | |
"cellView": "form" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Models\n" | |
], | |
"metadata": { | |
"id": "5ldp1R4R0wmR" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import whisper\n", | |
"\n", | |
"\n", | |
"model_name = 'medium' #@param [\"tiny\", \"base\", \"small\", \"medium\", \"large\"]\n", | |
"\n", | |
"print(f\"Loading whisper model '{model_name}'\")\n", | |
"model = whisper.load_model(model_name)\n", | |
"\n", | |
"def run_inference(mp3_file:str):\n", | |
" global model\n", | |
" res = model.transcribe(mp3_file)\n", | |
" return res[\"text\"]" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "zsx5efxM0Fex", | |
"outputId": "db772eb8-182c-49eb-effe-afe933d0b0a5", | |
"cellView": "form" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Loading whisper model 'medium'\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## π BONUS: Telegram Bot π€\n", | |
"\n", | |
"We run a simple Telegram Bot as a PoC of STT as a service via audio messages using [pyTelegramBotAPI](https://github.com/eternnoir/pyTelegramBotAPI)." | |
], | |
"metadata": { | |
"id": "h5YBRp_p7Dcx" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@markdown Install dependencies\n", | |
"!pip install -qq -U \\\n", | |
" pyTelegramBotAPI \\\n", | |
" rich" | |
], | |
"metadata": { | |
"id": "rJB2CiViAPGT", | |
"cellView": "form" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@markdown Running the Telegram bot!\n", | |
"\n", | |
"import datetime as dt\n", | |
"import telebot\n", | |
"import requests\n", | |
"\n", | |
"from rich import print as pprint\n", | |
"\n", | |
"BOT_TOKEN = \"your-telegram-token-here\" #@param \n", | |
"\n", | |
"bot = telebot.TeleBot(BOT_TOKEN, parse_mode=\"MARKDOWN\")\n", | |
"\n", | |
"\n", | |
"def handle_audio_message(message):\n", | |
"\n", | |
" now = \" \".join(dt.datetime.now().isoformat().split(\".\")[0].split(\"T\"))\n", | |
"\n", | |
" if message.content_type == \"voice\":\n", | |
" msg = f\"π Received a {message.voice.duration}s voice note. Transcribing...\"\n", | |
" print(msg)\n", | |
" ack_reply = bot.send_message(message.chat.id, msg)\n", | |
" file_info = bot.get_file(message.voice.file_id)\n", | |
" else:\n", | |
" bot.reply_to(message, f\"π Sorry can't handle audio clips yet...\")\n", | |
" file_info = bot.get_file(message.audio.file_id)\n", | |
" \n", | |
" try:\n", | |
" # Fetch the audio file \n", | |
" audio_file = requests.get(\n", | |
" f'https://api.telegram.org/file/bot{BOT_TOKEN}/{file_info.file_path}'\n", | |
" )\n", | |
"\n", | |
" with tempfile.NamedTemporaryFile(mode='wb', suffix='.ogg') as f:\n", | |
" # write audio to disk\n", | |
" in_file = f.name\n", | |
" f.write(audio_file.content)\n", | |
"\n", | |
" if MODEL == \"W2L\":\n", | |
" # Convert to wav\n", | |
" out_file = f.name.replace(\".ogg\", \".wav\")\n", | |
" create_process(\n", | |
" f'ffmpeg -i {in_file} -acodec pcm_s16le -ar 16000 {out_file}'\n", | |
" ).wait()\n", | |
" elif MODEL == \"whisper\":\n", | |
" # Convert to mp3\n", | |
" out_file = f.name.replace(\".ogg\", \".mp3\")\n", | |
" create_process(\n", | |
" f'ffmpeg -i {in_file} {out_file}'\n", | |
" ).wait()\n", | |
"\n", | |
" # transcribe\n", | |
" transcript = run_inference(out_file)\n", | |
" if isinstance(transcript, list):\n", | |
" transcript = \"\\n\".join(transcript)\n", | |
"\n", | |
" text = f\"**{now}**\\n\\n\" + transcript\n", | |
"\n", | |
" # Delete ack message and send transcript as a reply\n", | |
" bot.delete_message(message.chat.id, ack_reply.id)\n", | |
" bot.reply_to(message, text)\n", | |
"\n", | |
" except Exception as e:\n", | |
" print(f\"π¨ Error! {e}\")\n", | |
" bot.reply_to(message, f\"π¨ Error! {e}\")\n", | |
"\n", | |
"\n", | |
"@bot.message_handler(commands=['start', 'help'])\n", | |
"def send_welcome(message):\n", | |
"\tbot.reply_to(message, \"Hey, let's start. What are your thoughts?\")\n", | |
"\n", | |
"\n", | |
"@bot.message_handler(func=lambda message: True)\n", | |
"def echo_all(message):\n", | |
"\tbot.reply_to(message, message.text)\n", | |
"\n", | |
"\n", | |
"@bot.message_handler(content_types=['audio', 'voice'])\n", | |
"def handle_docs_audio(message):\n", | |
" handle_audio_message(message)\n", | |
"\n", | |
"\n", | |
"# getMe\n", | |
"me = bot.get_me()\n", | |
"print(f\"Running bot with ID: {me.id} | Name: {me.username}\")\n", | |
"\n", | |
"# Run polling\n", | |
"bot.infinity_polling()" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "xvCpjfMi7KaJ", | |
"outputId": "33c1c5d0-6862-42c1-b54b-af88b0eef030", | |
"cellView": "form" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Running bot with ID: 5191934564 | Name: pensabox_bot\n", | |
"π Received a 2s voice note. Transcribing...\n", | |
"π Received a 3s voice note. Transcribing...\n", | |
"π Received a 3s voice note. Transcribing...\n", | |
"π Received a 9s voice note. Transcribing...\n", | |
"π Received a 13s voice note. Transcribing...\n" | |
] | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment