Last active
September 29, 2022 20:32
-
-
Save josemarcosrf/9d84e77fc180996198d8a93258904a9f to your computer and use it in GitHub Desktop.
stt-exploratory-telebot.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"collapsed_sections": [ | |
"2_YRR9zHcfln" | |
], | |
"toc_visible": true, | |
"authorship_tag": "ABX9TyP6rKEdw5QiT96JLOzOYXNY", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
}, | |
"accelerator": "GPU" | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/jmrf/9d84e77fc180996198d8a93258904a9f/stt-exploratory-telebot.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Off-line STT exploratory\n", | |
"\n", | |
"We explore 3 different options:\n", | |
"\n", | |
" - [pykaldi](https://github.com/pykaldi/pykaldi)\n", | |
" - Facebook's [wav2letter](https://github.com/flashlight/wav2letter/)\n", | |
" - OpenAI's [whisper](https://github.com/openai/whisper)" | |
], | |
"metadata": { | |
"id": "uT2lztpyRJSC" | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## Setup" | |
], | |
"metadata": { | |
"id": "oavaBVl3cVhG" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# System common deps\n", | |
"!apt-get install -qq \\\n", | |
" sox \\\n", | |
" mediainfo\n", | |
"\n", | |
"# Python common deps\n", | |
"!pip install -qq -U pip\n", | |
"!pip install -qq ffmpeg-python sox" | |
], | |
"metadata": { | |
"id": "m67hDDqqa5Uz", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "a01ea75c-4169-43ef-c4b9-631c29e8f60b" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n", | |
"Processing triggers for mime-support (3.60ubuntu1) ...\n", | |
"\u001b[K |ββββββββββββββββββββββββββββββββ| 2.0 MB 6.9 MB/s \n", | |
"\u001b[?25h\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", | |
"\u001b[0m" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## Helpers" | |
], | |
"metadata": { | |
"id": "EXVIr2nJ48KV" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import os\n", | |
"import glob\n", | |
"import signal\n", | |
"import tempfile\n", | |
"\n", | |
"from contextlib import contextmanager\n", | |
"from subprocess import Popen, PIPE, check_output\n", | |
"\n", | |
"\n", | |
"@contextmanager\n", | |
"def timeout(duration: int):\n", | |
" def timeout_handler(signum, frame):\n", | |
" raise Exception(f\"Block timed out after {duration} seconds\")\n", | |
"\n", | |
" signal.signal(signal.SIGALRM, timeout_handler)\n", | |
" signal.alarm(duration)\n", | |
" try:\n", | |
" yield\n", | |
" finally:\n", | |
" signal.alarm(0)\n", | |
"\n", | |
"\n", | |
"def create_process(cmd):\n", | |
" process = Popen([cmd],\n", | |
" stdin=PIPE, stdout=PIPE, stderr=PIPE,\n", | |
" shell=True, preexec_fn=os.setsid) \n", | |
" return process\n", | |
"\n", | |
"\n", | |
"def read_current_output(process):\n", | |
" stt_symbol = \"|P|:\"\n", | |
" word_separator_symbol = \"|\"\n", | |
"\n", | |
" transcripts = []\n", | |
" output = True\n", | |
" while output:\n", | |
" output = process.stdout.readline().decode()\n", | |
" stderr = process.stderr.readline().decode()\n", | |
"\n", | |
" if output.startswith(stt_symbol):\n", | |
" output = output.replace(stt_symbol, \"\").split(word_separator_symbol)\n", | |
" words = \" \".join([w.strip().replace(\" \", \"\") for w in output])\n", | |
" transcripts.append(words)\n", | |
"\n", | |
" return transcripts" | |
], | |
"metadata": { | |
"id": "NXLShV5Y4-in" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## π€ PyKaldi" | |
], | |
"metadata": { | |
"id": "vh0wQ4s4XRUz" | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Setup" | |
], | |
"metadata": { | |
"id": "2_YRR9zHcfln" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "cp_CYB4NRFvb" | |
}, | |
"outputs": [], | |
"source": [ | |
"!apt-get install -qq -y --no-install-recommends \\\n", | |
" autoconf \\\n", | |
" automake \\\n", | |
" cmake \\\n", | |
" curl \\\n", | |
" gfortran \\\n", | |
" g++\n", | |
"\n", | |
"!pip install -U -qq pip setuptools\n", | |
"!pip install -qq \\\n", | |
" 'coloredlogs==15.0.1' \\\n", | |
" 'numpy==1.21.4' \\\n", | |
" 'pyaudio==0.2.11' \\\n", | |
" 'PyYAML==6.0' \\\n", | |
" 'rich==10.15.2' \\\n", | |
" 'samplerate==0.1.0' \\\n", | |
" 'scipy==1.7.3' \\\n", | |
"'git+https://github.com/wkentaro/[email protected]#egg=gdown'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"%%bash\n", | |
"\n", | |
"cd models/\n", | |
"\n", | |
"# English model\n", | |
"MODEL_FILE=en_160k_nnet3chain_tdnn1f_2048_sp_bi.tar.bz2\n", | |
"if [ ! -f $MODEL_FILE ]; then\n", | |
" wget http://ltdata1.informatik.uni-hamburg.de/pykaldi/$MODEL_FILE\n", | |
" tar xvfj $MODEL_FILE\n", | |
" rm $MODEL_FILE\n", | |
"fi\n", | |
"\n", | |
"cd -" | |
], | |
"metadata": { | |
"id": "XsClxU_xXVT2" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## π Wav2Letter\n", | |
"\n", | |
"We use Facebook's [wav2letter](https://github.com/flashlight/wav2letter/tree/main/recipes/mling_pl) and pre-trained models. wav2letter has been consolidated into [flashlight/app/asr](https://github.com/flashlight/flashlight/tree/main/flashlight/app/asr) which requires this [flashlight commit](https://github.com/flashlight/flashlight/tree/8f7af9ec1188bfd7050c47abfac528d21650890f).\n", | |
"\n", | |
"> π€ [wav2vec-unsupervised-speech-recognition blog post](https://ai.facebook.com/blog/wav2vec-unsupervised-speech-recognition-without-supervision)\n", | |
"\n", | |
"> π‘ [Install and inference colab example](https://github.com/flashlight/wav2letter/blob/main/recipes/mling_pl/mling_model.ipynb)\n" | |
], | |
"metadata": { | |
"id": "wwfS61AVUB92" | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Setup" | |
], | |
"metadata": { | |
"id": "8kS68XLvaUmd" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@markdown First, choose backend to build with\n", | |
"MODEL = \"W2L\"\n", | |
"backend = 'CUDA' #@param [\"CPU\", \"CUDA\"]" | |
], | |
"metadata": { | |
"id": "ED0JFFkvYlu9" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"#### Compile" | |
], | |
"metadata": { | |
"id": "35ye60P0G_1E" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@markdown 1. Clone Flashlight\n", | |
"!git clone https://github.com/flashlight/flashlight.git\n", | |
"\n", | |
"#@markdown 2. install all dependencies for colab notebook\n", | |
"!source flashlight/scripts/colab/colab_install_deps.sh\n", | |
"\n", | |
"#@markdown 3. Export necessary env variables\n", | |
"%env MKLROOT=/opt/intel/mkl\n", | |
"%env ArrayFire_DIR=/opt/arrayfire/share/ArrayFire/cmake\n", | |
"%env DNNL_DIR=/opt/dnnl/dnnl_lnx_2.0.0_cpu_iomp/lib/cmake/dnnl\n", | |
"\n", | |
"#@markdown 4. Compile!\n", | |
"if backend == \"CUDA\":\n", | |
" # Total time: ~13 minutes\n", | |
" !cd flashlight && git checkout d2e1924cb2a2b32b48cc326bb7e332ca3ea54f67 && mkdir -p build && cd build && \\\n", | |
" cmake .. -DCMAKE_BUILD_TYPE=Release \\\n", | |
" -DFL_BUILD_TESTS=OFF \\\n", | |
" -DFL_BUILD_EXAMPLES=OFF \\\n", | |
" -DFL_BUILD_APP_ASR=ON && \\\n", | |
" make -j$(nproc)\n", | |
"\n", | |
"elif backend == \"CPU\":\n", | |
" # Total time: ~14 minutes\n", | |
" !cd flashlight && git checkout d2e1924cb2a2b32b48cc326bb7e332ca3ea54f67 && mkdir -p build && cd build && \\\n", | |
" cmake .. -DFL_BACKEND=CPU \\\n", | |
" -DCMAKE_BUILD_TYPE=Release \\\n", | |
" -DFL_BUILD_TESTS=OFF \\\n", | |
" -DFL_BUILD_EXAMPLES=OFF \\\n", | |
" -DFL_BUILD_APP_ASR=ON && \\\n", | |
" make -j$(nproc)\n", | |
" \n", | |
"else:\n", | |
" raise ValueError(f\"Unknown backend {backend}\")\n", | |
"\n", | |
"\n", | |
"#@markdown 5. Build flashlight\n", | |
"%cd /content/flashlight/build\n", | |
"# !wget https://raw.githubusercontent.com/flashlight/wav2letter/49087d575ddf77aa5a99a01fee980fc00e92c802/recipes/mling_pl/model_with_externally_controlled_reshaping_big_lid.cpp\n", | |
"# !mv model_with_externally_controlled_reshaping_big_lid.cpp mling.cpp\n", | |
"!wget https://raw.githubusercontent.com/flashlight/wav2letter/main/recipes/mling_pl/mling_large.cpp\n", | |
"\n", | |
"# !cmake .. -DFL_PLUGIN_MODULE_SRC_PATH=mling.cpp\n", | |
"!cmake .. -DFL_PLUGIN_MODULE_SRC_PATH=mling_large.cpp\n", | |
"!make\n", | |
"%cd -" | |
], | |
"metadata": { | |
"id": "viydJlAUZ4Ze", | |
"cellView": "form" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"#### Pre-compiled\n", | |
"\n", | |
"If we have a pre-compiled flashlight, we only need to set the env variables and install the system deps." | |
], | |
"metadata": { | |
"id": "JJY8iUbqHFOR" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# @markdown 1. Fetch a pre-compiled flashlight from GDrive\n", | |
"from google.colab import drive\n", | |
"\n", | |
"drive.mount('/gdrive')\n", | |
"\n", | |
"!cp '/gdrive/MyDrive/Colab Notebooks/STT-artifacts/$backend-flashlight.tar.gz' /content/\n", | |
"\n", | |
"#@markdown 2. extract the pre-compiled flashlight\n", | |
"!tar xzf $backend-flashlight.tar.gz\n", | |
"\n", | |
"# set env. vars\n", | |
"%env MKLROOT=/opt/intel/mkl\n", | |
"%env ArrayFire_DIR=/opt/arrayfire/share/ArrayFire/cmake\n", | |
"%env DNNL_DIR=/opt/dnnl/dnnl_lnx_2.0.0_cpu_iomp/lib/cmake/dnnl\n", | |
"\n", | |
"# install system deps\n", | |
"!source flashlight/scripts/colab/colab_install_deps.sh" | |
], | |
"metadata": { | |
"id": "9AKcubTVjj3f", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "653a6a48-ebbb-4503-bf58-cb3c745f6941", | |
"cellView": "form" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Mounted at /gdrive\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Models " | |
], | |
"metadata": { | |
"id": "hEYDiY2OaXwn" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"MODELS_DIR = \"models/wav2vec\"\n", | |
"\n", | |
"!mkdir -p $MODELS_DIR\n", | |
"\n", | |
"#@markdown 1. Download the model checkpoint\n", | |
"!wget \"https://dl.fbaipublicfiles.com/wav2letter/mling_pl/checkpoint_cv_finetune.bin\" -P $MODELS_DIR -qq\n", | |
"\n", | |
"#@markdown 2. Download the tokens\n", | |
"!wget \"https://dl.fbaipublicfiles.com/wav2letter/mling_pl/tokens-all.lst\" -P $MODELS_DIR -qq" | |
], | |
"metadata": { | |
"id": "pkXAoRJDUCFX", | |
"cellView": "form" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Helpers" | |
], | |
"metadata": { | |
"id": "i96ufI3Ucqsc" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@markdown We define the inference function as a command line \n", | |
"#@markdown call from python as an external process\n", | |
"\n", | |
"cmd = \"\"\"\n", | |
"./flashlight/build/bin/asr/fl_asr_test \\\n", | |
" --test={audio_list} \\\n", | |
" --am={audio_model} \\\n", | |
" --tokens={tokens} \\\n", | |
" --arch={arch} \\\n", | |
" --lexicon={lexicon} \\\n", | |
" --datadir='' \\\n", | |
" --emission_dir='' \\\n", | |
" --show\n", | |
"\"\"\"\n", | |
"\n", | |
"\n", | |
"def run_inference(\n", | |
" audio_fpath, \n", | |
" am_fpath=\"./models/wav2vec/checkpoint_cv_finetune.bin\",\n", | |
" tokens_fpath=\"./models/wav2vec/tokens-all.lst\", \n", | |
" arch=\"./flashlight/build/mling_large.so\",\n", | |
" lexicon_fpath=\"./lexicon.txt\"\n", | |
"):\n", | |
" with tempfile.NamedTemporaryFile(mode='w', suffix='.lst') as f:\n", | |
" duration = float(check_output(\"soxi -D \" + audio_fpath, shell=True))\n", | |
" f.write(\"%d %s %s\\n\" % (0, audio_fpath, duration))\n", | |
" f.seek(0) # π£ important so the next process can read the first line!\n", | |
"\n", | |
" _cmd = cmd.format(\n", | |
" audio_list=f.name,\n", | |
" audio_model=am_fpath,\n", | |
" tokens=tokens_fpath,\n", | |
" arch=arch,\n", | |
" lexicon=lexicon_fpath\n", | |
" )\n", | |
" proc = create_process(_cmd)\n", | |
" return read_current_output(proc)\n" | |
], | |
"metadata": { | |
"id": "u7nuxTMmb0ZO", | |
"cellView": "form" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Audio recording" | |
], | |
"metadata": { | |
"id": "l-tX8HorrEy2" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@markdown Colab provides a convenient widget to record directly within the notebook\n", | |
"\n", | |
"from flashlight.scripts.colab.record import record_audio\n", | |
"\n", | |
"audio_name = \"/content/test_audio\"\n", | |
"audio_fpath = f\"{audio_name}.wav\"\n", | |
"record_audio(audio_name)\n", | |
"\n", | |
"with open(\"audio.lst\", \"w\") as f:\n", | |
" duration = float(check_output(\"soxi -D \" + audio_fpath, shell=True))\n", | |
" f.write(\"%d %s %s\\n\" % (0, audio_fpath, duration))" | |
], | |
"metadata": { | |
"id": "yIuxRO13co43", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 152 | |
}, | |
"outputId": "a0df97c4-0a44-4608-9f09-271dd434353c", | |
"cellView": "form" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
], | |
"text/html": [ | |
"\n", | |
"<script>\n", | |
"var recordButton = document.createElement(\"BUTTON\");\n", | |
"recordButton.appendChild(\n", | |
" document.createTextNode(\"Press to start recording\")\n", | |
");\n", | |
"restyleButtonBeforeRecording();\n", | |
"\n", | |
"var my_div = document.createElement(\"DIV\");\n", | |
"my_div.appendChild(recordButton);\n", | |
"\n", | |
"document.body.appendChild(my_div);\n", | |
"\n", | |
"var base64data = 0;\n", | |
"var reader;\n", | |
"var recorder, gumStream;\n", | |
"\n", | |
"function restyleButtonBeforeRecording() {\n", | |
" recordButton.style.width = '270px';\n", | |
" recordButton.style.height = '90';\n", | |
" recordButton.style.padding = '25px';\n", | |
" recordButton.style.backgroundColor = '#4CAF50';\n", | |
" recordButton.style.fontSize = '18px';\n", | |
"}\n", | |
"\n", | |
"function restyleButtonForRecording() {\n", | |
" recordButton.style.backgroundColor = '#008CBA';\n", | |
" recordButton.innerText = \"Recording... press to stop\";\n", | |
"}\n", | |
"\n", | |
"function restyleButtonForSaving() {\n", | |
" recordButton.style.backgroundColor = '#b34d4d';\n", | |
" recordButton.innerText = \"Saving... please wait!\"\n", | |
"}\n", | |
"\n", | |
"var handleSuccess = function(stream) {\n", | |
" gumStream = stream;\n", | |
" recorder = new MediaRecorder(stream);\n", | |
" recorder.ondataavailable = function(e) {\n", | |
" var url = URL.createObjectURL(e.data);\n", | |
" var preview = document.createElement('audio');\n", | |
" preview.controls = true;\n", | |
" preview.src = url;\n", | |
" document.body.appendChild(preview);\n", | |
"\n", | |
" reader = new FileReader();\n", | |
" reader.readAsDataURL(e.data);\n", | |
" reader.onloadend = function() {\n", | |
" base64data = reader.result;\n", | |
" //console.log(\"Inside FileReader:\" + base64data);\n", | |
" }\n", | |
" };\n", | |
" recorder.start();\n", | |
" };\n", | |
"\n", | |
"\n", | |
"function toggleRecording() {\n", | |
" if (recorder && recorder.state == \"recording\") {\n", | |
" recorder.stop();\n", | |
" gumStream.getAudioTracks()[0].stop();\n", | |
" restyleButtonForSaving();\n", | |
" }\n", | |
"}\n", | |
"\n", | |
"// https://stackoverflow.com/a/951057\n", | |
"function sleep(ms) {\n", | |
" return new Promise(resolve => setTimeout(resolve, ms));\n", | |
"}\n", | |
"\n", | |
"var data = new Promise(resolve=>{\n", | |
" recordButton.onclick = () => {\n", | |
" restyleButtonForRecording();\n", | |
" recordButton.onclick = () => {\n", | |
" toggleRecording();\n", | |
" sleep(2000).then(() => {\n", | |
" // wait 2000ms for the data to be available...\n", | |
" // ideally this should use something like await...\n", | |
" // console.log(\"Inside data:\" + base64data)\n", | |
" resolve(base64data.toString());\n", | |
" });\n", | |
" };\n", | |
" navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);\n", | |
" };\n", | |
"});\n", | |
"\n", | |
"</script>\n" | |
] | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Inference" | |
], | |
"metadata": { | |
"id": "c5Qj0mIlcnpV" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Create a dummy lexicon (not used when we use greedy decoding...):\n", | |
"!echo 'a a |' > lexicon.txt" | |
], | |
"metadata": { | |
"id": "ZSGKMnpfkFyM" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Wrapped in python helpers\n", | |
"for transcript in run_inference(\"/content/test_audio.wav\"):\n", | |
" print(transcript)" | |
], | |
"metadata": { | |
"id": "7JmD2SV9eWun", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "87058466-7c32-462a-f5e2-e1781506d8c3" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"This is a ie song sanet.\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Directly from command line\n", | |
"!/content/flashlight/build/bin/asr/fl_asr_test \\\n", | |
" --test=audio.lst \\\n", | |
" --am=/content/models/wav2vec/checkpoint_cv_finetune.bin \\\n", | |
" --tokens=/content/models/wav2vec/tokens-all.lst \\\n", | |
" --arch=flashlight/build/mling_large.so \\\n", | |
" --lexicon=lexicon.txt \\\n", | |
" --datadir='' \\\n", | |
" --emission_dir='' \\\n", | |
" --show\n", | |
" # --logtostderr=1 \\\n", | |
" # --minloglevel=0" | |
], | |
"metadata": { | |
"id": "B-pAgH-Hc5y3" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## π€« OpenAI whisper\n", | |
"\n", | |
"This section uses [OpenAI's whisper](https://github.com/openai/whisper) model.\n", | |
"\n", | |
"This model presents a series of advantages compared to the previous approaches:\n", | |
"\n", | |
" - multi-language\n", | |
" - multi-task model, i.e.: detects the spoken language and direct translation\n", | |
" - no-speech detection\n", | |
"\n" | |
], | |
"metadata": { | |
"id": "HXjXNS2YxYT4" | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Setup" | |
], | |
"metadata": { | |
"id": "OTgegNvJyNLx" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@markdown We install `ffmpeg` and OpenAI's `whisper`\n", | |
"MODEL = \"whisper\"\n", | |
"!apt install -q ffmpeg\n", | |
"!pip install -q git+https://github.com/openai/whisper.git " | |
], | |
"metadata": { | |
"id": "RDFoBjqWxYjC", | |
"cellView": "form" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Models\n" | |
], | |
"metadata": { | |
"id": "5ldp1R4R0wmR" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import whisper\n", | |
"\n", | |
"\n", | |
"model_name = 'medium' #@param [\"tiny\", \"base\", \"small\", \"medium\", \"large\"]\n", | |
"\n", | |
"print(f\"Loading whisper model '{model_name}'\")\n", | |
"model = whisper.load_model(model_name)\n", | |
"\n", | |
"def run_inference(mp3_file:str):\n", | |
" global model\n", | |
" res = model.transcribe(mp3_file)\n", | |
" return res[\"text\"]" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "zsx5efxM0Fex", | |
"outputId": "db772eb8-182c-49eb-effe-afe933d0b0a5", | |
"cellView": "form" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Loading whisper model 'medium'\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## π BONUS: Telegram Bot π€\n", | |
"\n", | |
"We run a simple Telegram Bot as a PoC of STT as a service via audio messages using [pyTelegramBotAPI](https://github.com/eternnoir/pyTelegramBotAPI)." | |
], | |
"metadata": { | |
"id": "h5YBRp_p7Dcx" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@markdown Install dependencies\n", | |
"!pip install -qq -U \\\n", | |
" pyTelegramBotAPI \\\n", | |
" rich" | |
], | |
"metadata": { | |
"id": "rJB2CiViAPGT", | |
"cellView": "form" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@markdown Running the Telegram bot!\n", | |
"\n", | |
"import datetime as dt\n", | |
"import telebot\n", | |
"import requests\n", | |
"\n", | |
"from rich import print as pprint\n", | |
"\n", | |
"BOT_TOKEN = \"your-telegram-token-here\" #@param \n", | |
"\n", | |
"bot = telebot.TeleBot(BOT_TOKEN, parse_mode=\"MARKDOWN\")\n", | |
"\n", | |
"\n", | |
"def handle_audio_message(message):\n", | |
"\n", | |
" now = \" \".join(dt.datetime.now().isoformat().split(\".\")[0].split(\"T\"))\n", | |
"\n", | |
" if message.content_type == \"voice\":\n", | |
" msg = f\"π Received a {message.voice.duration}s voice note. Transcribing...\"\n", | |
" print(msg)\n", | |
" ack_reply = bot.send_message(message.chat.id, msg)\n", | |
" file_info = bot.get_file(message.voice.file_id)\n", | |
" else:\n", | |
" bot.reply_to(message, f\"π Sorry can't handle audio clips yet...\")\n", | |
" file_info = bot.get_file(message.audio.file_id)\n", | |
" \n", | |
" try:\n", | |
" # Fetch the audio file \n", | |
" audio_file = requests.get(\n", | |
" f'https://api.telegram.org/file/bot{BOT_TOKEN}/{file_info.file_path}'\n", | |
" )\n", | |
"\n", | |
" with tempfile.NamedTemporaryFile(mode='wb', suffix='.ogg') as f:\n", | |
" # write audio to disk\n", | |
" in_file = f.name\n", | |
" f.write(audio_file.content)\n", | |
"\n", | |
" if MODEL == \"W2L\":\n", | |
" # Convert to wav\n", | |
" out_file = f.name.replace(\".ogg\", \".wav\")\n", | |
" create_process(\n", | |
" f'ffmpeg -i {in_file} -acodec pcm_s16le -ar 16000 {out_file}'\n", | |
" ).wait()\n", | |
" elif MODEL == \"whisper\":\n", | |
" # Convert to mp3\n", | |
" out_file = f.name.replace(\".ogg\", \".mp3\")\n", | |
" create_process(\n", | |
" f'ffmpeg -i {in_file} {out_file}'\n", | |
" ).wait()\n", | |
"\n", | |
" # transcribe\n", | |
" transcript = run_inference(out_file)\n", | |
" if isinstance(transcript, list):\n", | |
" transcript = \"\\n\".join(transcript)\n", | |
"\n", | |
" text = f\"**{now}**\\n\\n\" + transcript\n", | |
"\n", | |
" # Delete ack message and send transcript as a reply\n", | |
" bot.delete_message(message.chat.id, ack_reply.id)\n", | |
" bot.reply_to(message, text)\n", | |
"\n", | |
" except Exception as e:\n", | |
" print(f\"π¨ Error! {e}\")\n", | |
" bot.reply_to(message, f\"π¨ Error! {e}\")\n", | |
"\n", | |
"\n", | |
"@bot.message_handler(commands=['start', 'help'])\n", | |
"def send_welcome(message):\n", | |
"\tbot.reply_to(message, \"Hey, let's start. What are your thoughts?\")\n", | |
"\n", | |
"\n", | |
"@bot.message_handler(func=lambda message: True)\n", | |
"def echo_all(message):\n", | |
"\tbot.reply_to(message, message.text)\n", | |
"\n", | |
"\n", | |
"@bot.message_handler(content_types=['audio', 'voice'])\n", | |
"def handle_docs_audio(message):\n", | |
" handle_audio_message(message)\n", | |
"\n", | |
"\n", | |
"# getMe\n", | |
"me = bot.get_me()\n", | |
"print(f\"Running bot with ID: {me.id} | Name: {me.username}\")\n", | |
"\n", | |
"# Run polling\n", | |
"bot.infinity_polling()" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "xvCpjfMi7KaJ", | |
"outputId": "33c1c5d0-6862-42c1-b54b-af88b0eef030", | |
"cellView": "form" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Running bot with ID: 5191934564 | Name: pensabox_bot\n", | |
"π Received a 2s voice note. Transcribing...\n", | |
"π Received a 3s voice note. Transcribing...\n", | |
"π Received a 3s voice note. Transcribing...\n", | |
"π Received a 9s voice note. Transcribing...\n", | |
"π Received a 13s voice note. Transcribing...\n" | |
] | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment