Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save singhkunal2050/eb1c5027593dbaac5db3f98b1c0a0a06 to your computer and use it in GitHub Desktop.
Save singhkunal2050/eb1c5027593dbaac5db3f98b1c0a0a06 to your computer and use it in GitHub Desktop.
testing-whisper-audio-to-transcript.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyMHnnmr8FnhKRnvNrnyMJS6",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/singhkunal2050/eb1c5027593dbaac5db3f98b1c0a0a06/testing-whisper-audio-to-transcript.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "hJusIlovbfda",
"outputId": "3a7a81cf-5656-4563-e2c7-a3c41f1608bf"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)\n"
]
}
],
"source": [
"%pip --version"
]
},
{
"cell_type": "code",
"source": [
"%pip install -q openai-whisper==20231117"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "OsqVJ3oybv6g",
"outputId": "d99b0877-dac9-4fab-b803-79618d667a22"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting openai-whisper\n",
" Downloading openai-whisper-20231117.tar.gz (798 kB)\n",
"\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/798.6 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m256.0/798.6 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m \u001b[32m788.5/798.6 kB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m798.6/798.6 kB\u001b[0m \u001b[31m9.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: triton<3,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from openai-whisper) (2.1.0)\n",
"Requirement already satisfied: numba in /usr/local/lib/python3.10/dist-packages (from openai-whisper) (0.58.1)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from openai-whisper) (1.23.5)\n",
"Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from openai-whisper) (2.1.0+cu121)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from openai-whisper) (4.66.1)\n",
"Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from openai-whisper) (10.1.0)\n",
"Collecting tiktoken (from openai-whisper)\n",
" Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m18.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from triton<3,>=2.0.0->openai-whisper) (3.13.1)\n",
"Requirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba->openai-whisper) (0.41.1)\n",
"Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from tiktoken->openai-whisper) (2023.12.25)\n",
"Requirement already satisfied: requests>=2.26.0 in /usr/local/lib/python3.10/dist-packages (from tiktoken->openai-whisper) (2.31.0)\n",
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch->openai-whisper) (4.9.0)\n",
"Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch->openai-whisper) (1.12)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch->openai-whisper) (3.2.1)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->openai-whisper) (3.1.3)\n",
"Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch->openai-whisper) (2023.6.0)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.26.0->tiktoken->openai-whisper) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.26.0->tiktoken->openai-whisper) (3.6)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.26.0->tiktoken->openai-whisper) (2.0.7)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.26.0->tiktoken->openai-whisper) (2024.2.2)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->openai-whisper) (2.1.5)\n",
"Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch->openai-whisper) (1.3.0)\n",
"Building wheels for collected packages: openai-whisper\n",
" Building wheel for openai-whisper (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for openai-whisper: filename=openai_whisper-20231117-py3-none-any.whl size=801356 sha256=65ae3d565054d58ff486931d5a734adcd3c1deaf4339976ca1bdd100ac167ebe\n",
" Stored in directory: /root/.cache/pip/wheels/d0/85/e1/9361b4cbea7dd4b7f6702fa4c3afc94877952eeb2b62f45f56\n",
"Successfully built openai-whisper\n",
"Installing collected packages: tiktoken, openai-whisper\n",
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"llmx 0.0.15a0 requires cohere, which is not installed.\n",
"llmx 0.0.15a0 requires openai, which is not installed.\u001b[0m\u001b[31m\n",
"\u001b[0mSuccessfully installed openai-whisper-20231117 tiktoken-0.6.0\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!apt-get install -y ffmpeg\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "P3NJefnEcZyx",
"outputId": "7ecd87a1-06ef-4570-c4aa-c68741e55b50"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).\n",
"0 upgraded, 0 newly installed, 0 to remove and 32 not upgraded.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!ffmpeg -version"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "cvWltDkUcuvt",
"outputId": "9e69110f-fef5-450e-da4f-607f52061997"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers\n",
" built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)\n",
" configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-pocketsphinx --enable-librsvg --enable-libmfx --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared\n",
" libavutil 56. 70.100 / 56. 70.100\n",
" libavcodec 58.134.100 / 58.134.100\n",
" libavformat 58. 76.100 / 58. 76.100\n",
" libavdevice 58. 13.100 / 58. 13.100\n",
" libavfilter 7.110.100 / 7.110.100\n",
" libswscale 5. 9.100 / 5. 9.100\n",
" libswresample 3. 9.100 / 3. 9.100\n",
" libpostproc 55. 9.100 / 55. 9.100\n",
"Hyper fast Audio and Video encoder\n",
"usage: ffmpeg [options] [[infile options] -i infile]... {[outfile options] outfile}...\n",
"\n",
"\u001b[0;33mUse -h to get full help or, even better, run 'man ffmpeg'\n",
"\u001b[0m"
]
}
]
},
{
"cell_type": "code",
"source": [
"%pip install -q setuptools-rust==1.8.1"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "jlcvqUTSdC0-",
"outputId": "c50a4758-b21d-49a8-94d4-88e63767c140"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting setuptools-rust\n",
" Downloading setuptools_rust-1.8.1-py3-none-any.whl (26 kB)\n",
"Requirement already satisfied: setuptools>=62.4 in /usr/local/lib/python3.10/dist-packages (from setuptools-rust) (67.7.2)\n",
"Collecting semantic-version<3,>=2.8.2 (from setuptools-rust)\n",
" Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)\n",
"Requirement already satisfied: tomli>=1.2.1 in /usr/local/lib/python3.10/dist-packages (from setuptools-rust) (2.0.1)\n",
"Installing collected packages: semantic-version, setuptools-rust\n",
"Successfully installed semantic-version-2.10.0 setuptools-rust-1.8.1\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!whisper --help"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "UJiNIG4Ydsy5",
"outputId": "282cd920-5a87-4dc3-b74b-a9b92520c16b"
},
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"usage: whisper [-h] [--model MODEL] [--model_dir MODEL_DIR] [--device DEVICE]\n",
" [--output_dir OUTPUT_DIR] [--output_format {txt,vtt,srt,tsv,json,all}]\n",
" [--verbose VERBOSE] [--task {transcribe,translate}]\n",
" [--language {af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,yue,zh,Afrikaans,Albanian,Amharic,Arabic,Armenian,Assamese,Azerbaijani,Bashkir,Basque,Belarusian,Bengali,Bosnian,Breton,Bulgarian,Burmese,Cantonese,Castilian,Catalan,Chinese,Croatian,Czech,Danish,Dutch,English,Estonian,Faroese,Finnish,Flemish,French,Galician,Georgian,German,Greek,Gujarati,Haitian,Haitian Creole,Hausa,Hawaiian,Hebrew,Hindi,Hungarian,Icelandic,Indonesian,Italian,Japanese,Javanese,Kannada,Kazakh,Khmer,Korean,Lao,Latin,Latvian,Letzeburgesch,Lingala,Lithuanian,Luxembourgish,Macedonian,Malagasy,Malay,Malayalam,Maltese,Mandarin,Maori,Marathi,Moldavian,Moldovan,Mongolian,Myanmar,Nepali,Norwegian,Nynorsk,Occitan,Panjabi,Pashto,Persian,Polish,Portuguese,Punjabi,Pushto,Romanian,Russian,Sanskrit,Serbian,Shona,Sindhi,Sinhala,Sinhalese,Slovak,Slovenian,Somali,Spanish,Sundanese,Swahili,Swedish,Tagalog,Tajik,Tamil,Tatar,Telugu,Thai,Tibetan,Turkish,Turkmen,Ukrainian,Urdu,Uzbek,Valencian,Vietnamese,Welsh,Yiddish,Yoruba}]\n",
" [--temperature TEMPERATURE] [--best_of BEST_OF] [--beam_size BEAM_SIZE]\n",
" [--patience PATIENCE] [--length_penalty LENGTH_PENALTY]\n",
" [--suppress_tokens SUPPRESS_TOKENS] [--initial_prompt INITIAL_PROMPT]\n",
" [--condition_on_previous_text CONDITION_ON_PREVIOUS_TEXT] [--fp16 FP16]\n",
" [--temperature_increment_on_fallback TEMPERATURE_INCREMENT_ON_FALLBACK]\n",
" [--compression_ratio_threshold COMPRESSION_RATIO_THRESHOLD]\n",
" [--logprob_threshold LOGPROB_THRESHOLD] [--no_speech_threshold NO_SPEECH_THRESHOLD]\n",
" [--word_timestamps WORD_TIMESTAMPS] [--prepend_punctuations PREPEND_PUNCTUATIONS]\n",
" [--append_punctuations APPEND_PUNCTUATIONS] [--highlight_words HIGHLIGHT_WORDS]\n",
" [--max_line_width MAX_LINE_WIDTH] [--max_line_count MAX_LINE_COUNT]\n",
" [--max_words_per_line MAX_WORDS_PER_LINE] [--threads THREADS]\n",
" audio [audio ...]\n",
"whisper: error: the following arguments are required: audio\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!ls"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3JbkNGMTdxY7",
"outputId": "b1fb1357-6165-4ae7-ad5c-e550f0a34f70"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"download.mp3 sample_data\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!whisper download.mp3 --model tiny --language en"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "eShoBgGDed3z",
"outputId": "0dccdd91-4143-4bb1-d07d-68619df6c3d7"
},
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"100%|██████████████████████████████████████| 72.1M/72.1M [00:00<00:00, 128MiB/s]\n",
"/usr/local/lib/python3.10/dist-packages/whisper/transcribe.py:115: UserWarning: FP16 is not supported on CPU; using FP32 instead\n",
" warnings.warn(\"FP16 is not supported on CPU; using FP32 instead\")\n",
"[00:00.000 --> 00:05.000] Oh, you think darkness is your ally.\n",
"[00:05.000 --> 00:08.000] Are you merely adopted the dark?\n",
"[00:08.000 --> 00:11.000] I was born in it.\n",
"[00:11.000 --> 00:14.000] More lit by it.\n",
"[00:14.000 --> 00:17.000] I didn't see the light until I was already a man,\n",
"[00:17.000 --> 00:20.000] but then it was nothing to me but brightened.\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"*Oh, you think darkness is your ally. But you merely adopted the dark. I was born in it, molded by it. I didn't see the light until I was already a man, by then it was nothing to me but blinding!*"
],
"metadata": {
"id": "B9_Y9LWsfEsZ"
}
},
{
"cell_type": "code",
"source": [
"!whisper hindi.mp3 --model tiny --language hi"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "d7dMQs0cgDYr",
"outputId": "3cc42e34-97e0-456f-e833-f2dfe1b5b512"
},
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/usr/local/lib/python3.10/dist-packages/whisper/transcribe.py:115: UserWarning: FP16 is not supported on CPU; using FP32 instead\n",
" warnings.warn(\"FP16 is not supported on CPU; using FP32 instead\")\n",
"[00:00.000 --> 00:08.640] Deesh kais sabhi vane jik bankon ki sabhi kati vidhyo ka sanchalan niyantran vane gman kartha hai\n",
"[00:08.640 --> 00:12.800] Here, ek sar kari bankar ke route me kare kartha hai\n",
"[00:12.800 --> 00:18.100] yaa kaisi bhi deesh ki mudra, aur rind ni tika niyantran kartha hai\n",
"[00:18.100 --> 00:22.960] Bharat me bharthi arizav bank kendria bank hai\n",
"[00:22.960 --> 00:26.900] Arthis vane jik bank ke kare hai\n",
"[00:27.200 --> 00:30.200] is ki muka kare nimna hai\n",
"[00:30.200 --> 00:34.900] Unchallis jamao ko svi kar karna\n",
"[00:34.900 --> 00:39.000] Jamak kisi bhi rind parchalan ka adhhar ho te hai\n",
"[00:39.000 --> 00:44.400] kki bank pese ko uddhar lini, aur dini ka kam karthe hai\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!whisper test-hindi.mp3 --model large --language hi"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "hMBgnrYnkN5t",
"outputId": "ee0204a0-3714-4adc-c426-baa362fe33a0"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" 24%|█████████▏ | 695M/2.88G [00:08<00:13, 173MiB/s]"
]
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment