Last active
February 1, 2025 01:36
-
-
Save sebington/5b3ac4bb03e747f084dace454d017b8d to your computer and use it in GitHub Desktop.
Transcribe audio file at word-level and write output to .srt
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Word-level transcriptions with Faster-Whisper" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"id": "xRe-wO2gWNN4" | |
}, | |
"outputs": [], | |
"source": [ | |
"# pip install faster-whisper -q\n", | |
"from faster_whisper import WhisperModel" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"id": "i8yDvreNvETR" | |
}, | |
"outputs": [], | |
"source": [ | |
"# model initialization (run on GPU with FP16 or on CPU with int8)\n", | |
"model = WhisperModel(\"small.en\", device=\"cpu\", compute_type=\"int8\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"id": "UCZ_UwAiH5sG" | |
}, | |
"outputs": [], | |
"source": [ | |
"# load an audio file\n", | |
"audio = \"bbc_ai_edit.mp3\"" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Transcribe a file at word level" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[0.00s -> 0.48s] Stuart,\n", | |
"[0.58s -> 0.74s] thank\n", | |
"[0.74s -> 0.94s] you\n", | |
"[0.94s -> 1.30s] so\n", | |
"[1.30s -> 1.70s] much.\n", | |
"[2.26s -> 2.28s] You\n", | |
"[2.28s -> 2.38s] know,\n", | |
"[2.40s -> 2.70s] it's\n", | |
"[2.70s -> 3.30s] absolutely\n", | |
"[3.30s -> 3.94s] fascinating\n", | |
"[3.94s -> 4.38s] to\n", | |
"[4.38s -> 4.60s] me\n", | |
"[4.60s -> 4.86s] that\n", | |
"[4.86s -> 5.62s] you\n", | |
"[5.62s -> 5.78s] are\n", | |
"[5.78s -> 5.90s] an\n", | |
"[5.90s -> 6.22s] expert\n", | |
"[6.22s -> 6.46s] in\n", | |
"[6.46s -> 6.80s] artificial\n", | |
"[6.80s -> 7.46s] intelligence\n", | |
"[7.46s -> 7.80s] and\n", | |
"[7.80s -> 8.16s] you\n", | |
"[8.16s -> 8.32s] have\n", | |
"[8.32s -> 8.52s] been\n", | |
"[8.52s -> 8.88s] taking\n", | |
"[8.88s -> 9.18s] us\n", | |
"[9.18s -> 9.32s] on\n", | |
"[9.32s -> 9.56s] this\n", | |
"[9.56s -> 10.44s] relay\n", | |
"[10.44s -> 11.10s] race\n", | |
"[11.10s -> 11.46s] from\n", | |
"[11.46s -> 11.82s] Bagret\n", | |
"[11.82s -> 12.04s] to\n", | |
"[12.04s -> 12.38s] yourself\n", | |
"[12.38s -> 12.84s] to\n", | |
"[12.84s -> 13.18s] who\n", | |
"[13.18s -> 13.38s] knows\n", | |
"[13.38s -> 13.78s] what.\n", | |
"[14.26s -> 14.42s] And\n", | |
"[14.42s -> 14.76s] all\n", | |
"[14.76s -> 14.96s] I\n", | |
"[14.96s -> 15.12s] kept\n", | |
"[15.12s -> 15.44s] thinking\n", | |
"[15.44s -> 15.72s] about\n", | |
"[15.72s -> 15.92s] is\n", | |
"[15.92s -> 16.42s] what\n", | |
"[16.42s -> 16.60s] does\n", | |
"[16.60s -> 16.76s] this\n", | |
"[16.76s -> 16.92s] say\n", | |
"[16.92s -> 17.10s] about\n", | |
"[17.10s -> 17.22s] the\n", | |
"[17.22s -> 17.46s] human\n", | |
"[17.46s -> 18.00s] condition?\n", | |
"[18.56s -> 18.64s] You\n", | |
"[18.64s -> 18.74s] know,\n", | |
"[18.80s -> 18.90s] what\n", | |
"[18.90s -> 19.36s] it\n", | |
"[19.36s -> 19.54s] is\n", | |
"[19.54s -> 19.70s] to\n", | |
"[19.70s -> 19.88s] be\n", | |
"[19.88s -> 20.36s] human.\n", | |
"[20.84s -> 21.06s] And\n", | |
"[21.06s -> 21.26s] you\n", | |
"[21.26s -> 21.82s] laid\n", | |
"[21.82s -> 22.06s] out\n", | |
"[22.06s -> 22.30s] very\n", | |
"[22.30s -> 22.84s] clearly\n", | |
"[22.84s -> 23.60s] what\n", | |
"[23.60s -> 23.92s] people\n", | |
"[23.92s -> 24.28s] think\n", | |
"[24.28s -> 24.60s] might\n", | |
"[24.60s -> 24.88s] be\n", | |
"[24.88s -> 25.02s] the\n", | |
"[25.02s -> 25.78s] eventuality\n", | |
"[25.78s -> 25.94s] of\n", | |
"[25.94s -> 26.24s] having\n", | |
"[26.24s -> 26.42s] the\n", | |
"[26.42s -> 26.54s] end\n", | |
"[26.54s -> 26.68s] of\n", | |
"[26.68s -> 27.08s] work.\n", | |
"[27.08s -> 27.92s] I\n", | |
"[27.92s -> 28.04s] want\n", | |
"[28.04s -> 28.18s] to\n", | |
"[28.18s -> 28.28s] know\n", | |
"[28.28s -> 28.40s] what\n", | |
"[28.40s -> 28.70s] you\n", | |
"[28.70s -> 29.10s] think\n", | |
"[29.10s -> 29.78s] it\n", | |
"[29.78s -> 30.02s] will\n", | |
"[30.02s -> 30.22s] be\n", | |
"[30.22s -> 30.46s] like.\n", | |
"[30.58s -> 30.58s] You\n", | |
"[30.58s -> 30.68s] know,\n", | |
"[30.80s -> 31.02s] there\n", | |
"[31.02s -> 31.16s] are\n", | |
"[31.16s -> 31.40s] two\n", | |
"[31.40s -> 31.86s] scenarios.\n", | |
"[32.06s -> 32.28s] One,\n", | |
"[32.80s -> 33.04s] I\n", | |
"[33.04s -> 33.38s] lose\n", | |
"[33.38s -> 34.08s] every\n", | |
"[34.08s -> 34.60s] excuse\n", | |
"[34.60s -> 35.18s] not\n", | |
"[35.18s -> 35.40s] to\n", | |
"[35.40s -> 35.54s] learn\n", | |
"[35.54s -> 35.68s] the\n", | |
"[35.68s -> 36.04s] piano\n", | |
"[36.04s -> 36.24s] and\n", | |
"[36.24s -> 36.42s] I'm\n", | |
"[36.42s -> 36.60s] much\n", | |
"[36.60s -> 36.88s] nicer\n", | |
"[36.88s -> 37.08s] to\n", | |
"[37.08s -> 37.24s] my\n", | |
"[37.24s -> 37.52s] children\n", | |
"[37.52s -> 37.82s] and\n", | |
"[37.82s -> 37.98s] I\n", | |
"[37.98s -> 38.52s] catch\n", | |
"[38.52s -> 38.78s] up\n", | |
"[38.78s -> 38.96s] on\n", | |
"[38.96s -> 39.18s] all\n", | |
"[39.18s -> 39.28s] of\n", | |
"[39.28s -> 39.36s] the\n", | |
"[39.36s -> 39.58s] wonderful\n", | |
"[39.58s -> 39.82s] things\n", | |
"[39.82s -> 40.02s] on\n", | |
"[40.02s -> 40.22s] Radio\n", | |
"[40.22s -> 40.66s] 4.\n", | |
"[40.98s -> 41.38s] Or,\n", | |
"[41.70s -> 41.84s] you\n", | |
"[41.84s -> 41.98s] know,\n", | |
"[42.22s -> 42.36s] that\n", | |
"[42.36s -> 43.10s] terrifying\n", | |
"[43.10s -> 44.00s] dystopia,\n", | |
"[44.26s -> 44.34s] the\n", | |
"[44.34s -> 44.86s] animation\n", | |
"[44.86s -> 45.18s] Wall\n", | |
"[45.18s -> 45.48s] -E,\n", | |
"[45.68s -> 45.76s] where\n", | |
"[45.76s -> 45.90s] we\n", | |
"[45.90s -> 46.04s] all\n", | |
"[46.04s -> 46.16s] sit\n", | |
"[46.16s -> 46.30s] on\n", | |
"[46.30s -> 46.42s] our\n", | |
"[46.42s -> 46.74s] bottoms\n", | |
"[46.74s -> 46.94s] getting\n", | |
"[46.94s -> 47.48s] fatter,\n", | |
"[47.82s -> 48.12s] watching\n", | |
"[48.12s -> 49.02s] infomercials.\n", | |
"[49.02s -> 49.08s] I\n", | |
"[49.08s -> 49.18s] mean,\n", | |
"[49.24s -> 49.32s] what\n", | |
"[49.32s -> 49.48s] do\n", | |
"[49.48s -> 49.66s] you\n", | |
"[49.66s -> 49.98s] think\n", | |
"[49.98s -> 50.52s] the\n", | |
"[50.52s -> 50.82s] human\n", | |
"[50.82s -> 51.32s] condition\n", | |
"[51.32s -> 51.76s] leans\n", | |
"[51.76s -> 52.22s] towards?\n" | |
] | |
} | |
], | |
"source": [ | |
"# transcribes and displays results (word level)\n", | |
"segments, _ = model.transcribe(audio, language=\"en\", word_timestamps=True)\n", | |
"segments = list(segments)\n", | |
"\n", | |
"for segment_wl in segments:\n", | |
" for word in segment_wl.words:\n", | |
" print(\"[%.2fs -> %.2fs] %s\" % (word.start, word.end, word.word))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# create SRT subs and write to file\n", | |
"import math\n", | |
"\n", | |
"def convert_seconds_to_hms(seconds):\n", | |
" hours, remainder = divmod(seconds, 3600)\n", | |
" minutes, seconds = divmod(remainder, 60)\n", | |
" milliseconds = math.floor((seconds % 1) * 1000)\n", | |
" output = f\"{int(hours):02}:{int(minutes):02}:{int(seconds):02},{milliseconds:03}\"\n", | |
" return output\n", | |
"\n", | |
"count = 0\n", | |
"with open(f\"{audio}_word-level.srt\", 'w') as f: # enter subtitle file name\n", | |
" for segment in segments:\n", | |
" for word in segment.words:\n", | |
" count +=1\n", | |
" duration = f\"{convert_seconds_to_hms(word.start)} --> {convert_seconds_to_hms(word.end)}\\n\"\n", | |
" text = f\"{word.word.lstrip()}\\n\\n\"\n", | |
" f.write(f\"{count}\\n{duration}{text}\") # Write formatted string to the file" | |
] | |
} | |
], | |
"metadata": { | |
"accelerator": "GPU", | |
"colab": { | |
"provenance": [] | |
}, | |
"kernelspec": { | |
"display_name": "base", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.12.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment