Last active
January 15, 2023 17:04
-
-
Save alexpovel/8e0d051a0a552028258c779588f9d8ec to your computer and use it in GitHub Desktop.
Converting mp4 videos (like lecture notes) to text (transcribing) using Python, its `speech_recognition` libs, and `openai/whisper`. Made reproducible via Docker.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import logging | |
import sys | |
from pathlib import Path | |
from subprocess import run | |
import speech_recognition as sr | |
# One recognizer instance, shared by every file that gets processed.
RECOGNIZER = sr.Recognizer()

# Files to process: one path per stdin line. Surrounding whitespace is
# trimmed, blank lines are dropped, and duplicates collapse in the set.
files = set(map(Path, filter(None, map(str.strip, sys.stdin.read().split("\n")))))

# Language handed to the recognizer; taken from the first CLI argument.
target_language = sys.argv[1]
def convert_to_audio(file: Path, to_suffix: str) -> Path:
    """Extract the audio track of *file* into a sibling file ending in *to_suffix*.

    The conversion is delegated to the ``ffmpeg`` executable. If the target
    file already exists it is reused and ``ffmpeg`` is not invoked.

    Args:
        file: The input media file.
        to_suffix: Suffix of the audio file to create, including the leading
            dot (e.g. ``".flac"``).

    Returns:
        The path of the (new or pre-existing) audio file.

    Raises:
        subprocess.CalledProcessError: If ``ffmpeg`` exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    new_file = file.with_suffix(to_suffix)
    if new_file.exists():
        logging.warning(f"File {new_file} exists, skipping audio conversion.")
        return new_file

    logging.info(f"Converting {file} to {new_file}...")

    # Convert into a temporary sibling first and rename on success. Writing
    # straight to `new_file` would leave a truncated file behind if ffmpeg
    # fails or is interrupted, and the next run would then skip conversion
    # and use the broken file. The real suffix is kept last so the output
    # format can still be derived from the file name.
    partial_file = new_file.with_name(f"{new_file.stem}.partial{new_file.suffix}")
    try:
        run(
            [
                "ffmpeg",
                "-nostdin",  # never wait for interactive input
                "-y",  # overwrite a stale partial file from an earlier run
                "-i",
                str(file.absolute()),
                "-vn",  # drop the video stream, keep audio only
                str(partial_file.absolute()),
            ],
            check=True,
        )
        partial_file.replace(new_file)
    finally:
        # No-op after a successful rename; cleans up after a failure.
        partial_file.unlink(missing_ok=True)

    logging.info(f"Converted {file} to {new_file}.")
    return new_file
def process(file: Path) -> None:
    """Transcribe *file* into a sibling text file ending in ``.transcribed.txt``.

    If the transcript already exists, nothing is done: in particular no audio
    conversion is started. Otherwise the audio track is extracted to a
    ``.flac`` file via ``convert_to_audio``, loaded, recognised with
    ``RECOGNIZER.recognize_whisper`` using the module-level
    ``target_language``, and the resulting text is written out as UTF-8.

    Progress is logged at WARNING level, which is visible even when logging
    has not been configured.

    Args:
        file: The input media file.
    """
    text_file = file.with_suffix(".transcribed.txt")
    if text_file.exists():
        # Check this first: there is no point extracting audio for a file
        # whose transcript is already present.
        logging.warning(f"File {text_file} exists, skipping audio recognition.")
        return

    audio_file = convert_to_audio(file, to_suffix=".flac")
    logging.warning(f"Starting audio processing on {audio_file}...")

    logging.warning(f"Getting audio of {audio_file}...")
    with sr.AudioFile(str(audio_file)) as source:
        audio = RECOGNIZER.record(source)

    logging.warning(f"Got audio of {audio_file}, starting recognition...")
    text = RECOGNIZER.recognize_whisper(audio, language=target_language)
    logging.warning(f"Finished audio processing on {audio_file}.")

    # Explicit encoding: the platform default may be unable to represent
    # non-ASCII characters in the transcript.
    text_file.write_text(text, encoding="utf-8")
    logging.warning(f"Wrote results to {text_file}.")
def main():
    """Transcribe every file collected in the module-level ``files`` set, one by one."""
    # The root logger defaults to WARNING, which would silently drop the
    # INFO-level messages emitted during audio conversion.
    logging.basicConfig(level=logging.INFO)

    # `files` is a set, so its iteration order is arbitrary; sort it so runs
    # are reproducible and the logs are easy to follow.
    for file in sorted(files):
        process(file)

    # Can also do this via multiprocessing, however the recognition library itself seems to
    # be doing well utilizing multiple cores, so the following isn't really necessary.
    # from multiprocessing import Pool, cpu_count
    # with Pool(processes=cpu_count() - 1) as p:
    #     p.map(process, files)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
FROM python:3.10.8-bullseye

# System packages; ffmpeg is invoked by convert.py to extract the audio track.
# The apt package lists are removed in the same layer to keep the image small.
RUN apt-get update && apt-get install --yes --no-install-recommends \
        python3-pyaudio=0.2.11-1.3+b1 \
        ffmpeg=7:4.3.5-0+deb11u1 \
    && rm -rf /var/lib/apt/lists/*

# Alternatively, use git+https://github.com/openai/whisper.git
RUN pip install --no-cache-dir \
        speechrecognition==3.9.0 \
        whisper.ai==1.0.0.1

# The run commands invoke the script as /convert.py, so copy it there explicitly.
COPY convert.py /convert.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build the image, then transcribe every mp4 in ./videos as German.
docker build --tag transcript .
# The volume argument is quoted so a working directory containing spaces still mounts correctly.
docker run -v "${PWD}/videos:/videos" transcript bash -c 'ls /videos/*.mp4 | python /convert.py german'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Build the image and, if that succeeds, transcribe every mp4 in ./videos as English.
# The volume argument is quoted so a working directory containing spaces still mounts correctly.
docker build --tag transcript . && docker run -v "$(pwd)/videos:/videos" transcript bash -c 'ls /videos/*.mp4 | python /convert.py english'
$ docker --version
Docker version 20.10.21, build baeda1f
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Prerequisites are:

- a `videos` subdirectory

After conversion, the `videos` directory will contain new `flac` as well as `transcribed.txt` files. The `flac` files are intermediate files, required only because `speechrecognition` can't handle video files directly (yet?).