Last active
April 8, 2023 22:43
-
-
Save mill1000/4cc4fc0de1eec2a29080c3b6d0b074b7 to your computer and use it in GitHub Desktop.
Bulk audio file transcription using Whisper.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import argparse
import datetime
import math
import os
import re
import sys

import whisper
def build_parser():
    """Create the command-line argument parser for this script.

    Options:
        --model   Whisper model name passed to ``whisper.load_model``
                  (default ``base.en``).
        --device  ``cuda`` or ``cpu`` (default ``cuda``).
        --rename  Flag; when set, files are renamed using the transcribed text.
        --output  File the summary is written to (default: standard output).
        files     One or more audio files to transcribe.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        description="Transcribe files using Whisper and output a summary.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--model", help="Whisper model to use.", default="base.en")
    parser.add_argument(
        "--device", help="Whisper device to use.",
        default="cuda", choices=["cuda", "cpu"])
    parser.add_argument(
        "--rename", help="Rename files with the transcribed text.",
        action="store_true")
    # Open the output file explicitly as UTF-8 so that writing non-ASCII
    # transcriptions does not depend on the platform's locale encoding.
    parser.add_argument(
        "--output", help="File to output to.",
        type=argparse.FileType("w", encoding="utf-8"), default=sys.stdout)
    parser.add_argument("files", help="Audio files to transcribe.", nargs="+")
    return parser


if __name__ == "__main__":
    # Argument parsing
    args = build_parser().parse_args()
def convert_time(time):
    """Convert a float representation of seconds to a time object.

    Args:
        time: Offset in seconds (int or float), e.g. a segment start or end.

    Returns:
        A ``datetime.time`` holding the offset as hours, minutes, seconds and
        microseconds.

    Raises:
        ValueError: If the offset is negative or is 24 hours or more, because
            ``datetime.time`` cannot represent it.
    """
    # Work in whole microseconds so float error (e.g. x.289999...) is rounded
    # once instead of being truncated, and so a fraction that rounds up
    # carries into the seconds.
    total_microseconds = round(time * 1e6)
    total_seconds, microseconds = divmod(total_microseconds, 1_000_000)
    total_minutes, seconds = divmod(total_seconds, 60)
    # Carry minutes into hours; datetime.time rejects minute values >= 60.
    hours, minutes = divmod(total_minutes, 60)
    return datetime.time(hour=hours, minute=minutes, second=seconds,
                         microsecond=microseconds)
def eprint(*values, **options):
    """Print to stderr.

    Accepts the same positional and keyword arguments as ``print`` except
    ``file``, which is always ``sys.stderr``.
    """
    print(*values, file=sys.stderr, **options)
def oprint(*values, **options):
    """Print to the specified output file.

    Writes to ``args.output``, the file object produced by the ``--output``
    command-line option. Accepts the same positional and keyword arguments as
    ``print`` except ``file``.
    """
    print(*values, file=args.output, **options)
def build_renamed_path(file, text, max_length=32):
    """Build the new path for ``file`` with a prefix taken from ``text``.

    Args:
        file: Path of the file to be renamed.
        text: Transcribed text; its first ``max_length`` characters become the
            file name prefix.
        max_length: Maximum number of characters of ``text`` to use.

    Returns:
        ``"<directory>/<prefix> - <original name>"``, or ``file`` unchanged
        when no usable prefix remains after sanitizing.
    """
    directory, name = os.path.split(file)
    # Replace anything that is not a word character or harmless punctuation
    # (path separators, ':', '?', ...) so the prefix is a single valid file
    # name component, then trim characters that are awkward at either end.
    prefix = re.sub(r"[^\w .,'()-]+", "_", text[:max_length]).strip(" ._")
    if not prefix:
        return file
    # os.path.join keeps a bare file name relative; formatting the path as
    # "{directory}/..." would point at the filesystem root when directory
    # is empty.
    return os.path.normpath(os.path.join(directory, f"{prefix} - {name}"))


if __name__ == "__main__":
    eprint(f"Transcribing {len(args.files)} files.")

    # Load the selected Whisper model
    eprint(f"Loading model '{args.model}' using device '{args.device}'.")
    model = whisper.load_model(args.model, device=args.device)

    # Suppress warnings from Whisper by selecting proper FP16 mode
    fp16 = args.device == "cuda"

    # Convert each file
    for file in args.files:
        eprint(f"Transcribing '{file}'.")
        oprint(f"{file}")

        if not os.path.exists(file):
            oprint("File does not exist.")
            eprint(f"File '{file}' does not exist.")
            continue

        # Transcribe the file
        result = model.transcribe(file, fp16=fp16)

        # Print the complete text
        complete_text = result["text"].strip()
        oprint("Full Text:")
        oprint(f" {complete_text}")

        # Print each segment with timestamps
        oprint("Segments:")
        for segment in result["segments"]:
            text = segment["text"].strip()
            start = convert_time(segment["start"])
            end = convert_time(segment["end"])
            timespan = f"[{start.isoformat(timespec='milliseconds')} -> {end.isoformat(timespec='milliseconds')}]"
            oprint(f" {timespan}: {text}")
        oprint("")

        if args.rename:
            new_file = build_renamed_path(file, complete_text)
            if new_file == file:
                eprint(f"Not renaming '{file}': transcription produced no usable text.")
                continue
            # Report the outcome only after the rename was attempted, and keep
            # processing the remaining files if this one fails.
            try:
                os.rename(file, new_file)
            except OSError as error:
                eprint(f"Failed to rename '{file}' to '{new_file}': {error}")
            else:
                eprint(f"Renamed '{file}' to '{new_file}'.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment