Created
October 26, 2023 10:53
-
-
Save arturmartins/1c78de3e8c21ffce81a17dc2f2181de4 to your computer and use it in GitHub Desktop.
Converts WEBVTT subtitles (vtt) to plain text.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Converts WEBVTT subtitles (vtt) to plain text. | |
It removes all time-related info as well as empty lines and consecutive duplicate lines.
""" | |
# Author: Artur Martins <[email protected]> | |
# Version: 1.0 | |
# Date: 2023-Oct-25 | |
import re | |
import argparse | |
import os | |
# Text encoding used both to read the WebVTT input and to write the text output.
ENCODING_TYPE = "utf-8"
# Signature line of a WebVTT file; lines equal to it are skipped during conversion.
HEADER = "WEBVTT"
def clean_line(line: str) -> str:
    """
    Remove all WebVTT tags and time codes from the given line.

    Tags are anything between ``<`` and the next ``>`` (for example ``<c>``,
    ``<v Speaker>`` or inline cue timestamps such as ``<00:00:01.000>``).
    Bare timestamps are removed in both forms WebVTT allows: with an hours
    field of two or more digits (``hh:mm:ss.ttt``) and without hours
    (``mm:ss.ttt``).

    Args:
        line (str): The line of text to clean.

    Returns:
        str: The cleaned line with all tags and time codes removed and
        leading/trailing whitespace stripped.
    """
    # Tags go first so inline "<00:00:01.000>" markers disappear as a whole.
    cleaned_line = re.sub(r"<.*?>", "", line)
    # The hours field is optional in WebVTT and may have more than two digits.
    cleaned_line = re.sub(r"(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}", "", cleaned_line)
    return cleaned_line.strip()
def convert_webvtt_to_text(input_path: str, output_path: str, verbose: bool) -> None:
    """
    Convert a WebVTT file to plain text.

    The input is read completely before the output file is opened, so the
    conversion still works when both paths point to the same file. Cue timing
    lines (containing ``-->``), the ``WEBVTT`` signature line and empty lines
    are skipped; every remaining line is passed through ``clean_line`` and
    written unless it is empty or identical to the previously written line.

    Args:
        input_path (str): The path to the WebVTT input file.
        output_path (str): The path to the output text file.
        verbose (bool): If True, print the cleaned lines as they are written.

    Returns:
        None
    """
    # Read first: opening the output in "w" mode truncates it immediately,
    # which would wipe the input when both paths are the same file.
    with open(input_path, "r", encoding=ENCODING_TYPE) as infile:
        lines = infile.readlines()

    # The signature may be followed by a space or tab and free text,
    # e.g. "WEBVTT - My title".
    header_prefixes = (HEADER + " ", HEADER + "\t")
    last_written_line = ""

    with open(output_path, "w", encoding=ENCODING_TYPE) as outfile:
        for index, raw_line in enumerate(lines):
            line = raw_line.strip()
            if index == 0:
                # A UTF-8 byte order mark is not whitespace, so strip() keeps
                # it and the signature comparison below would never match.
                line = line.lstrip("\ufeff").strip()
                if line.startswith(header_prefixes):
                    continue
            # Skip time lines or WebVTT header
            if "-->" in line or line == HEADER:
                continue
            # Skip empty lines
            if not line:
                continue
            cleaned_line = clean_line(line)
            # Only consecutive repeats are dropped; a line may reappear later.
            if cleaned_line and cleaned_line != last_written_line:
                if verbose:
                    print(f"Writing: {cleaned_line}")
                outfile.write(cleaned_line + "\n")
                last_written_line = cleaned_line
def default_output_path(input_path: str) -> str:
    """Return ``input_path`` with its final extension replaced by ``.txt``."""
    base_name = os.path.splitext(input_path)[0]
    return f"{base_name}.txt"


def is_same_path(first: str, second: str) -> bool:
    """
    Tell whether two paths resolve to the same absolute path.

    The comparison is purely textual (absolute, normalized, case-normalized
    where the platform requires it); symbolic links are not followed.
    """
    normalized_first = os.path.normcase(os.path.abspath(first))
    normalized_second = os.path.normcase(os.path.abspath(second))
    return normalized_first == normalized_second


def main(argv=None) -> None:
    """
    Parse the command line and run the conversion.

    Args:
        argv: Argument list to parse; None makes argparse read ``sys.argv``.

    Raises:
        SystemExit: On invalid arguments, or when the output path would
            overwrite the input file.
    """
    parser = argparse.ArgumentParser(description="Convert WebVTT to text.")
    parser.add_argument(
        "-i", "--input", required=True, help="Path to the input WebVTT file."
    )
    parser.add_argument(
        "-o",
        "--output",
        help="Path to the output text file. Defaults to input file name with .txt extension.",
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Enable verbose output."
    )
    args = parser.parse_args(argv)

    if args.output is None:
        args.output = default_output_path(args.input)

    # An input that already ends in ".txt" makes the default output equal to
    # the input; refuse instead of overwriting the source file.
    if is_same_path(args.input, args.output):
        parser.error(
            "output path is the same as the input path; "
            "choose a different file with -o/--output."
        )

    if args.verbose:
        print(f"Converting {args.input} to {args.output}...")
    convert_webvtt_to_text(args.input, args.output, args.verbose)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Nice code - can I reuse this in a package?
What would be proper attribution?