arturmartins · October 26, 2023 10:53 · epogrebnyak · Oct 26, 2023
diff --git a/webvtt2txt.py b/webvtt2txt.py
 #!/usr/bin/env python3

 """
 Converts WEBVTT subtitles (vtt) to plain text.

 It removes all time-related information as well as empty lines and consecutive duplicate lines.
 """
 # Author: Artur Martins <[email protected]>
 # Version: 1.0
 # Date: 2023-Oct-25

 import re
 import argparse
 import os

 # Text encoding used both to read the input file and to write the output file.
 ENCODING_TYPE = "utf-8"
 # Signature line of a WebVTT file; it is skipped instead of being written out.
 HEADER = "WEBVTT"


 def clean_line(line: str) -> str:
    """
    Remove all WebVTT tags and time codes from the given line.

    Tags (``<c>``, ``<v Name>``, inline ``<00:00:01.000>`` stamps, ...) are
    dropped first, then bare timestamps, and finally character references
    such as ``&amp;`` are decoded so the result is real plain text.

    Args:
        line (str): The line of text to clean.

    Returns:
        str: The cleaned line with all tags and time codes removed,
        character references decoded and leading/trailing whitespace
        stripped.
    """
    import html  # local import keeps the block self-contained

    # Non-greedy match so the text between two tags survives.
    cleaned_line = re.sub(r"<.*?>", "", line)
    # WebVTT timestamps are "[hh:]mm:ss.ttt": the hours part is optional and
    # may be longer than two digits.  The look-behind stops the pattern from
    # biting into the middle of a longer digit run.
    cleaned_line = re.sub(
        r"(?<!\d)(?:\d+:)?\d{2}:\d{2}\.\d{3}", "", cleaned_line
    )
    # Decode entities only after tag removal, so an escaped "&lt;b&gt;" is
    # kept as the literal text "<b>" instead of being stripped as a tag.
    return html.unescape(cleaned_line).strip()


 def _extract_cue_text(block: list, is_first_block: bool) -> list:
    """
    Pick the cue-text lines out of one blank-line-delimited WebVTT block.

    Args:
        block (list): The stripped, non-empty lines of the block (never empty).
        is_first_block (bool): True for the first block of the file, the only
            place where the "WEBVTT" header block may appear.

    Returns:
        list: The lines that are subtitle text, possibly empty.
    """
    timing_index = next(
        (index for index, text in enumerate(block) if "-->" in text), None
    )
    if timing_index is not None:
        # Whatever precedes the timing line is a cue identifier (or the file
        # header when no blank line separates it from the first cue).
        candidates = block[timing_index + 1 :]
    else:
        first = block[0]
        is_header = is_first_block and (
            first == HEADER or first.startswith((HEADER + " ", HEADER + "\t"))
        )
        is_metadata = first in ("NOTE", "STYLE", "REGION") or first.startswith(
            ("NOTE ", "NOTE\t")
        )
        if is_header or is_metadata:
            # Header metadata ("Kind: captions"), comments, stylesheets and
            # region definitions are not subtitle text.
            return []
        # Not a recognisable WebVTT block: stay lenient and keep its lines.
        candidates = block
    # Stray timing lines and a bare signature line are never emitted.
    return [text for text in candidates if "-->" not in text and text != HEADER]


 def _iter_cue_text(lines):
    """Yield the subtitle text lines of a WebVTT document, block by block."""
    block = []
    is_first_block = True
    for number, raw_line in enumerate(lines):
        if number == 0:
            # A UTF-8 byte order mark is not whitespace, so strip() alone
            # would leave it glued to the "WEBVTT" signature.
            raw_line = raw_line.lstrip("\ufeff")
        line = raw_line.strip()
        if line:
            block.append(line)
            continue
        if block:
            yield from _extract_cue_text(block, is_first_block)
            block = []
            is_first_block = False
    if block:
        # The last block is usually not followed by a blank line.
        yield from _extract_cue_text(block, is_first_block)


 def convert_webvtt_to_text(input_path: str, output_path: str, verbose: bool) -> None:
    """
    Convert a WebVTT file to plain text.

    The input is processed one blank-line-delimited block at a time. The
    header block (signature, optional title and metadata), NOTE/STYLE/REGION
    blocks, cue identifiers and timing lines are discarded. The remaining
    lines are cleaned with ``clean_line`` and written out, skipping empty
    results and lines identical to the previously written one.

    Args:
        input_path (str): The path to the WebVTT input file.
        output_path (str): The path to the output text file. It is opened
            for writing (and therefore truncated) immediately, so it must
            not be the same file as ``input_path``.
        verbose (bool): If True, print the cleaned lines as they are written.

    Returns:
        None
    """
    last_written_line = ""
    with open(input_path, "r", encoding=ENCODING_TYPE) as infile, open(
        output_path, "w", encoding=ENCODING_TYPE
    ) as outfile:
        # Iterate the file object directly: no need to hold every line in memory.
        for text in _iter_cue_text(infile):
            cleaned_line = clean_line(text)
            # Captions often repeat the previous line; write each run only once.
            if cleaned_line and cleaned_line != last_written_line:
                if verbose:
                    print(f"Writing: {cleaned_line}")
                outfile.write(cleaned_line + "\n")
                last_written_line = cleaned_line


 if __name__ == "__main__":
    # Command-line entry point: parse the arguments, derive the default
    # output path and run the conversion.
    parser = argparse.ArgumentParser(description="Convert WebVTT to text.")
    parser.add_argument(
        "-i", "--input", required=True, help="Path to the input WebVTT file."
    )
    parser.add_argument(
        "-o",
        "--output",
        help="Path to the output text file. Defaults to input file name with .txt extension.",
    )

    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Enable verbose output."
    )

    args = parser.parse_args()

    if args.output is None:
        base_name = os.path.splitext(args.input)[0]
        args.output = f"{base_name}.txt"

    # The output file is truncated as soon as it is opened, so writing onto
    # the input (e.g. the default name for an input that already ends in
    # ".txt") would wipe the subtitles before they are read.
    if os.path.realpath(args.output) == os.path.realpath(args.input):
        parser.error(
            "output path must differ from the input path; "
            "use -o/--output to choose another file."
        )

    if args.verbose:
        print(f"Converting {args.input} to {args.output}...")

    convert_webvtt_to_text(args.input, args.output, args.verbose)
	#!/usr/bin/env python3

	"""
	Converts WEBVTT subtitles (vtt) to plain text.

	It removes all time-related information as well as empty lines and consecutive duplicate lines.
	"""
	# Author: Artur Martins <[email protected]>
	# Version: 1.0
	# Date: 2023-Oct-25

	import re
	import argparse
	import os

	# Text encoding used both to read the input file and to write the output file.
	ENCODING_TYPE = "utf-8"
	# Signature line of a WebVTT file; it is skipped instead of being written out.
	HEADER = "WEBVTT"


	def clean_line(line: str) -> str:
	    """
	    Remove all WebVTT tags and time codes from the given line.

	    Tags (``<c>``, ``<v Name>``, inline ``<00:00:01.000>`` stamps, ...) are
	    dropped first, then bare timestamps, and finally character references
	    such as ``&amp;`` are decoded so the result is real plain text.

	    Args:
	        line (str): The line of text to clean.

	    Returns:
	        str: The cleaned line with all tags and time codes removed,
	        character references decoded and leading/trailing whitespace
	        stripped.
	    """
	    import html  # local import keeps the block self-contained

	    # Non-greedy match so the text between two tags survives.
	    cleaned_line = re.sub(r"<.*?>", "", line)
	    # WebVTT timestamps are "[hh:]mm:ss.ttt": the hours part is optional and
	    # may be longer than two digits.  The look-behind stops the pattern from
	    # biting into the middle of a longer digit run.
	    cleaned_line = re.sub(
	        r"(?<!\d)(?:\d+:)?\d{2}:\d{2}\.\d{3}", "", cleaned_line
	    )
	    # Decode entities only after tag removal, so an escaped "&lt;b&gt;" is
	    # kept as the literal text "<b>" instead of being stripped as a tag.
	    return html.unescape(cleaned_line).strip()


	def _extract_cue_text(block: list, is_first_block: bool) -> list:
	    """
	    Pick the cue-text lines out of one blank-line-delimited WebVTT block.

	    Args:
	        block (list): The stripped, non-empty lines of the block (never empty).
	        is_first_block (bool): True for the first block of the file, the only
	            place where the "WEBVTT" header block may appear.

	    Returns:
	        list: The lines that are subtitle text, possibly empty.
	    """
	    timing_index = next(
	        (index for index, text in enumerate(block) if "-->" in text), None
	    )
	    if timing_index is not None:
	        # Whatever precedes the timing line is a cue identifier (or the file
	        # header when no blank line separates it from the first cue).
	        candidates = block[timing_index + 1 :]
	    else:
	        first = block[0]
	        is_header = is_first_block and (
	            first == HEADER or first.startswith((HEADER + " ", HEADER + "\t"))
	        )
	        is_metadata = first in ("NOTE", "STYLE", "REGION") or first.startswith(
	            ("NOTE ", "NOTE\t")
	        )
	        if is_header or is_metadata:
	            # Header metadata ("Kind: captions"), comments, stylesheets and
	            # region definitions are not subtitle text.
	            return []
	        # Not a recognisable WebVTT block: stay lenient and keep its lines.
	        candidates = block
	    # Stray timing lines and a bare signature line are never emitted.
	    return [text for text in candidates if "-->" not in text and text != HEADER]


	def _iter_cue_text(lines):
	    """Yield the subtitle text lines of a WebVTT document, block by block."""
	    block = []
	    is_first_block = True
	    for number, raw_line in enumerate(lines):
	        if number == 0:
	            # A UTF-8 byte order mark is not whitespace, so strip() alone
	            # would leave it glued to the "WEBVTT" signature.
	            raw_line = raw_line.lstrip("\ufeff")
	        line = raw_line.strip()
	        if line:
	            block.append(line)
	            continue
	        if block:
	            yield from _extract_cue_text(block, is_first_block)
	            block = []
	            is_first_block = False
	    if block:
	        # The last block is usually not followed by a blank line.
	        yield from _extract_cue_text(block, is_first_block)


	def convert_webvtt_to_text(input_path: str, output_path: str, verbose: bool) -> None:
	    """
	    Convert a WebVTT file to plain text.

	    The input is processed one blank-line-delimited block at a time. The
	    header block (signature, optional title and metadata), NOTE/STYLE/REGION
	    blocks, cue identifiers and timing lines are discarded. The remaining
	    lines are cleaned with ``clean_line`` and written out, skipping empty
	    results and lines identical to the previously written one.

	    Args:
	        input_path (str): The path to the WebVTT input file.
	        output_path (str): The path to the output text file. It is opened
	            for writing (and therefore truncated) immediately, so it must
	            not be the same file as ``input_path``.
	        verbose (bool): If True, print the cleaned lines as they are written.

	    Returns:
	        None
	    """
	    last_written_line = ""
	    with open(input_path, "r", encoding=ENCODING_TYPE) as infile, open(
	        output_path, "w", encoding=ENCODING_TYPE
	    ) as outfile:
	        # Iterate the file object directly: no need to hold every line in memory.
	        for text in _iter_cue_text(infile):
	            cleaned_line = clean_line(text)
	            # Captions often repeat the previous line; write each run only once.
	            if cleaned_line and cleaned_line != last_written_line:
	                if verbose:
	                    print(f"Writing: {cleaned_line}")
	                outfile.write(cleaned_line + "\n")
	                last_written_line = cleaned_line


	if __name__ == "__main__":
	    # Command-line entry point: parse the arguments, derive the default
	    # output path and run the conversion.
	    parser = argparse.ArgumentParser(description="Convert WebVTT to text.")
	    parser.add_argument(
	        "-i", "--input", required=True, help="Path to the input WebVTT file."
	    )
	    parser.add_argument(
	        "-o",
	        "--output",
	        help="Path to the output text file. Defaults to input file name with .txt extension.",
	    )

	    parser.add_argument(
	        "-v", "--verbose", action="store_true", help="Enable verbose output."
	    )

	    args = parser.parse_args()

	    if args.output is None:
	        base_name = os.path.splitext(args.input)[0]
	        args.output = f"{base_name}.txt"

	    # The output file is truncated as soon as it is opened, so writing onto
	    # the input (e.g. the default name for an input that already ends in
	    # ".txt") would wipe the subtitles before they are read.
	    if os.path.realpath(args.output) == os.path.realpath(args.input):
	        parser.error(
	            "output path must differ from the input path; "
	            "use -o/--output to choose another file."
	        )

	    if args.verbose:
	        print(f"Converting {args.input} to {args.output}...")

	    convert_webvtt_to_text(args.input, args.output, args.verbose)