Airbus5717 · February 8, 2024 11:06
diff --git a/main.py b/main.py
 # source https://gist.github.com/glasslion/b2fcad16bc8a9630dbd7a945ab5ebf5e
 import os

 def find_files(directory, extension):
    """
    Collect every file under a directory tree whose name ends with a suffix.

    Args:
    - directory (str): Root of the tree to walk; subdirectories are included.
    - extension (str): Suffix a file name must end with (e.g., '.vtt').

    Returns:
    - list: One path per matching file, formed by joining the directory the
      file was found in with the file name, in the order os.walk visits them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith(extension)
    ]

 # Scan the current working directory tree for subtitle files.
 # NOTE: this runs whenever the module is loaded; main() reads `found_files`.
 file_extension = '.vtt'
 directory_path = './'
 found_files = find_files(directory_path, file_extension)
 print("Found files with extension '{}':".format(file_extension))
 # The individual paths are not listed here; main() prints each one as it
 # is processed.



 """
 Convert YouTube subtitles(vtt) to human readable text.

 Download only subtitles from YouTube with youtube-dl:
 youtube-dl  --skip-download --convert-subs vtt <video_url>

 Note that default subtitle format provided by YouTube is ass, which is hard
 to process with simple regex. Luckily youtube-dl can convert ass to vtt, which
 is easier to process.

 To convert all vtt files inside a directory:
 find . -name "*.vtt" -exec python vtt2text.py {} \;
 """

 import sys
 import re


 def remove_tags(text):
    """
    Strip vtt markup from the raw file contents.

    Removes closing and opening <c> tags (with an optional .color class) and
    inline <HH:MM:SS.mmm> markers, rewrites cue timing lines that end in
    'align:start position:0%' to their leading two colon-separated fields,
    and empties lines that contain only whitespace.

    Args:
    - text (str): Full contents of a vtt file.

    Returns:
    - str: The text with markup removed.
    """
    markup_patterns = (
        r'</c>',
        r'<c(\.color\w+)?>',
        r'<\d{2}:\d{2}:\d{2}\.\d{3}>',
    )
    cleaned = text
    # Applied one after another, in this order.
    for pattern in markup_patterns:
        cleaned = re.compile(pattern).sub('', cleaned)

    # Reduce each cue timing line to the first two fields of its start time.
    cue_timing = re.compile(
        r'(\d{2}:\d{2}):\d{2}\.\d{3} --> .* align:start position:0%'
    )
    cleaned = cue_timing.sub(r'\g<1>', cleaned)

    return re.sub(r'^\s+$', '', cleaned, flags=re.MULTILINE)

 def remove_header(lines):
    """
    Remove vtt file header

    The header is considered to end at a marker line that is exactly '##'
    or 'Language: en'. When both are present, the one that appears further
    down is used, so nothing between the two markers is left in the result.

    Args:
    - lines (list): Lines of the file, without line terminators.

    Returns:
    - list: The lines after the header, or all lines if no marker is found.
    """
    header_marks = ('##', 'Language: en')
    # Take the furthest marker instead of whichever is listed last; otherwise
    # a '##' that follows 'Language: en' would remain in the output.
    pos = max(
        (lines.index(mark) for mark in header_marks if mark in lines),
        default=-1,
    )
    return lines[pos + 1:]


 def merge_duplicates(lines):
    """
    Remove duplicated subtitles. Duplicates are always adjacent.

    Empty lines are dropped. A timestamp line (two digits, a colon, two
    digits) is yielded only if it differs from the previously yielded
    timestamp; any other line is yielded only if it differs from the
    previously yielded caption.

    Args:
    - lines (iterable): Cleaned lines, without line terminators.

    Yields:
    - str: The surviving timestamp and caption lines, in input order.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    timestamp = re.compile(r'^\d{2}:\d{2}$')
    last_timestamp = ''
    last_cap = ''
    for line in lines:
        if line == "":
            continue
        if timestamp.match(line):
            if line != last_timestamp:
                yield line
                last_timestamp = line
        elif line != last_cap:
            yield line
            last_cap = line


 def merge_short_lines(lines):
    """
    Join consecutive short caption lines into longer ones.

    Caption lines are accumulated, separated by single spaces, while the
    joined text stays under 80 characters. An empty line or a timestamp line
    (two digits, a colon, two digits) is yielded with a leading newline, and
    any caption text collected so far is emitted first so that captions stay
    ahead of the timestamp that follows them.

    Args:
    - lines (iterable): Caption and timestamp lines, without terminators.

    Yields:
    - str: Merged caption lines and '\\n'-prefixed separator lines.
    """
    timestamp = re.compile(r'^\d{2}:\d{2}$')
    max_width = 80
    buffer = ''
    for line in lines:
        if line == "" or timestamp.match(line):
            # Flush pending text before the separator; otherwise it would be
            # emitted after a later timestamp and end up out of order.
            if buffer:
                yield buffer
                buffer = ''
            yield '\n' + line
            continue

        if not buffer:
            # Start a new run; nothing is emitted for an empty buffer.
            buffer = line
        elif len(buffer) + 1 + len(line) < max_width:
            buffer += ' ' + line
        else:
            yield buffer
            buffer = line
    if buffer:
        yield buffer



 def main():
    """
    Convert every vtt file listed in the module-level `found_files` to text.

    For each path the file is read as UTF-8, passed through remove_tags,
    remove_header, merge_duplicates and merge_short_lines, and the result is
    written next to the source with the '.vtt' suffix replaced by '.txt'.
    Progress is printed to standard output.
    """
    print(len(found_files))
    for file in found_files:
        print("file path: "+ file)
        with open(file, 'r', encoding='utf-8') as f:
            text = f.read()
        # Escape the dot so that only a literal '.vtt' suffix is replaced.
        txt_name = re.sub(r'\.vtt$', '.txt', file)
        print("text file path: "+ txt_name)

        lines = remove_header(remove_tags(text).splitlines())
        # Materialize before opening the output so that a failure during
        # processing does not leave a truncated text file behind.
        lines = list(merge_short_lines(merge_duplicates(lines)))

        # Write with the encoding used for reading; the platform default may
        # be unable to encode characters that were read as UTF-8.
        with open(txt_name, 'w', encoding='utf-8') as f:
            f.writelines(line + "\n" for line in lines)



 # Run the conversion only when this file is executed as a script.
 if __name__ == "__main__":
    main()
	# source https://gist.github.com/glasslion/b2fcad16bc8a9630dbd7a945ab5ebf5e
	import os

	def find_files(directory, extension):
	    """
	    Collect every file under a directory tree whose name ends with a suffix.

	    Args:
	    - directory (str): Root of the tree to walk; subdirectories are included.
	    - extension (str): Suffix a file name must end with (e.g., '.vtt').

	    Returns:
	    - list: One path per matching file, formed by joining the directory the
	      file was found in with the file name, in the order os.walk visits them.
	    """
	    return [
	        os.path.join(folder, name)
	        for folder, _subdirs, names in os.walk(directory)
	        for name in names
	        if name.endswith(extension)
	    ]

	# Scan the current working directory tree for subtitle files.
	# NOTE: this runs whenever the module is loaded; main() reads `found_files`.
	file_extension = '.vtt'
	directory_path = './'
	found_files = find_files(directory_path, file_extension)
	print("Found files with extension '{}':".format(file_extension))
	# The individual paths are not listed here; main() prints each one as it
	# is processed.



	"""
	Convert YouTube subtitles(vtt) to human readable text.

	Download only subtitles from YouTube with youtube-dl:
	youtube-dl --skip-download --convert-subs vtt <video_url>

	Note that default subtitle format provided by YouTube is ass, which is hard
	to process with simple regex. Luckily youtube-dl can convert ass to vtt, which
	is easier to process.

	To convert all vtt files inside a directory:
	find . -name "*.vtt" -exec python vtt2text.py {} \;
	"""

	import sys
	import re


	def remove_tags(text):
	    """
	    Strip vtt markup from the raw file contents.

	    Removes closing and opening <c> tags (with an optional .color class) and
	    inline <HH:MM:SS.mmm> markers, rewrites cue timing lines that end in
	    'align:start position:0%' to their leading two colon-separated fields,
	    and empties lines that contain only whitespace.

	    Args:
	    - text (str): Full contents of a vtt file.

	    Returns:
	    - str: The text with markup removed.
	    """
	    markup_patterns = (
	        r'</c>',
	        r'<c(\.color\w+)?>',
	        r'<\d{2}:\d{2}:\d{2}\.\d{3}>',
	    )
	    cleaned = text
	    # Applied one after another, in this order.
	    for pattern in markup_patterns:
	        cleaned = re.compile(pattern).sub('', cleaned)

	    # Reduce each cue timing line to the first two fields of its start time.
	    cue_timing = re.compile(
	        r'(\d{2}:\d{2}):\d{2}\.\d{3} --> .* align:start position:0%'
	    )
	    cleaned = cue_timing.sub(r'\g<1>', cleaned)

	    return re.sub(r'^\s+$', '', cleaned, flags=re.MULTILINE)

	def remove_header(lines):
	    """
	    Remove vtt file header

	    The header is considered to end at a marker line that is exactly '##'
	    or 'Language: en'. When both are present, the one that appears further
	    down is used, so nothing between the two markers is left in the result.

	    Args:
	    - lines (list): Lines of the file, without line terminators.

	    Returns:
	    - list: The lines after the header, or all lines if no marker is found.
	    """
	    header_marks = ('##', 'Language: en')
	    # Take the furthest marker instead of whichever is listed last; otherwise
	    # a '##' that follows 'Language: en' would remain in the output.
	    pos = max(
	        (lines.index(mark) for mark in header_marks if mark in lines),
	        default=-1,
	    )
	    return lines[pos + 1:]


	def merge_duplicates(lines):
	    """
	    Remove duplicated subtitles. Duplicates are always adjacent.

	    Empty lines are dropped. A timestamp line (two digits, a colon, two
	    digits) is yielded only if it differs from the previously yielded
	    timestamp; any other line is yielded only if it differs from the
	    previously yielded caption.

	    Args:
	    - lines (iterable): Cleaned lines, without line terminators.

	    Yields:
	    - str: The surviving timestamp and caption lines, in input order.
	    """
	    # Raw string: '\d' inside a plain literal is an invalid escape sequence
	    # and triggers a warning on recent Python versions.
	    timestamp = re.compile(r'^\d{2}:\d{2}$')
	    last_timestamp = ''
	    last_cap = ''
	    for line in lines:
	        if line == "":
	            continue
	        if timestamp.match(line):
	            if line != last_timestamp:
	                yield line
	                last_timestamp = line
	        elif line != last_cap:
	            yield line
	            last_cap = line


	def merge_short_lines(lines):
	    """
	    Join consecutive short caption lines into longer ones.

	    Caption lines are accumulated, separated by single spaces, while the
	    joined text stays under 80 characters. An empty line or a timestamp line
	    (two digits, a colon, two digits) is yielded with a leading newline, and
	    any caption text collected so far is emitted first so that captions stay
	    ahead of the timestamp that follows them.

	    Args:
	    - lines (iterable): Caption and timestamp lines, without terminators.

	    Yields:
	    - str: Merged caption lines and '\\n'-prefixed separator lines.
	    """
	    timestamp = re.compile(r'^\d{2}:\d{2}$')
	    max_width = 80
	    buffer = ''
	    for line in lines:
	        if line == "" or timestamp.match(line):
	            # Flush pending text before the separator; otherwise it would be
	            # emitted after a later timestamp and end up out of order.
	            if buffer:
	                yield buffer
	                buffer = ''
	            yield '\n' + line
	            continue

	        if not buffer:
	            # Start a new run; nothing is emitted for an empty buffer.
	            buffer = line
	        elif len(buffer) + 1 + len(line) < max_width:
	            buffer += ' ' + line
	        else:
	            yield buffer
	            buffer = line
	    if buffer:
	        yield buffer



	def main():
	    """
	    Convert every vtt file listed in the module-level `found_files` to text.

	    For each path the file is read as UTF-8, passed through remove_tags,
	    remove_header, merge_duplicates and merge_short_lines, and the result is
	    written next to the source with the '.vtt' suffix replaced by '.txt'.
	    Progress is printed to standard output.
	    """
	    print(len(found_files))
	    for file in found_files:
	        print("file path: "+ file)
	        with open(file, 'r', encoding='utf-8') as f:
	            text = f.read()
	        # Escape the dot so that only a literal '.vtt' suffix is replaced.
	        txt_name = re.sub(r'\.vtt$', '.txt', file)
	        print("text file path: "+ txt_name)

	        lines = remove_header(remove_tags(text).splitlines())
	        # Materialize before opening the output so that a failure during
	        # processing does not leave a truncated text file behind.
	        lines = list(merge_short_lines(merge_duplicates(lines)))

	        # Write with the encoding used for reading; the platform default may
	        # be unable to encode characters that were read as UTF-8.
	        with open(txt_name, 'w', encoding='utf-8') as f:
	            f.writelines(line + "\n" for line in lines)



	# Run the conversion only when this file is executed as a script.
	if __name__ == "__main__":
	    main()
No results found