rBrenick · September 12, 2024 23:32 · rBrenick · Dec 29, 2022
diff --git a/split_subtitle_file_into_multiline.py b/split_subtitle_file_into_multiline.py
 import sys
 import argparse

 """
 Example use case:

 python split_subtitle_file_into_multiline.py SUBTITLE_FILE_PATH.srt -o OUTPUT_FILE_PATH.srt --max_line_length=42 --comma_split_percent=75

 If you don't specify an output path, the content of the input file is replaced in place.

 Running this multiple times on the same file is not recommended: the script currently does not support reprocessing and might introduce strange line splits.

 """


 def split_text_into_multiline(segment_text, max_line_length, comma_split_threshold):
    """Break one subtitle text line into several shorter lines.

    The text is split on single spaces and re-assembled word by word. A new
    line is started when either

    * the current line ends with a comma and is longer than
      ``comma_split_threshold`` characters, or
    * appending the next word would make the line (ignoring "." characters)
      reach ``max_line_length`` characters, i.e. lines are kept strictly
      shorter than ``max_line_length``.

    A trailing line terminator ("\\n" / "\\r\\n") is removed before measuring
    and re-attached to the result, so it never counts as a visible character.

    Args:
        segment_text: the subtitle text line, with or without its terminator.
        max_line_length: length at which a line is considered too long.
        comma_split_threshold: minimum length a comma-terminated line must
            exceed before the comma forces a line break.

    Returns:
        The re-assembled text, lines joined with "\\n", ending with the same
        terminator that ``segment_text`` ended with.
    """
    # keep the terminator aside: it is not a visible character and must not
    # make the last word look longer than it is
    text = segment_text.rstrip('\r\n')
    terminator = segment_text[len(text):]

    words = text.split(' ')
    lines = [words[0]]

    for word in words[1:]:
        current_line = lines[-1]

        # start a new line if the last word ended with a comma,
        # and we're mostly through this line
        if current_line.endswith(',') and len(current_line) > comma_split_threshold:
            lines.append(word)
            continue

        line_with_word = f'{current_line} {word}'

        # full stops are not taken into account when checking the length
        if len(line_with_word.replace(".", "")) >= max_line_length:
            lines.append(word)
        else:
            lines[-1] = line_with_word

    return '\n'.join(lines) + terminator


 def optional_int(string):
    """Convert a command-line value to an int, mapping the text "None" to None."""
    if string == "None":
        return None
    return int(string)


 def main():
    """Command-line entry point: split the text lines of a subtitle file.

    Parses the command-line arguments, reads the subtitle file and passes the
    line that directly follows every timestamp line (a line containing "-->")
    through split_text_into_multiline. All other lines are copied unchanged.
    The result is written to --output_file, or back over the input file when
    no output file is given.

    Passing "None" for --max_line_length leaves every line unsplit (the comma
    threshold is a percentage of it, so there is nothing to measure against).
    Passing "None" for --comma_split_percent disables only the comma rule.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("subtitle_file", type=str, help=".srt file to split")
    parser.add_argument("--output_file", "-o", type=str, default="", help="output file, if empty will do in-place")
    parser.add_argument("--max_line_length", type=optional_int, default=42, help="max amount of characters for a line in the subtitle files")
    parser.add_argument("--comma_split_percent", type=optional_int, default=75, help="split line on a comma is within this percentage value")

    args = parser.parse_args()

    input_file = args.subtitle_file
    # an empty --output_file means "overwrite the input file"
    output_file = args.output_file or input_file

    max_line_length = args.max_line_length
    comma_split_percent = args.comma_split_percent

    # get the actual integer threshold value
    if max_line_length is None or comma_split_percent is None:
        # no line length ever exceeds infinity, so the comma rule never fires
        split_threshold = float("inf")
    else:
        split_threshold = int(float(max_line_length) * float(comma_split_percent) * 0.01)

    # assumes the subtitle file is UTF-8 encoded; being explicit avoids
    # depending on the platform's default encoding
    with open(input_file, "r", encoding="utf-8") as fp:
        in_lines = fp.readlines()

    out_lines = []
    split_next_line = False

    for line in in_lines:

        if split_next_line:
            # without a maximum length there is nothing to split against
            if max_line_length is not None:
                line = split_text_into_multiline(line, max_line_length, split_threshold)
            split_next_line = False

        out_lines.append(line)

        # the line after a timestamp line is the subtitle text to split
        if "-->" in line:
            split_next_line = True

    # the input has been read completely, so writing in place is safe here
    with open(output_file, "w", encoding="utf-8") as fp:
        fp.writelines(out_lines)

 # Run the command-line interface only when executed as a script, not on import.
 if __name__ == "__main__":
    main()
	import sys
	import argparse

	"""
	Example use case:

	python split_subtitle_file_into_multiline.py SUBTITLE_FILE_PATH.srt -o OUTPUT_FILE_PATH.srt --max_line_length=42 --comma_split_percent=75

	If you don't specify an output path, the content of the input file is replaced in place.

	Running this multiple times on the same file is not recommended: the script currently does not support reprocessing and might introduce strange line splits.

	"""


	def split_text_into_multiline(segment_text, max_line_length, comma_split_threshold):
	    """Break one subtitle text line into several shorter lines.

	    The text is split on single spaces and re-assembled word by word. A new
	    line is started when either

	    * the current line ends with a comma and is longer than
	      ``comma_split_threshold`` characters, or
	    * appending the next word would make the line (ignoring "." characters)
	      reach ``max_line_length`` characters, i.e. lines are kept strictly
	      shorter than ``max_line_length``.

	    A trailing line terminator ("\\n" / "\\r\\n") is removed before measuring
	    and re-attached to the result, so it never counts as a visible character.

	    Args:
	        segment_text: the subtitle text line, with or without its terminator.
	        max_line_length: length at which a line is considered too long.
	        comma_split_threshold: minimum length a comma-terminated line must
	            exceed before the comma forces a line break.

	    Returns:
	        The re-assembled text, lines joined with "\\n", ending with the same
	        terminator that ``segment_text`` ended with.
	    """
	    # keep the terminator aside: it is not a visible character and must not
	    # make the last word look longer than it is
	    text = segment_text.rstrip('\r\n')
	    terminator = segment_text[len(text):]

	    words = text.split(' ')
	    lines = [words[0]]

	    for word in words[1:]:
	        current_line = lines[-1]

	        # start a new line if the last word ended with a comma,
	        # and we're mostly through this line
	        if current_line.endswith(',') and len(current_line) > comma_split_threshold:
	            lines.append(word)
	            continue

	        line_with_word = f'{current_line} {word}'

	        # full stops are not taken into account when checking the length
	        if len(line_with_word.replace(".", "")) >= max_line_length:
	            lines.append(word)
	        else:
	            lines[-1] = line_with_word

	    return '\n'.join(lines) + terminator


	def optional_int(string):
	    """Convert a command-line value to an int, mapping the text "None" to None."""
	    return None if string == "None" else int(string)


	def main():
	    """Command-line entry point: split the text lines of a subtitle file.

	    Parses the command-line arguments, reads the subtitle file and passes the
	    line that directly follows every timestamp line (a line containing "-->")
	    through split_text_into_multiline. All other lines are copied unchanged.
	    The result is written to --output_file, or back over the input file when
	    no output file is given.

	    Passing "None" for --max_line_length leaves every line unsplit (the comma
	    threshold is a percentage of it, so there is nothing to measure against).
	    Passing "None" for --comma_split_percent disables only the comma rule.
	    """
	    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
	    parser.add_argument("subtitle_file", type=str, help=".srt file to split")
	    parser.add_argument("--output_file", "-o", type=str, default="", help="output file, if empty will do in-place")
	    parser.add_argument("--max_line_length", type=optional_int, default=42, help="max amount of characters for a line in the subtitle files")
	    parser.add_argument("--comma_split_percent", type=optional_int, default=75, help="split line on a comma is within this percentage value")

	    args = parser.parse_args()

	    input_file = args.subtitle_file
	    # an empty --output_file means "overwrite the input file"
	    output_file = args.output_file or input_file

	    max_line_length = args.max_line_length
	    comma_split_percent = args.comma_split_percent

	    # get the actual integer threshold value
	    if max_line_length is None or comma_split_percent is None:
	        # no line length ever exceeds infinity, so the comma rule never fires
	        split_threshold = float("inf")
	    else:
	        split_threshold = int(float(max_line_length) * float(comma_split_percent) * 0.01)

	    # assumes the subtitle file is UTF-8 encoded; being explicit avoids
	    # depending on the platform's default encoding
	    with open(input_file, "r", encoding="utf-8") as fp:
	        in_lines = fp.readlines()

	    out_lines = []
	    split_next_line = False

	    for line in in_lines:

	        if split_next_line:
	            # without a maximum length there is nothing to split against
	            if max_line_length is not None:
	                line = split_text_into_multiline(line, max_line_length, split_threshold)
	            split_next_line = False

	        out_lines.append(line)

	        # the line after a timestamp line is the subtitle text to split
	        if "-->" in line:
	            split_next_line = True

	    # the input has been read completely, so writing in place is safe here
	    with open(output_file, "w", encoding="utf-8") as fp:
	        fp.writelines(out_lines)

	# Run the command-line interface only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()