Last active
September 12, 2024 23:32
-
-
Save rBrenick/fcb8d07ecaa55856ecd9745ecfd29341 to your computer and use it in GitHub Desktop.
Take an .srt file and split the text line if the character count goes above a certain threshold.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import argparse | |
""" | |
Example use case:
python split_srt_lines.py SUBTITLE_FILE_PATH.srt -o OUTPUT_FILE_PATH.srt --max_line_length=42 --comma_split_percent=75
If you don't specify an output path, the script overwrites the input file in place.
Running this script multiple times on the same file is not recommended: it does not currently support reprocessing and might introduce strange line splits.
""" | |
def split_text_into_multiline(segment_text, max_line_length, comma_split_threshold):
    """Wrap one line of subtitle text onto several lines.

    The text is split on single spaces and the words are appended greedily
    to the current line. A new line is started before a word when either:

    * the current line ends with a comma and is already longer than
      ``comma_split_threshold`` characters, or
    * appending the word would bring the line to ``max_line_length``
      characters or more, not counting ``.`` characters or a trailing
      line terminator.

    Args:
        segment_text: The text to wrap. It may end with a line terminator,
            which is kept in the output but not counted as a character.
        max_line_length: Length at which a line is broken, or ``None`` to
            disable length-based breaks.
        comma_split_threshold: Minimum length a comma-terminated line must
            exceed before it is broken early, or ``None`` to disable
            comma-based breaks.

    Returns:
        The words of ``segment_text`` with a newline character between the
        resulting lines; words on the same line stay separated by a space.
    """
    words = segment_text.split(' ')
    lines = [words[0]]

    for word in words[1:]:
        current_line = lines[-1]

        # Start a new line if the last word ended with a comma and we are
        # already mostly through this line.
        if (
            comma_split_threshold is not None
            and current_line.endswith(',')
            and len(current_line) > comma_split_threshold
        ):
            lines.append(word)
            continue

        line_with_word = f'{current_line} {word}'

        if max_line_length is not None:
            # Periods are not counted, and neither is the line terminator
            # that readlines() leaves on the final word: it is not a visible
            # character and would otherwise shrink the last line's budget.
            measured = line_with_word.rstrip('\r\n').replace('.', '')
            if len(measured) >= max_line_length:
                lines.append(word)
                continue

        lines[-1] = line_with_word

    return '\n'.join(lines)
def optional_int(string):
    """Parse a command-line value that is either an integer or "None".

    Returns ``None`` when ``string`` is exactly ``"None"``; otherwise returns
    ``int(string)``, which raises ``ValueError`` for non-integer text.
    """
    if string != "None":
        return int(string)
    return None
def main():
    """Command-line entry point: wrap long text lines of an .srt file.

    Reads the input file, passes the line that directly follows each line
    containing "-->" through split_text_into_multiline, and writes the result
    to the output file (or back to the input file when no output file is
    given). All other lines are copied unchanged.

    Passing "None" for --max_line_length disables splitting entirely (the
    comma threshold is a percentage of that length, so it has no meaning
    without it). Passing "None" for --comma_split_percent disables only the
    comma-based early split.

    NOTE: only the first text line after each timestamp line is processed;
    further text lines of the same cue are copied as-is.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("subtitle_file", type=str, help=".srt file to split")
    parser.add_argument("--output_file", "-o", type=str, default="", help="output file, if empty will do in-place")
    parser.add_argument("--max_line_length", type=optional_int, default=42, help="max amount of characters for a line in the subtitle files")
    parser.add_argument("--comma_split_percent", type=optional_int, default=75, help="split line on a comma is within this percentage value")
    args = parser.parse_args()

    input_file = args.subtitle_file
    # An empty --output_file means "overwrite the input file".
    output_file = args.output_file or input_file

    max_line_length = args.max_line_length
    comma_split_percent = args.comma_split_percent

    # Turn the percentage into an absolute character count.
    if max_line_length is None:
        split_threshold = None  # unused: nothing is split without a max length
    elif comma_split_percent is None:
        # No line length is ever greater than infinity, so the comma rule
        # never fires.
        split_threshold = float("inf")
    else:
        split_threshold = int(float(max_line_length) * float(comma_split_percent) * 0.01)

    # Explicit encoding so the result does not depend on the platform locale.
    with open(input_file, "r", encoding="utf-8") as fp:
        in_lines = fp.readlines()

    out_lines = []
    split_next_line = False
    for line in in_lines:
        if split_next_line:
            if max_line_length is not None:
                line = split_text_into_multiline(line, max_line_length, split_threshold)
            split_next_line = False

        out_lines.append(line)

        # A line containing "-->" marks the following line as subtitle text.
        if "-->" in line:
            split_next_line = True

    # The input has been fully read above, so in-place overwriting is safe.
    with open(output_file, "w", encoding="utf-8") as fp:
        fp.writelines(out_lines)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
see this whisper discussion for context