Skip to content

Instantly share code, notes, and snippets.

@rBrenick
Last active September 12, 2024 23:32
Show Gist options
  • Save rBrenick/fcb8d07ecaa55856ecd9745ecfd29341 to your computer and use it in GitHub Desktop.
Save rBrenick/fcb8d07ecaa55856ecd9745ecfd29341 to your computer and use it in GitHub Desktop.
Take an .srt file and split the text line if the character count goes above a certain threshold.
import sys
import argparse
"""
Example use case:
python split_srt_lines.py SUBTITLE_FILE_PATH.srt -o OUTPUT_FILE_PATH.srt --max_line_length=42 --comma_split_percent=75
If you don't specify an output path, the input file is overwritten in place.
Running this more than once on the same file is not recommended: the script does not support reprocessing and may introduce strange line splits.
"""
def split_text_into_multiline(segment_text, max_line_length, comma_split_threshold):
    """Greedily re-wrap one subtitle text line into several shorter lines.

    The text is split on single spaces and words are appended to the current
    output line one at a time. A new output line is started when either:

    * the current line ends with a comma and is already longer than
      ``comma_split_threshold`` characters, or
    * appending the next word would make the line (ignoring periods and any
      trailing line terminator) reach ``max_line_length`` characters or more.

    Args:
        segment_text: The text to wrap. A trailing newline, if present, stays
            attached to the last word and is preserved in the result.
        max_line_length: Length at which a line is considered full, or
            ``None`` to disable length-based splitting.
        comma_split_threshold: Minimum length of a comma-terminated line
            before a break is forced after the comma, or ``None`` to disable
            comma-based splitting.

    Returns:
        The wrapped text, with the produced lines joined by ``"\\n"``.
    """
    words = segment_text.split(' ')
    lines = [words[0]]

    for word in words[1:]:
        current_line = lines[-1]

        # Start a new line if the last word ended with a comma,
        # and we're mostly through this line.
        if (
            comma_split_threshold is not None
            and current_line.endswith(',')
            and len(current_line) > comma_split_threshold
        ):
            lines.append(word)
            continue

        line_with_word = f'{current_line} {word}'

        # Don't take periods into account when checking length. The trailing
        # line terminator is not a visible character either, so leave it out;
        # otherwise the last line of a cue would wrap one character early.
        measured = line_with_word.rstrip('\r\n').replace(".", "")

        if max_line_length is not None and len(measured) >= max_line_length:
            lines.append(word)
        else:
            lines[-1] = line_with_word

    return '\n'.join(lines)
def optional_int(string):
    """argparse ``type`` converter: the literal text "None" maps to ``None``.

    Any other value is handed to ``int()``, which raises ``ValueError`` for
    text that is not a valid integer.
    """
    if string == "None":
        return None
    return int(string)
def main():
    """Command-line entry point: re-wrap the text lines of an .srt file.

    Reads the subtitle file, and for every line that directly follows a
    timestamp line (one containing ``-->``) runs
    ``split_text_into_multiline`` on it. All other lines are copied through
    unchanged. Only the first text line after each timestamp is processed.

    The result is written to ``--output_file``, or back over the input file
    when no output file is given. Passing ``None`` for ``--max_line_length``
    disables splitting altogether; passing ``None`` for
    ``--comma_split_percent`` disables only the comma-based splitting.

    Files are read and written as UTF-8 so that character counts and
    non-ASCII text do not depend on the platform's default encoding.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("subtitle_file", type=str, help=".srt file to split")
    parser.add_argument("--output_file", "-o", type=str, default="", help="output file, if empty will do in-place")
    parser.add_argument("--max_line_length", type=optional_int, default=42, help="max amount of characters for a line in the subtitle files")
    parser.add_argument("--comma_split_percent", type=optional_int, default=75, help="split line on a comma is within this percentage value")
    args = parser.parse_args()

    input_file = args.subtitle_file
    # An empty output path means "process in place".
    output_file = args.output_file or input_file

    max_line_length = args.max_line_length
    comma_split_percent = args.comma_split_percent

    # Get the actual integer threshold value.
    if max_line_length is None or comma_split_percent is None:
        # No line can be longer than sys.maxsize, so the comma rule never fires.
        split_threshold = sys.maxsize
    else:
        split_threshold = int(float(max_line_length) * float(comma_split_percent) * 0.01)

    with open(input_file, "r", encoding="utf-8") as fp:
        in_lines = fp.readlines()

    out_lines = []
    split_next_line = False
    for line in in_lines:
        # With no max length there is nothing to split; copy the line as-is.
        if split_next_line and max_line_length is not None:
            line = split_text_into_multiline(line, max_line_length, split_threshold)
        out_lines.append(line)

        # The line following a timestamp line is the subtitle text.
        split_next_line = "-->" in line

    with open(output_file, "w", encoding="utf-8") as fp:
        fp.writelines(out_lines)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
@rBrenick
Copy link
Author

see this whisper discussion for context

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment