iamevn · January 24, 2023 06:49
diff --git a/amazon-netflix_typeset_split.py b/amazon-netflix_typeset_split.py
 #!/usr/bin/env python3
 # split out AMAZON/NETFLIX STYLE TYPESETTING
 #by iamevn
 import sys, re

 def find_nth(string, substring, n, start=0):
    """Find the nth occurrence of substring in string, searching from start.

    Occurrences may overlap: each search resumes one character after the
    previous hit.

    string    -- text to search
    substring -- text to look for (located with str.find)
    n         -- which occurrence to return, counting from 1
    start     -- index at which the search begins, counting from 0

    Returns the index of the nth occurrence, or -1 when there are fewer than
    n occurrences or when n is less than 1.
    """
    if n < 1:
        # There is no "zeroth" occurrence; report not-found instead of None.
        return -1
    found = string.find(substring, start)
    # Loop rather than recurse so a large n cannot exhaust the call stack.
    remaining = n
    while remaining > 1 and found != -1:
        found = string.find(substring, found + 1)
        remaining -= 1
    return found

 # Match one ASS event line, capturing every field in a named group.  The line
 # break is optional so the last line of a file without a trailing newline is
 # still recognised; when present it is kept as part of Text.
 line_pattern = re.compile(r'(?P<Format>[^:]*): ?(?P<Layer>\d*), ?(?P<Start>[^,]*), ?(?P<End>[^,]*), ?(?P<Style>[^,]*), ?(?P<Name>[^,]*), ?(?P<MarginL>[^,]*), ?(?P<MarginR>[^,]*), ?(?P<MarginV>[^,]*), ?(?P<Effect>[^,]*),(?P<Text>.*\n?)')
 def line2dict(line):
    """Pull the fields of an ASS event out into a dictionary.

    line -- one line of text from the file

    Returns a dictionary keyed by the group names of line_pattern (Format,
    Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text),
    or None if line is not an ASS event.
    """
    # print(line) # <- fun UnicodeEncodeErrors!
    match = line_pattern.match(line)
    if match is None:
        return None
    return match.groupdict()

 def dict2line(d):
    """Render an event dictionary (as produced by line2dict) back into one ASS line."""
    template = "{Format}: {Layer},{Start},{End},{Style},{Name},{MarginL},{MarginR},{MarginV},{Effect},{Text}"
    return template.format_map(d)
        
 def is_sign(text):
    """Decide whether text looks like a sign (typesetting) rather than dialogue.

    Only visible characters are considered: anything inside a {...} block is
    ignored, and the character right after a backslash is never tested for
    case.  A line break at the end of text is ignored, so an event's Text
    field can be passed exactly as it was read from the file.

    specifically:
    False if any visible character is lowercase,
    False if there is no alphabetic text and the visible text is empty or
    ends in one of . , ! ?
    True otherwise.
    """
    in_comment = False
    escaped = False
    has_alphabetic_text = False
    last_char = ''
    # Strip the terminating line break first; otherwise it would always be the
    # "last character" and the punctuation rule below could never apply.
    for c in text.rstrip('\r\n'):
        if escaped:
            escaped = False
            # Characters inside {...} are invisible and must not count as
            # the end of the text.
            if not in_comment:
                last_char = c
        elif c == '\\':
            escaped = True
            if not in_comment:
                last_char = c
        elif in_comment:
            if c == '}':
                in_comment = False
        elif c == '{':
            in_comment = True
        elif c.islower():
            return False
        else:
            if c.isalpha():
                has_alphabetic_text = True
            last_char = c
    if not has_alphabetic_text and (not last_char or last_char in '.,!?'):
        return False
    return True

 def missing_newline(text):
    """Tell whether text still needs a line break: False if it already ends in one, True if not."""
    if text.endswith('\n'):
        return False
    return True

 def main(inpath, outpath, new_style='Type'):
    r"""Split Amazon/Netflix style "typesetting" out of an ASS subtitle file.

    Everything up to and including the [Events] header is copied unchanged.
    After that, the text of every Dialogue event is cut at each \N and every
    piece is classified with is_sign:

    * every piece is a sign -> the event is restyled to new_style
    * no piece is a sign    -> the line is written back exactly as read
    * a mixture             -> two events are written: first the sign pieces
                               (joined with \N) in new_style, then the
                               remaining pieces in the original style

    Lines that are not Dialogue events, or that cannot be parsed as an event
    at all (blank lines, section headers, the Format line), pass through
    untouched.

    inpath    -- path of the UTF-8 .ass file to read
    outpath   -- path of the UTF-8 .ass file to write (overwritten)
    new_style -- style name given to sign events
    """
    lines = []
    with open(inpath, encoding='utf-8') as infile:
        # Copy the header.  Iterating to end of file (instead of looping on
        # readline until the marker shows up) means a script without an
        # [Events] section is copied as-is rather than looping forever.
        for line in infile:
            lines.append(line)
            if line.strip() == '[Events]':
                break

        for line in infile:
            d = line2dict(line)
            # line2dict returns None for anything that is not an event.
            if d is None or d.get('Format') != 'Dialogue':
                lines.append(line)
                continue

            upper_segments = []
            lower_segments = []
            # Text without any \N yields a single segment, so whole-line and
            # per-segment classification share one code path.
            for segment in d['Text'].split('\\N'):
                if is_sign(segment):
                    upper_segments.append(segment)
                else:
                    lower_segments.append(segment)

            if not upper_segments:
                # Nothing to restyle: keep the line exactly as it was.
                lines.append(line)
            elif not lower_segments:
                # The whole event is a sign; reuse it with the new style.
                d['Style'] = new_style
                lines.append(dict2line(d))
            else:
                # Mixed event: emit the sign part first, then the dialogue.
                upper_d = d.copy()
                upper_d['Style'] = new_style
                upper_d['Text'] = '\\N'.join(upper_segments)
                lines.append(dict2line(upper_d))
                d['Text'] = '\\N'.join(lower_segments)
                lines.append(dict2line(d))

    with open(outpath, 'w', encoding='utf-8') as outfile:
        for line in lines:
            outfile.write(line)
            # Splitting can leave an event without its line break (only the
            # final segment of the original text carried it).
            if missing_newline(line):
                outfile.write('\n')

 if __name__ == '__main__':
    # Expect exactly two positional arguments: input path and output path.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        sys.exit('Usage: {} infile.ass outfile.ass'.format(sys.argv[0]))

    source_path, target_path = cli_args
    main(source_path, target_path, new_style='Type')
	#!/usr/bin/env python3
	# split out AMAZON/NETFLIX STYLE TYPESETTING
	#by iamevn
	import sys, re

	def find_nth(string, substring, n, start=0):
	    """Find the nth occurrence of substring in string, searching from start.

	    Occurrences may overlap: each search resumes one character after the
	    previous hit.

	    string    -- text to search
	    substring -- text to look for (located with str.find)
	    n         -- which occurrence to return, counting from 1
	    start     -- index at which the search begins, counting from 0

	    Returns the index of the nth occurrence, or -1 when there are fewer than
	    n occurrences or when n is less than 1.
	    """
	    if n < 1:
	        # There is no "zeroth" occurrence; report not-found instead of None.
	        return -1
	    found = string.find(substring, start)
	    # Loop rather than recurse so a large n cannot exhaust the call stack.
	    remaining = n
	    while remaining > 1 and found != -1:
	        found = string.find(substring, found + 1)
	        remaining -= 1
	    return found

	# Match one ASS event line, capturing every field in a named group.  Each
	# field needs its * quantifier so it can span more than one character.  The
	# line break is optional so the last line of a file without a trailing
	# newline is still recognised; when present it is kept as part of Text.
	line_pattern = re.compile(r'(?P<Format>[^:]*): ?(?P<Layer>\d*), ?(?P<Start>[^,]*), ?(?P<End>[^,]*), ?(?P<Style>[^,]*), ?(?P<Name>[^,]*), ?(?P<MarginL>[^,]*), ?(?P<MarginR>[^,]*), ?(?P<MarginV>[^,]*), ?(?P<Effect>[^,]*),(?P<Text>.*\n?)')
	def line2dict(line):
	    """Pull the fields of an ASS event out into a dictionary.

	    line -- one line of text from the file

	    Returns a dictionary keyed by the group names of line_pattern (Format,
	    Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text),
	    or None if line is not an ASS event.
	    """
	    # print(line) # <- fun UnicodeEncodeErrors!
	    match = line_pattern.match(line)
	    if match is None:
	        return None
	    return match.groupdict()

	def dict2line(d):
	    """Render an event dictionary (as produced by line2dict) back into one ASS line.

	    d -- mapping with the keys Format, Layer, Start, End, Style, Name,
	         MarginL, MarginR, MarginV, Effect and Text

	    Returns the event as a single string; a line break is present only if
	    d['Text'] carries one.
	    """
	    return "{Format}: {Layer},{Start},{End},{Style},{Name},{MarginL},{MarginR},{MarginV},{Effect},{Text}".format(**d)

	def is_sign(text):
	    """Decide whether text looks like a sign (typesetting) rather than dialogue.

	    Only visible characters are considered: anything inside a {...} block is
	    ignored, and the character right after a backslash is never tested for
	    case.  A line break at the end of text is ignored, so an event's Text
	    field can be passed exactly as it was read from the file.

	    specifically:
	    False if any visible character is lowercase,
	    False if there is no alphabetic text and the visible text is empty or
	    ends in one of . , ! ?
	    True otherwise.
	    """
	    in_comment = False
	    escaped = False
	    has_alphabetic_text = False
	    last_char = ''
	    # Strip the terminating line break first; otherwise it would always be the
	    # "last character" and the punctuation rule below could never apply.
	    for c in text.rstrip('\r\n'):
	        if escaped:
	            escaped = False
	            # Characters inside {...} are invisible and must not count as
	            # the end of the text.
	            if not in_comment:
	                last_char = c
	        elif c == '\\':
	            escaped = True
	            if not in_comment:
	                last_char = c
	        elif in_comment:
	            if c == '}':
	                in_comment = False
	        elif c == '{':
	            in_comment = True
	        elif c.islower():
	            return False
	        else:
	            if c.isalpha():
	                has_alphabetic_text = True
	            last_char = c
	    if not has_alphabetic_text and (not last_char or last_char in '.,!?'):
	        return False
	    return True

	def missing_newline(text):
	    """True if text doesn't have a newline at the end. Otherwise False."""
	    return not text.endswith('\n')

	def main(inpath, outpath, new_style='Type'):
	    r"""Split Amazon/Netflix style "typesetting" out of an ASS subtitle file.

	    Everything up to and including the [Events] header is copied unchanged.
	    After that, the text of every Dialogue event is cut at each \N and every
	    piece is classified with is_sign:

	    * every piece is a sign -> the event is restyled to new_style
	    * no piece is a sign    -> the line is written back exactly as read
	    * a mixture             -> two events are written: first the sign pieces
	                               (joined with \N) in new_style, then the
	                               remaining pieces in the original style

	    Lines that are not Dialogue events, or that cannot be parsed as an event
	    at all (blank lines, section headers, the Format line), pass through
	    untouched.

	    inpath    -- path of the UTF-8 .ass file to read
	    outpath   -- path of the UTF-8 .ass file to write (overwritten)
	    new_style -- style name given to sign events
	    """
	    lines = []
	    with open(inpath, encoding='utf-8') as infile:
	        # Copy the header.  Iterating to end of file (instead of looping on
	        # readline until the marker shows up) means a script without an
	        # [Events] section is copied as-is rather than looping forever.
	        for line in infile:
	            lines.append(line)
	            if line.strip() == '[Events]':
	                break

	        for line in infile:
	            d = line2dict(line)
	            # line2dict returns None for anything that is not an event.
	            if d is None or d.get('Format') != 'Dialogue':
	                lines.append(line)
	                continue

	            upper_segments = []
	            lower_segments = []
	            # Text without any \N yields a single segment, so whole-line and
	            # per-segment classification share one code path.
	            for segment in d['Text'].split('\\N'):
	                if is_sign(segment):
	                    upper_segments.append(segment)
	                else:
	                    lower_segments.append(segment)

	            if not upper_segments:
	                # Nothing to restyle: keep the line exactly as it was.
	                lines.append(line)
	            elif not lower_segments:
	                # The whole event is a sign; reuse it with the new style.
	                d['Style'] = new_style
	                lines.append(dict2line(d))
	            else:
	                # Mixed event: emit the sign part first, then the dialogue.
	                upper_d = d.copy()
	                upper_d['Style'] = new_style
	                upper_d['Text'] = '\\N'.join(upper_segments)
	                lines.append(dict2line(upper_d))
	                d['Text'] = '\\N'.join(lower_segments)
	                lines.append(dict2line(d))

	    with open(outpath, 'w', encoding='utf-8') as outfile:
	        for line in lines:
	            outfile.write(line)
	            # Splitting can leave an event without its line break (only the
	            # final segment of the original text carried it).
	            if missing_newline(line):
	                outfile.write('\n')

	if __name__ == '__main__':
	    # Expect exactly two positional arguments: input path and output path.
	    if len(sys.argv) != 3:
	        sys.exit('Usage: {} infile.ass outfile.ass'.format(sys.argv[0]))

	    main(sys.argv[1], sys.argv[2], new_style='Type')