Norod · July 16, 2020 15:46
diff --git a/subxml2txt.py b/subxml2txt.py
 # Read XML in OpenSubtitles format and output text intended for training machine learning models

 import sys
 from os import path
 import time
 import xml.etree.ElementTree as ET

 # Marker printed at the start and end of the output and wherever a long pause
 # between subtitles is detected.
 END_OF_TEXT = "<|endoftext|>"

 # A pause longer than this many milliseconds between consecutive timed
 # sentences triggers an extra END_OF_TEXT marker.
 GAP_THRESHOLD_MS = 10000


 def _timestamp_ms(value_time):
     """Convert a subtitle time string such as "01:02:03,450" to milliseconds.

     The hours, minutes and seconds are parsed with ``time.strptime``; the
     millisecond part is taken from the digits after the comma, because
     ``struct_time`` carries no sub-second field.
     NOTE(review): assumes the fraction is written with exactly three digits,
     as in the original script -- confirm against the input data.
     """
     time_struct = time.strptime(value_time, "%H:%M:%S,%f")
     ms = int(value_time.split(",")[1])
     # One hour is 3600 seconds (the previous code multiplied by 36000, which
     # inflated every timestamp past the first hour).
     seconds = (time_struct.tm_hour * 3600
                + time_struct.tm_min * 60
                + time_struct.tm_sec)
     return seconds * 1000 + ms


 def main(argv):
     """Print the text of every <s> element of the XML file named in argv[1].

     argv is the full argument vector (program name first). The output starts
     and ends with END_OF_TEXT; an extra END_OF_TEXT is printed before a
     sentence whose last <time> value lies more than GAP_THRESHOLD_MS after
     the previous timed sentence. Newlines inside a sentence are removed.

     Returns the process exit status: 0 on success, -1 on wrong usage,
     -2 when the input file does not exist.
     """
     if len(argv) != 2:
         print("Usage: " + str(argv[0]) + " input.xml")
         return -1

     input_xml_file = argv[1]
     if not path.exists(input_xml_file):
         print("Error: " + str(input_xml_file) + " not found")
         return -2

     root = ET.parse(input_xml_file).getroot()

     print("\n" + END_OF_TEXT + "\n")
     # None (rather than 0) marks "no timestamp seen yet", so a first subtitle
     # at 00:00:00,000 is still used as the reference for the next gap.
     previous_timestamp = None
     for sentence in root.findall('s'):
         all_time = sentence.findall('time')
         if all_time:
             # The last <time> child of the sentence is the one compared.
             timestamp = _timestamp_ms(all_time[-1].get('value'))
             if (previous_timestamp is not None
                     and timestamp - previous_timestamp > GAP_THRESHOLD_MS):
                 print(END_OF_TEXT)
             previous_timestamp = timestamp

         print(''.join(sentence.itertext()).replace("\n", ""))
     print(END_OF_TEXT)
     return 0


 if __name__ == "__main__":
     # sys.exit works even when the site module's exit() helper is unavailable.
     sys.exit(main(sys.argv))
	# Read XML in OpenSubtitles format and output text intended for training machine learning models

	import sys
	from os import path
	import time
	import xml.etree.ElementTree as ET

	# Marker printed at the start and end of the output and wherever a long pause
	# between subtitles is detected. (The backslash-escaped pipes in the previous
	# text were an extraction artifact and would have been printed literally.)
	END_OF_TEXT = "<|endoftext|>"

	# A pause longer than this many milliseconds between consecutive timed
	# sentences triggers an extra END_OF_TEXT marker.
	GAP_THRESHOLD_MS = 10000


	def _timestamp_ms(value_time):
	    """Convert a subtitle time string such as "01:02:03,450" to milliseconds.

	    The hours, minutes and seconds are parsed with ``time.strptime``; the
	    millisecond part is taken from the digits after the comma, because
	    ``struct_time`` carries no sub-second field.
	    NOTE(review): assumes the fraction is written with exactly three digits,
	    as in the original script -- confirm against the input data.
	    """
	    time_struct = time.strptime(value_time, "%H:%M:%S,%f")
	    ms = int(value_time.split(",")[1])
	    # One hour is 3600 seconds (the previous code multiplied by 36000, which
	    # inflated every timestamp past the first hour).
	    seconds = (time_struct.tm_hour * 3600
	               + time_struct.tm_min * 60
	               + time_struct.tm_sec)
	    return seconds * 1000 + ms


	def main(argv):
	    """Print the text of every <s> element of the XML file named in argv[1].

	    argv is the full argument vector (program name first). The output starts
	    and ends with END_OF_TEXT; an extra END_OF_TEXT is printed before a
	    sentence whose last <time> value lies more than GAP_THRESHOLD_MS after
	    the previous timed sentence. Newlines inside a sentence are removed.

	    Returns the process exit status: 0 on success, -1 on wrong usage,
	    -2 when the input file does not exist.
	    """
	    if len(argv) != 2:
	        print("Usage: " + str(argv[0]) + " input.xml")
	        return -1

	    input_xml_file = argv[1]
	    if not path.exists(input_xml_file):
	        print("Error: " + str(input_xml_file) + " not found")
	        return -2

	    root = ET.parse(input_xml_file).getroot()

	    print("\n" + END_OF_TEXT + "\n")
	    # None (rather than 0) marks "no timestamp seen yet", so a first subtitle
	    # at 00:00:00,000 is still used as the reference for the next gap.
	    previous_timestamp = None
	    for sentence in root.findall('s'):
	        all_time = sentence.findall('time')
	        if all_time:
	            # The last <time> child of the sentence is the one compared.
	            timestamp = _timestamp_ms(all_time[-1].get('value'))
	            if (previous_timestamp is not None
	                    and timestamp - previous_timestamp > GAP_THRESHOLD_MS):
	                print(END_OF_TEXT)
	            previous_timestamp = timestamp

	        print(''.join(sentence.itertext()).replace("\n", ""))
	    print(END_OF_TEXT)
	    return 0


	if __name__ == "__main__":
	    # sys.exit works even when the site module's exit() helper is unavailable.
	    sys.exit(main(sys.argv))