Last active
July 16, 2020 15:46
-
-
Save Norod/e109a023fa29e1f375bd9f630e7e84e1 to your computer and use it in GitHub Desktop.
Read xml in OpenSubtitles format, output text intended for training machine learning models
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read XML in OpenSubtitles format, output text intended for training machine
# learning models.
import sys
from os import path
import time
import xml.etree.ElementTree as ET

# Marker printed at the start, at the end, and between segments of the output.
END_OF_TEXT = "<|endoftext|>"

# A jump of more than this many milliseconds between two consecutive timed
# sentences starts a new segment in the output.
SEGMENT_GAP_MS = 10000


def parse_timestamp_ms(value_time):
    """Convert an ``HH:MM:SS,mmm`` timestamp string to integer milliseconds.

    Args:
        value_time: Timestamp text such as ``"01:02:03,004"``; the part after
            the comma is taken verbatim as a millisecond count.

    Returns:
        Milliseconds elapsed since ``00:00:00,000``.

    Raises:
        ValueError: If the string does not match ``%H:%M:%S,%f``.
        IndexError: If the string contains no comma.
    """
    ms = int(value_time.split(",")[1])
    # strptime validates the layout; the %f field is not exposed on
    # struct_time, which is why the fraction is read separately above.
    time_struct = time.strptime(value_time, "%H:%M:%S,%f")
    seconds = (
        time_struct.tm_hour * 3600  # one hour is 3600 seconds
        + time_struct.tm_min * 60
        + time_struct.tm_sec
    )
    return seconds * 1000 + ms


def iter_training_lines(root, gap_ms=SEGMENT_GAP_MS):
    """Yield the output lines for a parsed subtitle document.

    Every direct ``<s>`` child of *root* contributes one line holding its
    concatenated text with newlines removed. The timestamp of a sentence is
    the ``value`` attribute of its last direct ``<time>`` child; when it
    exceeds the previous timed sentence's timestamp by more than *gap_ms*,
    an ``<|endoftext|>`` line is emitted before the sentence. Sentences
    without a usable ``<time>`` child are emitted without affecting gap
    tracking.

    Args:
        root: Root ``Element`` of the subtitle XML tree.
        gap_ms: Gap threshold in milliseconds.

    Yields:
        Strings to be printed one per line, starting with a padded
        ``<|endoftext|>`` marker and ending with a plain one.
    """
    yield "\n" + END_OF_TEXT + "\n"

    # None (rather than 0) marks "no timestamp seen yet", so a genuine
    # 00:00:00,000 timestamp is still used as a reference point.
    previous_timestamp = None
    for sentence in root.findall("s"):
        time_tags = sentence.findall("time")
        value_time = time_tags[-1].get("value") if time_tags else None
        if value_time is not None:
            timestamp = parse_timestamp_ms(value_time)
            if (
                previous_timestamp is not None
                and timestamp - previous_timestamp > gap_ms
            ):
                yield END_OF_TEXT
            previous_timestamp = timestamp
        yield "".join(sentence.itertext()).replace("\n", "")

    yield END_OF_TEXT


def main(argv):
    """Run the converter for the command line *argv*.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name and
            ``argv[1]`` the input XML path.

    Returns:
        Process exit status: ``0`` on success, ``-1`` on wrong usage,
        ``-2`` when the input file does not exist.
    """
    if len(argv) != 2:
        print("Usage: " + str(argv[0]) + " input.xml")
        return -1

    input_xml_file = argv[1]
    if not path.exists(input_xml_file):
        print("Error: " + str(input_xml_file) + " not found")
        return -2

    root = ET.parse(input_xml_file).getroot()
    for line in iter_training_lines(root):
        print(line)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment