Created
April 6, 2018 14:28
-
-
Save TylerTemp/89b1a7cac17c6b1260f278bbb35520e2 to your computer and use it in GitHub Desktop.
youtube timedtext.xml to srt (SubRip) format converter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Usage: python tt2srt.py source.xml > output.srt | |
""" | |
import sys | |
from xml.dom.minidom import parse | |
def mseconds_to_time(mseconds):
    """Format a millisecond offset as a SubRip timestamp (``HH:MM:SS,mmm``).

    Args:
        mseconds: Offset from the start of the video, in whole milliseconds.

    Returns:
        The timestamp string. Hours are zero-padded to at least two digits,
        as the SubRip format expects (e.g. ``00:00:01,500``); larger hour
        counts simply use more digits.
    """
    seconds, millis = divmod(mseconds, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return "%02d:%02d:%02d,%03d" % (hours, minutes, seconds, millis)
def _cue_text(para):
    """Return the caption text of one ``<p>`` element.

    Only direct children are considered: ``<br>`` elements become newlines
    and text nodes are copied verbatim; any other child node is ignored.
    """
    pieces = []
    for child in para.childNodes:
        if child.nodeName == 'br':
            pieces.append("\n")
        elif child.nodeName == '#text':
            # minidom already hands back text as str; no re-encoding needed.
            pieces.append(child.data)
    return "".join(pieces)


def main():
    """Convert the timedtext XML file named in ``sys.argv[1]`` to SRT on stdout.

    Each ``<p>`` element under the first ``<body>`` becomes one cue. Its ``t``
    attribute is read as the start time and ``d`` as the duration, both in
    milliseconds.

    Raises:
        SystemExit: if no input file was given on the command line.
        IndexError: if the document has no ``<body>`` element.
        KeyError: if a ``<p>`` element lacks a ``t`` or ``d`` attribute.
    """
    if len(sys.argv) < 2:
        sys.exit("Usage: python tt2srt.py source.xml > output.srt")

    dom = parse(sys.argv[1])
    body = dom.getElementsByTagName("body")[0]
    paras = body.getElementsByTagName("p")

    # SubRip cue counters are 1-based, so start the enumeration at 1.
    for index, para in enumerate(paras, start=1):
        print(index)
        start_msecond = int(para.attributes['t'].value)
        end_msecond = start_msecond + int(para.attributes['d'].value)
        start_time = mseconds_to_time(start_msecond)
        end_time = mseconds_to_time(end_msecond)
        print('%s --> %s' % (start_time, end_time))
        sys.stdout.write(_cue_text(para))
        # A blank line terminates each cue.
        sys.stdout.write("\n\n")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment