Created
April 30, 2012 23:52
-
-
Save mbarkhau/2563730 to your computer and use it in GitHub Desktop.
Convert Google's audio transcription XML to SBV
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding: utf-8 | |
import sys | |
import os.path | |
import HTMLParser | |
from xml.sax.saxutils import unescape | |
from xml.dom import pulldom | |
MINS = 60  # seconds per minute (and minutes per hour)


def fmt_time(t):
    """Format a time offset in seconds as an ``H:MM:SS.mmm`` timestamp.

    Args:
        t: Non-negative offset in seconds (int or float).

    Returns:
        A string such as ``"0:01:05.250"``: unpadded hours, two-digit
        minutes, two-digit seconds and three-digit milliseconds.
    """
    # Round to whole milliseconds first and then work with integers, so a
    # value like 59.9996 carries into the minutes field ("0:01:00.000")
    # instead of being printed as 60.000 seconds.
    total_ms = int(round(t * 1000))
    total_s, ms = divmod(total_ms, 1000)
    total_m, s = divmod(total_s, MINS)
    h, m = divmod(total_m, MINS)
    # Seconds are zero-padded to two digits; the previous "%02.3f" format
    # produced "5.000" rather than "05.000" for values below ten.
    return "%d:%02d:%02d.%03d" % (h, m, s, ms)
def read_elements(filename):
    """Yield ``(start_second, end_second, text)`` for every ``<text>`` element.

    The document is streamed with ``xml.dom.pulldom``. For each element whose
    tag name is ``text`` the ``start`` and ``dur`` attributes are read as
    floats, and the character data found inside the element is concatenated.

    Args:
        filename: Path of the XML file (passed straight to ``pulldom.parse``).

    Yields:
        Tuples ``(start, start + dur, text)``, one per ``<text>`` element, in
        document order.

    Raises:
        ValueError: If a ``start`` or ``dur`` attribute is missing or is not
            a number.
    """
    in_text = False
    start = end = 0.0
    chunks = []
    for event, node in pulldom.parse(filename):
        if event == pulldom.START_ELEMENT:
            if node.tagName == 'text':
                in_text = True
                chunks = []
                start = float(node.getAttribute('start'))
                end = start + float(node.getAttribute('dur'))
        elif event == pulldom.CHARACTERS:
            # Character data outside a <text> element (typically whitespace
            # between tags) is not caption text and is skipped.
            if in_text:
                chunks.append(node.data)
        elif event == pulldom.END_ELEMENT:
            # Only the closing </text> tag completes a record; the closing
            # tag of the root element must not repeat the last one.
            if in_text and node.tagName == 'text':
                in_text = False
                yield start, end, "".join(chunks)
def convert_times(elements):
    """Lazily replace the numeric start/end of each record with timestamps.

    Args:
        elements: Iterable of ``(start, end, text)`` triples.

    Yields:
        ``(fmt_time(start), fmt_time(end), text)`` for each input triple; the
        text is passed through unchanged.
    """
    for record in elements:
        begin, finish, caption = record
        yield (fmt_time(begin), fmt_time(finish), caption)
def write_elements(elements, outfile):
    """Write caption records to *outfile* as UTF-8 text.

    Each record is written as ``start,end``, a newline, the caption text with
    HTML entities decoded, and a blank line.

    Args:
        elements: Iterable of ``(start, end, text)`` triples whose start and
            end are already formatted as strings.
        outfile: Path of the file to create or overwrite.
    """
    import io

    # Pick whichever entity decoder this interpreter provides.
    try:
        from html import unescape as unescape_entities  # Python 3.4+
    except ImportError:
        import HTMLParser  # Python 2
        unescape_entities = HTMLParser.HTMLParser().unescape

    # An explicit encoding keeps non-ASCII captions from failing (or being
    # written in a platform-dependent codec) when the file is saved.
    with io.open(outfile, 'w', encoding='utf-8') as f:
        for s, e, txt in elements:
            # The unicode template makes the record text on Python 2 as well,
            # which io.open's text mode requires.
            f.write(u"%s,%s\n%s\n\n" % (s, e, unescape_entities(txt)))
# Local transcript file that the command-line entry point substitutes when it
# is given a URL argument.
TEST_FILE = "testfile_transscript.xml"


def read_url(url, outfile=None):
    """Download *url* and save the response body to a file.

    Args:
        url: Address to fetch with ``requests.get``.
        outfile: Destination path. When omitted (or empty), a new temporary
            file is created; the caller is responsible for deleting it.

    Returns:
        The path the body was written to.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    import tempfile

    import requests

    if not outfile:
        # delete=False keeps the file on disk after this handle is closed, so
        # the returned name is a real path that can be reopened later.
        with tempfile.NamedTemporaryFile(suffix=".xml", delete=False) as temp:
            outfile = temp.name
    response = requests.get(url)
    # Fail loudly rather than saving an error page as if it were a transcript.
    response.raise_for_status()
    # Store the raw bytes so whatever reads the file sees the server's
    # original encoding instead of a re-encoded copy of the decoded text.
    with open(outfile, 'wb') as f:
        f.write(response.content)
    return outfile
def main(argv):
    """Run the converter from a command-line argument list.

    Args:
        argv: Argument list in ``sys.argv`` layout: program name, input XML
            file (or URL), and optionally the output path, which defaults to
            ``outfile.sbv``.

    Returns:
        Process exit status: 0 on success, 1 if the input file does not
        exist, 2 if no input argument was given.
    """
    if len(argv) < 2:
        sys.stderr.write(
            "Usage: youtube_trascribe.py filename.xml [outfile.sbv]\n")
        return 2

    arg = argv[1]
    if arg.startswith("http"):
        # NOTE(review): downloading is disabled here (the read_url(arg) call
        # was commented out), so URL arguments are mapped to the local test
        # transcript instead of being fetched.
        filename = TEST_FILE
    else:
        filename = arg
    outfile = argv[2] if len(argv) > 2 else "outfile.sbv"

    if not os.path.exists(filename):
        sys.stderr.write("No such file: " + filename + "\n")
        return 1

    # The three stages are generators, so records stream from the XML parser
    # to the output file one at a time.
    write_elements(convert_times(read_elements(filename)), outfile)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment