Skip to content

Instantly share code, notes, and snippets.

@deepakg
Created July 7, 2017 20:21
Show Gist options
  • Save deepakg/8ddf7a3bd29190b8348a709dae341b0f to your computer and use it in GitHub Desktop.
Save deepakg/8ddf7a3bd29190b8348a709dae341b0f to your computer and use it in GitHub Desktop.
from __future__ import print_function
import xml.etree.ElementTree as ET
import string
import codecs
# Convert the TTML subtitle files abstract-s1e1.ttml .. abstract-s1e8.ttml
# into a single tab-separated file, abstract.tsv, with one row per episode:
# the episode number and the concatenated text of all its subtitle spans.

# Prefix map so the XPath below can address elements in the TTML namespace.
ns = {'ttml': 'http://www.w3.org/ns/ttml'}

# codecs.open yields a UTF-8 text writer on both Python 2 and Python 3; the
# with-block guarantees the file is closed even if parsing an episode fails.
with codecs.open('abstract.tsv', 'w', "utf-8") as f:
    f.write("episode\tscript\n")
    for episode in range(8):
        # range() is zero-based, the episode files are numbered from 1.
        tree = ET.parse('abstract-s1e' + str(episode + 1) + '.ttml')
        root = tree.getroot()
        spans = root.findall('./ttml:body/ttml:div/ttml:p/ttml:span', ns)
        # elem.text is None when a span has no leading text (e.g. it is empty
        # or starts with a child element), so fall back to "" before
        # stripping. The str.strip() method is used instead of the
        # string.strip() function, which no longer exists in Python 3.
        pieces = [(elem.text or "").strip() for elem in spans]
        # Joining with single spaces and trimming the ends gives the same
        # text as appending "<piece> " repeatedly and stripping the result.
        script = " ".join(pieces).strip()
        f.write(str(episode + 1) + "\t" + script + "\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment