Created September 6, 2011 at 05:28
-
-
Save apit/1196670 to your computer and use it in GitHub Desktop.
TED video/subtitle/comments downloader
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/python -u | |
""" | |
@author apit - http://fizdoonk.multiply.com/ | |
@require python-lxml, python-simplejson | |
""" | |
import os, sys, re, codecs | |
import simplejson | |
import urllib2 | |
import lxml.html | |
from lxml.html.clean import Cleaner | |
from urllib import urlopen | |
from subprocess import call | |
DOWNLOAD_VIDEO = True  # when True, also fetch the talk's .mp4 via wget
OUTPUT_DIR = './out/'  # all output (subs, HTML, video, cache) goes here
VIDEO_LOWRES = True  # prefer the "-light" low-resolution .mp4 variant
BASE_URL = "http://www.ted.com"
SUBTITLE_URL = BASE_URL + "/talks/subtitles/id/%s/lang/%s"  # JSON captions
SUBTITLE_HTML = BASE_URL + "/talks/subtitles/id/%s/lang/%s/format/html"  # HTML transcript
COMMENTS_HTML = BASE_URL + "/comments/viewRPC?forum=%s&commentsLoaded=0&limit=999999&sortby=replies"
DOWNLOADABLE_LANGS = set(['ind', 'eng'])  # only fetch subtitles for these languages
# Honour an http_proxy environment variable for all urllib2 requests.
if 'http_proxy' in os.environ:
    proxy = urllib2.ProxyHandler({'http': os.environ['http_proxy']})
    opener = urllib2.build_opener(proxy)
    urllib2.install_opener(opener)
# Adapted from django.template.defaultfilters.slugify (underscores instead
# of hyphens as the separator).
def slugify(value):
    """Lower-case *value*, strip non-word characters and join words with '_'.

    E.g. "Hello, World!" -> "hello_world".
    """
    # ``unicode`` only exists on Python 2; pick the text type portably
    # (on Python 2 ``str is bytes`` is True, so ``unicode`` is chosen).
    text_type = unicode if str is bytes else str
    # Raw strings for the regexes: '\w' in a plain literal is an invalid
    # escape sequence on modern Pythons.
    value = text_type(re.sub(r'[^\w\s-]', '', value).strip().lower())
    return re.sub(r'[-\s]+', '_', value)
def save_to(filename, content):
    """Write *content* (unicode) to *filename* as UTF-8 text."""
    # ``with`` guarantees the handle is closed even if the write raises;
    # the original leaked the handle on error.
    with codecs.open(filename, "w", "utf-8") as handle:
        handle.write(content)
def save_jsonfile(filename, json):
    """Serialize *json* (any JSON-encodable object) to *filename* as UTF-8."""
    # ``with`` closes the file even if dumps/write raises.
    with codecs.open(filename, "w", "utf-8") as handle:
        handle.write(simplejson.dumps(json))
def load_jsonfile(filename):
    """Read *filename* (UTF-8) and return its parsed JSON content.

    Raises IOError if the file does not exist (callers rely on this for
    cache misses).
    """
    # The original never closed the handle; ``with`` fixes the leak.
    with codecs.open(filename, 'r', "utf-8") as handle:
        return simplejson.loads(handle.read())
def _format_timestamp(ms):
    """Format a millisecond count as an SRT timestamp '00:MM:SS,mmm'.

    TED talks are shorter than one hour, so the hour field is fixed at 00.
    """
    minutes, remainder = divmod(ms, 60000)
    seconds, millis = divmod(remainder, 1000)
    return "00:%02d:%02d,%03d" % (minutes, seconds, millis)


def captions_to_sub(captions, offset):
    """Convert TED's JSON caption list into SubRip (SRT) subtitle text.

    Each caption dict carries ``content``, ``startTime`` (ms),
    ``duration`` (ms) and ``startOfParagraph``.  *offset* (int or numeric
    string, ms) is added to every start time.  The input list is not
    modified (the original mutated each caption's startTime in place).
    """
    offset = int(offset)
    blocks = []
    # Build the pieces in a list and join once -- repeated ``+=`` on a
    # string is quadratic.  Sequence numbers start at 1 per the SRT spec.
    for index, caption in enumerate(captions, 1):
        start = caption['startTime'] + offset
        end = start + caption['duration']
        blocks.append("%d%s%s --> %s%s%s%s%s" % (
            index, os.linesep,
            _format_timestamp(start), _format_timestamp(end), os.linesep,
            caption['content'], os.linesep, os.linesep))
    return ''.join(blocks)
def fetch_subtitle(id, lang, offset): | |
print " > fetching subtitle (%s)... " % lang.upper() | |
subtitle_url = SUBTITLE_URL % (id, lang) | |
subtitle_html = SUBTITLE_HTML % (id, lang) | |
f = urllib2.urlopen(SUBTITLE_URL % (id, lang)) | |
json = unicode(f.read(), "utf-8") | |
subtitle = captions_to_sub(simplejson.loads(json)['captions'], offset) | |
f = urllib2.urlopen(subtitle_html) | |
cleaner = Cleaner(remove_tags=['a']) | |
subtitle_html = unicode(cleaner.clean_html(f.read()), "utf-8") | |
return [subtitle, subtitle_html] | |
def fetch_comments(id): | |
print " > fetching comments..." | |
wrapper = """ | |
<html> | |
<body> | |
<link rel="stylesheet" type="text/css" media="screen" href="http://www.ted.com/css/comments.css" /> | |
<div id="conversations"> | |
<div id="discussion" class="silverBorderPanel"> | |
<div class="clearfix"></div> | |
<h3>Comment on this Talk</h3> | |
<div class="clearfix"> </div> | |
<div class="comment-container">%s</div> | |
</div> | |
</div></body></html> | |
""" | |
try: | |
f = urllib2.urlopen(COMMENTS_HTML % id) | |
return wrapper % unicode(f.read(), "utf-8") | |
except: | |
return '' | |
def parse_info(html): | |
""" | |
Parse requested page and look for title, talk id, and languages transcripted. | |
""" | |
filepattern = r'/talks/([_\w]+\-light\.mp4)' if VIDEO_LOWRES \ | |
else r'/talks/([_\w]+\.mp4)' | |
match = re.search(filepattern, html) | |
if not match: | |
print "No video link" | |
sys.exit(-1) | |
download_url = "http://download.ted.com/talks/%s" % match.group(1) | |
title = re.search(r'altheadline\ \=\ \'([^\']+)\'', html).group(1) | |
offset = re.search(r'introDuration:(\d+)', html).group(1) | |
id = re.search(r'ti:\"(\d+)\"', html).group(1) | |
forum_id = re.search(r'forum:\ (\d+)', html).group(1) | |
doc = lxml.html.fromstring(html) | |
select = doc.cssselect("#languageCode")[0] | |
langs = select.value_options | |
return {'id':id, | |
'forum_id': forum_id, | |
'title': title, | |
'offset': offset, | |
'langs': langs, | |
'download_url': download_url | |
} | |
def fetch_page_info(url, offset, file_pattern):
    """Download a talk page, then save its subtitles and comments.

    *offset* is a caller-supplied subtitle delay in seconds; when non-zero
    it overrides the page's own introDuration (which is sometimes wrong --
    see the usage text in main()).  Returns parse_info()'s dict extended
    with per-language subtitle data and the comments HTML.
    """
    # Seconds -> milliseconds.  int() first: the original multiplied the
    # raw argv *string* by 1000, repeating the text instead of scaling it.
    offset = int(offset) * 1000
    f = urllib2.urlopen(url)
    html = f.read()
    info = parse_info(html)
    # BUG FIX: the original computed ``offset`` and then ignored it,
    # always passing the page's value; honour a non-zero override.
    subtitle_offset = offset if offset else info['offset']
    for lang in set(info['langs']).intersection(DOWNLOADABLE_LANGS):
        subtitle, subtitle_html = fetch_subtitle(info['id'], lang, subtitle_offset)
        save_to(file_pattern % (lang, "sub"), subtitle)
        save_to(file_pattern % (lang, "html"), subtitle_html)
        info[lang] = {'subtitle': subtitle, 'subtitle_html': subtitle_html}
    comments = fetch_comments(info['forum_id'])
    info['comments'] = comments
    save_to(file_pattern % ("all", "comments.html"), comments)
    return info
def download(url, filename):
    """Download *url* to *filename* with wget (-c resumes partial files).

    Returns wget's exit status.
    """
    # Argument-list form (shell=False) avoids shell injection through
    # crafted URLs or filenames; the original interpolated both into a
    # shell command string.
    return call(['/usr/bin/wget', '-c', '-O', filename, url])
def get_and_save(url, output_dir, offset): | |
print "Getting", url | |
title = slugify(url.split('/')[-1].replace('.html', '')) | |
file_pattern = "%s%s-%%s.%%s" % (output_dir, title) | |
cachefile = "%s%s.%s" % (output_dir, title, "info") | |
try: | |
video_info = load_jsonfile(cachefile) | |
except IOError: | |
video_info = fetch_page_info(url, offset, file_pattern) | |
save_jsonfile(cachefile, video_info) | |
# save_all(video_info) # TODO move all save_to into this | |
if DOWNLOAD_VIDEO: | |
download(video_info['download_url'], file_pattern % ("en", "mp4")) | |
def main_pipe(): | |
urls = sys.stdin.read() | |
for url in [u.strip() for u in urls.split(os.linesep) if u]: | |
try: | |
if not url.startswith('#'): | |
get_and_save(url, OUTPUT_DIR, 0) | |
except: | |
print "Failed to fetch %s" % url | |
def main(): | |
if len(sys.argv) < 2: | |
print "TED.com video/subtitle downloader" | |
print "Usage #1: %s TALK-URL [offset in seconds]" % sys.argv[0] | |
print "Usage #2: %s < video-list.txt" % sys.argv[0] | |
print "Eg. %s http://www.ted.com/talks/james_nachtwey_s_searing_pictures_of_war.html" % sys.argv[0] | |
print "Supply [offset] to override one specified by the page (sometimes wrong)." | |
sys.exit(2) | |
try: | |
offset = sys.argv[2] | |
except: | |
offset = 0 | |
try: | |
get_and_save(sys.argv[1], OUTPUT_DIR, offset) | |
except: | |
raise | |
if __name__ == '__main__':
    # An interactive terminal means arguments on argv; piped stdin means
    # a URL list to process in batch mode.
    if sys.stdin.isatty():
        main()
    else:
        main_pipe()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment