Created
November 30, 2011 19:54
-
-
Save nixmaniack/1410533 to your computer and use it in GitHub Desktop.
Download lecture videos of ai-class (with Subtitles) (Stanford)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
__author__ = "Deepak.G.R." | |
__license__ = 'Public Domain' | |
""" | |
usage: | |
Go to command line and type | |
python ai-class.py "topic-name" | |
topic-names can be "Welcome to AI", "Problem Solving" | |
PS: Python 2.7.2 should be installed in your system. | |
Let me know if you get into any problems. | |
""" | |
from xml.etree import ElementTree as ET | |
from urllib import * | |
from urlparse import * | |
from sgmllib import SGMLParser | |
import os | |
from json import * | |
import re | |
import pdb | |
import sys | |
import json | |
import urllib2 | |
# YouTube format code ("itag") selecting which stream download_video() fetches:
#   34 -> 640x360
#   35 -> 854x480 (default)
#   22 -> 1280x720
code = 35

# Format 22 is saved as MP4; every other supported code is saved as FLV.
video_fmt = '.mp4' if code == 22 else '.flv'

url_youtube = 'http://www.youtube.com/watch?v='

# Maps a quiz id (string) to the list of YouTube video ids belonging to it;
# filled in by init_quiz_hash().
quiz_hash = dict()

# The unit (topic) name is the first command-line argument. Exit with a
# usage message instead of an IndexError traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit('usage: python ai-class.py "topic-name"')
req_unit = sys.argv[1]
class UrlLister(SGMLParser):
    """Collect YouTube links for the videos of the requested course unit.

    The parser scans the page in document order. A link to
    ``/course/topic/<n>`` marks the start of a unit; when the text that
    follows matches the requested unit name (letters only, ignoring case),
    every ``/course/video/...`` link up to the next topic link is recorded
    in ``self.urls``.
    """

    def reset(self):
        """Reset the parser state (SGMLParser calls this on construction)."""
        SGMLParser.reset(self)
        self.urls = []
        # 0: still looking for the requested unit's title.
        # 1: inside the requested unit, collecting its video links.
        self.flag = 0
        # Unit names are compared on their letters only, case-insensitively;
        # normalise the requested name once instead of on every text node.
        self.req_unit = re.sub(r'[^A-Za-z]', '', req_unit).lower()
        self.names = []

    @staticmethod
    def _first_attr(attrs, wanted):
        """Return the first value of attribute *wanted* in *attrs*, or None."""
        for name, value in attrs:
            if name == wanted:
                return value
        return None

    def start_a(self, attrs):
        """Handle an ``<a>`` tag given its list of (name, value) attributes."""
        href = self._first_attr(attrs, 'href')
        if href is None:
            # Anchors without an href carry no link to inspect.
            return
        href = str(href)

        # Any topic link ends the current unit; handle_data() decides
        # whether the unit that starts here is the requested one.
        if re.search(r'/course/topic/(\d)+', href):
            self.flag = 0

        if self.flag != 1 or not re.search(r'/course/video/\w+/\d+$', href):
            return

        element_id = self._first_attr(attrs, 'id')
        if element_id is None:
            # The video id is encoded in the element id; nothing to extract.
            return

        if 'quiz' in element_id:
            # A quiz maps to zero or more videos through quiz_hash; quizzes
            # that are not in the hash are skipped instead of raising.
            quiz_ids = re.findall(r'quiz_(\d+)', element_id)
            video_ids = quiz_hash.get(quiz_ids[0], []) if quiz_ids else []
        else:
            video_ids = re.findall(r'video_\d+_(.+)', element_id)[:1]

        for video_id in video_ids:
            self.urls.append(url_youtube + video_id)

    def handle_data(self, text):
        """Start collecting once the requested unit's title text is seen."""
        if self.flag != 0:
            return
        text = re.sub(r'[^A-Za-z]', '', text).lower()
        if text and text == self.req_unit:
            self.flag = 1
class youtubeSub:
    """Fetch a YouTube caption track and save it as an SRT subtitle file.

    ``srt_string`` accumulates the pieces of the SRT text and ``title`` is
    the file name used inside the ``subtitles`` directory.
    """

    def __init__(self):
        self.srt_string = list()
        self.title = ''

    @staticmethod
    def _ascii_only(text):
        """Return *text* as a string with every non-ASCII character dropped."""
        if isinstance(text, bytes):
            text = text.decode('ascii', 'ignore')
        return text.encode('ascii', 'ignore').decode('ascii')

    def time_format(self, secs):
        """Split a time in seconds into SRT timestamp components.

        Returns a tuple ``(hrs, mins, secs, msecs)`` where the first three
        items are ints and ``msecs`` is a three-digit, zero-padded string.
        """
        # Work in whole milliseconds. Splitting the float's repr on '.'
        # breaks on exponent notation and on values such as
        # 3.3000000000000003, which would yield a 16-digit "millisecond".
        total_ms = int(round(float(secs) * 1000))
        total_secs, msecs = divmod(total_ms, 1000)
        mins, secs = divmod(total_secs, 60)
        hrs, mins = divmod(mins, 60)
        return (hrs, mins, secs, '%03d' % msecs)

    def store_line(self, line, hrs, mins, secs, msecs):
        """Append an ``HH:MM:SS,mmm`` timestamp to ``srt_string``.

        *msecs* is a string of fractional-second digits; it is right-padded
        with zeros and cut to exactly three digits. *line* is accepted for
        interface compatibility and is not used.
        """
        ms = (str(msecs) + '000')[:3]
        self.srt_string.append('%02d:%02d:%02d,%s' % (hrs, mins, secs, ms))

    def parse_data(self, data):
        """Convert caption XML in *data* to SRT and write it with write_sub().

        Each child of the root element is expected to carry ``start`` and
        ``dur`` attributes (seconds) and the caption text. Data that is not
        well-formed XML is ignored and no file is written.
        """
        try:
            tree = ET.fromstring(data)
        except ET.ParseError:
            return
        number = 0
        for cue in tree:
            try:
                start = float(cue.attrib['start'])
                end = start + float(cue.attrib['dur'])
            except (KeyError, ValueError):
                # A cue without usable timing cannot be placed in the file.
                continue
            number += 1
            self.srt_string.append(str(number) + '\n')
            self.store_line(number, *self.time_format(start))
            self.srt_string.append(' --> ')
            self.store_line(number, *self.time_format(end))
            # Empty elements have text None; emit an empty caption instead
            # of failing on the concatenation.
            self.srt_string.append('\n' + (cue.text or '') + '\n\n')
        self.write_sub()

    def write_sub(self):
        """Write the accumulated SRT text to ``subtitles/<title>``.

        The directory is created in the current working directory when it
        does not exist. Non-ASCII characters are dropped from the output.
        """
        dirname = 'subtitles'
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        # The title comes from a remote response; keep it from addressing
        # anything outside the subtitles directory.
        filename = self.title.replace('/', '_').replace(os.sep, '_')
        text = ''.join(self._ascii_only(piece) for piece in self.srt_string)
        with open(os.path.join(dirname, filename), 'w') as fobj:
            fobj.write(text)

    def get_subtitle(self, get_vars):
        """Download and save the English caption track described by *get_vars*.

        *get_vars* maps parameter names to lists of values (it is indexed
        as ``get_vars[name][0]``). When any parameter needed to build the
        caption URL is missing, the video is treated as having no subtitles
        and nothing is written.
        """
        self.title = get_vars['title'][0] + '.srt'
        try:
            sub_link = '&'.join([
                get_vars['ttsurl'][0],
                'expire=' + get_vars['expire'][0],
                'key=' + get_vars['key'][0],
                'format=1',
                'hl=en',
                'ts=' + get_vars['timestamp'][0],
                'v=' + get_vars['video_id'][0],
                'lang=en',
                'type=track',
                'name=English via dotsub',
                'kind=&asr_langs=en,ja&caps=asr',
                'signature=' + get_vars['signature'][0],
            ])
        except (KeyError, IndexError):
            return
        response = urlopen(sub_link)
        try:
            data = response.read()
        finally:
            response.close()
        self.parse_data(data)
def init_quiz_hash():
    """Fill the module-level ``quiz_hash`` from the course's quiz listing.

    Downloads the QuizQuestion JSON document and, for every entry under its
    ``'data'`` key, records the YouTube ids found in that entry under the
    entry's quiz id. Keys are the quiz ids as strings; values are lists of
    video ids.
    """
    print('STATUS: Initializing quiz_id hash')
    quiz_url = 'http://www.ai-class.com/course/json/filter/QuizQuestion'
    response = urllib2.urlopen(quiz_url)
    try:
        data = json.load(response)
    finally:
        response.close()
    for entry in data['data']:
        # The fields are pulled out of the entry's textual form, so they
        # are found wherever they are nested inside it.
        piece = str(entry)
        # Accept the id whether or not it is the last key of its dict
        # (followed by ',' or '}'), and string values with or without the
        # Python 2 u'' prefix.
        match = re.search(r'\'quiz_question\': (\d+)(?=[,}])', piece)
        v_id = re.findall(r'\'youtube_id\': u?\'(.+?)\'', piece)
        if match and v_id:
            quiz_hash.setdefault(match.group(1), []).extend(v_id)
    print('STATUS: quiz_id Initialized.')
def download_video(urls):
    """Download every video in *urls*, plus subtitles, into the unit folder.

    The folder is named after the requested unit. When it already exists,
    its most recently modified file is deleted first by
    delete_recent_video(). Videos whose file is already present are
    skipped. A video whose information or stream in the configured format
    cannot be found is reported and skipped without aborting the rest.
    """
    start_dir = os.getcwd()
    dirname = str(req_unit)
    if os.path.exists(dirname):
        # delete_recent_video() changes into dirname itself.
        delete_recent_video(dirname)
    else:
        os.mkdir(dirname)
        os.chdir(dirname)

    # The lookahead keeps the format code from matching a longer itag that
    # merely starts with the same digits.
    itag_pattern = re.compile(r'.*itag=' + str(code) + r'(?!\d)')
    try:
        for video_url in urls:
            video_id = parse_qs(urlparse(video_url).query)['v'][0]
            response = urlopen("http://www.youtube.com/get_video_info?video_id=" + video_id)
            try:
                get_vars = parse_qs(unquote(response.read()))
            finally:
                response.close()

            if 'title' not in get_vars:
                # No title means there is nothing to name the file after.
                print(' '.join(["ERROR: Couldn't Download video: ", video_id]))
                continue
            title = get_vars['title'][0] + video_fmt
            if os.path.isfile(title):
                continue

            # First stream entry in the requested format, if there is one.
            entry = next((candidate for candidate in get_vars.get('itag', [])
                          if itag_pattern.search(candidate)), None)
            links = re.findall(r'\d+,url=(.*)', entry) if entry is not None else []
            if not links:
                print(' '.join(["ERROR: Couldn't Download video: ", title]))
                continue

            print(' '.join(['\n-->Downloading, Title: ', title]))
            urlretrieve(links[0], title)
            sub_obj = youtubeSub()
            sub_obj.get_subtitle(get_vars)
    finally:
        # Return to where we started, even when a download fails.
        os.chdir(start_dir)
def delete_recent_video(dirname):
    """Change into *dirname* and delete its most recently modified file.

    Subdirectories are never deleted. When *dirname* contains no regular
    entries (it is empty or holds only subdirectories) nothing is removed.
    The working directory is left at *dirname* on return; download_video()
    relies on that.
    """
    os.chdir(dirname)
    candidates = [entry for entry in os.listdir('.') if not os.path.isdir(entry)]
    if not candidates:
        # Nothing to delete; calling os.remove('') here would raise.
        return
    os.remove(max(candidates, key=os.path.getmtime))
def main():
    """Find the requested unit's videos on the course home page and download them."""
    init_quiz_hash()
    page = urllib2.urlopen("http://www.ai-class.com/home/")
    try:
        htmlSource = page.read()
    finally:
        # Release the connection even when reading the page fails.
        page.close()
    parser = UrlLister()
    print('STATUS: Fetching video urls.')
    parser.feed(htmlSource)
    print('STATUS: SUCCESS')
    # Flush any data the parser is still buffering before reading its result.
    parser.close()
    print(' '.join(['Number of videos: ', str(len(parser.urls))]))
    print('STATUS: Starting download.')
    download_video(parser.urls)
    print('\n\n*********Download Finished*********')


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment