-
-
Save sumodx/1303700 to your computer and use it in GitHub Desktop.
Download lecture videos of ai-class, with basic resume support
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Download the lecture videos of one ai-class unit, with basic resume support.

Usage:
    Go to the command line and type
        python ai-class.py "topic-name"
    topic-name can be, for example, "Welcome to AI" or "Problem Solving".

If a download is interrupted, delete the last, partially downloaded file and
restart. The script skips all existing files and starts with the next new
file.

PS: Python 2.6.2 should be installed on your system.
Let me know if you have any problems.
"""
__author__ = "Deepak.G.R."
__credits__ = "Sumod Hajela"
__license__ = 'Public Domain'

# NOTE(review): the star imports below supply urlopen, unquote, urlretrieve,
# parse_qs, urlparse, path, mkdir and chdir to the rest of the script.
# `from os import *` also rebinds the builtin `open` to `os.open`; keep that in
# mind before adding file I/O to this module.
from urllib import *
from urlparse import *
from sgmllib import SGMLParser
import re
import pdb
import sys
import json
from os import *
import os.path

# Prefix that is joined with a video id to form a YouTube watch URL.
url_youtube = 'http://www.youtube.com/watch?v='

# Title of the unit to download, taken from the first command-line argument,
# e.g. 'problem solving' or 'welcome to AI'.
if len(sys.argv) < 2:
    # Show the usage text instead of dying with a bare IndexError.
    sys.exit(__doc__)
req_unit = sys.argv[1]

# Maps a quiz number (string of digits) to its YouTube video id; filled by
# init_quiz_hash() and read by UrlLister.
quiz_hash = {}
class UrlLister(SGMLParser):
    """Collect YouTube watch URLs from the course page for the requested unit.

    The parser relies on three module-level names: ``req_unit`` (the unit
    title given by the user), ``quiz_hash`` (quiz number -> YouTube id) and
    ``url_youtube`` (watch-URL prefix).

    Attributes set by reset():
        urls     -- list of collected watch URLs, in page order.
        flag     -- 1 once page text matching ``req_unit`` has been seen;
                    reset to 0 by every ``/course/topic/<n>`` link.
        req_unit -- the requested unit title (reduced to letters only as
                    soon as the first text node is handled).
        names    -- empty list; not filled by this class.
    """

    def reset(self):
        """Reset the parser state; SGMLParser also calls this on construction."""
        SGMLParser.reset(self)
        self.urls = []
        self.flag = 0
        self.req_unit = req_unit
        self.names = []

    def _first_attr(self, attrs, wanted):
        """Return the value of the first attribute named *wanted*, or None."""
        for name, value in attrs:
            if name == wanted:
                return value
        return None

    def start_a(self, attrs):
        """Handle an ``<a>`` tag: track topic links and record video links.

        Anchors without an ``href`` (or video links whose ``id`` cannot be
        turned into a video id) are skipped instead of raising.
        """
        href = self._first_attr(attrs, 'href')
        if href is None:
            # e.g. <a name="..."> -- nothing to inspect.
            return
        href = str(href)

        # Every topic link switches collection off until matching text is
        # seen again in handle_data().
        if re.search(r'/course/topic/(\d)+', href):
            self.flag = 0

        is_video_link = re.search(r'/course/video/\w+/\d+$', href)
        if not (is_video_link and self.flag == 1):
            return

        anchor_id = self._first_attr(attrs, 'id')
        if anchor_id is None:
            return

        if 'quiz' in anchor_id:
            # Quiz links carry only a quiz number; the video id comes from
            # the table built by init_quiz_hash().
            found = re.findall(r'quiz_(\d+)', anchor_id)
            if not found:
                return
            video_id = quiz_hash.get(found[0])
            if video_id is None:
                print('WARNING: no video id known for quiz %s; skipping.'
                      % found[0])
                return
        else:
            # Plain video links embed the video id in the anchor id itself.
            found = re.findall(r'video_\d+_(.+)', anchor_id)
            if not found:
                return
            video_id = found[0]

        self.urls.append(url_youtube + video_id)

    def handle_data(self, text):
        """Switch collection on when page text matches the requested unit.

        Both the page text and the requested title are reduced to ASCII
        letters, so the text is safe to use as a regular expression.
        """
        if self.flag != 0:
            return
        text = re.sub(r'[^A-Za-z]', '', text.strip())
        self.req_unit = re.sub(r'[^A-Za-z]', '', self.req_unit)
        # NOTE(review): this succeeds when the page text is a substring of
        # the requested title, so short texts (e.g. "AI") can match too --
        # confirm this is intended.
        if text and re.search(text, self.req_unit, re.IGNORECASE):
            self.flag = 1
def init_quiz_hash():
    """Fill the module-level ``quiz_hash`` with quiz number -> YouTube id.

    Fetches the QuizQuestion JSON feed from ai-class.com and, for every
    entry under its ``'data'`` key, extracts the ``youtube_id`` /
    ``quiz_question`` pair. Keys and values are stored as strings.
    """
    print('STATUS: Initializing quiz_id hash')
    response = urlopen('http://www.ai-class.com/course/json/filter/QuizQuestion')
    try:
        data = json.load(response)
    finally:
        # Release the connection even when the payload is not valid JSON.
        response.close()

    # NOTE(review): the pair is pulled out of the Python 2 repr() of each
    # entry (hence the u'' prefix in the pattern), which only matches when
    # 'youtube_id' is printed before 'quiz_question'. The entry layout is not
    # visible here, so the original extraction is kept -- confirm against
    # the feed before switching to direct key access.
    pair_pattern = re.compile(
        r"'youtube_id': u'(.+?)',.*?'quiz_question': (\d+?),")
    for entry in data['data']:
        pairs = pair_pattern.findall(str(entry))
        if pairs:
            youtube_id, quiz_number = pairs[0]
            quiz_hash[quiz_number] = youtube_id
    print('STATUS: quiz_id Initialized.')
def _format45_url(video_info):
    """Return the download URL of the format-45 stream, or None if absent.

    *video_info* is the parsed get_video_info response: the position of the
    entry starting with '45' in ``fmt_list`` selects the ``itag`` value that
    holds the URL.
    """
    formats = video_info.get('fmt_list', [''])[0].split(',')
    for position, entry in enumerate(formats):
        if entry.startswith('45'):
            break
    else:
        return None
    streams = video_info.get('itag', [])
    if position >= len(streams):
        return None
    found = re.findall(r'45,url=(.*)', streams[position])
    if not found:
        return None
    return found[0]


def download_video(urls):
    """Download every video in *urls* into the directory 'lecture <unit>'.

    urls -- iterable of YouTube watch URLs (each must carry a ``v`` query
            parameter).

    The directory is created in the current working directory if needed.
    Files that already exist are skipped, which gives basic resume support.
    Each video is first written to '<title>.flv.part' and renamed once
    complete, so an interrupted download is not mistaken for a finished
    file. Videos without usable metadata or without a format-45 stream are
    reported and skipped. On return (or error) the working directory is set
    to the directory containing the script.
    """
    dirname = 'lecture ' + str(req_unit)
    py_path = os.path.dirname(os.path.abspath(sys.argv[0]))
    if not os.path.exists(dirname):
        os.mkdir(dirname)
    os.chdir(dirname)
    try:
        for video_url in urls:
            video_id = parse_qs(urlparse(video_url).query)['v'][0]
            response = urlopen(
                "http://www.youtube.com/get_video_info?video_id=" + video_id)
            try:
                get_vars = parse_qs(unquote(response.read()))
            finally:
                response.close()

            if 'title' not in get_vars:
                print('WARNING: no video info for %s; skipping.' % video_url)
                continue

            # A path separator inside the title would point the file into a
            # (probably missing) sub-directory.
            title = get_vars['title'][0]
            for separator in (os.sep, os.altsep):
                if separator:
                    title = title.replace(separator, '_')
            title += '.flv'
            if os.path.isfile(title):
                continue

            link = _format45_url(get_vars)
            if link is None:
                print('WARNING: format 45 not available for %s; skipping.'
                      % video_url)
                continue

            print('\n-->Downloading, Title:  %s' % title)
            partial = title + '.part'
            urlretrieve(link, partial)
            os.rename(partial, title)
    finally:
        # Leave the download directory even if a download failed.
        os.chdir(py_path)
def main():
    """Run the whole download: quiz table, course page scan, video download."""
    init_quiz_hash()

    page = urlopen("http://www.ai-class.com/home/")
    try:
        htmlSource = page.read()
    finally:
        # Close the connection even if reading the page fails.
        page.close()

    parser = UrlLister()
    print('STATUS: Fetching video urls.')
    parser.feed(htmlSource)
    # close() makes the parser process any data it is still buffering, so
    # call it before the URLs are counted.
    parser.close()
    print('STATUS: SUCCESS')

    print('Number of videos:  %d' % len(parser.urls))
    print('STATUS: Starting download.')
    download_video(parser.urls)
    print('\n\n*********Download Finished*********')


if __name__ == "__main__":
    main()
I didn't test it, but there is a Greasemonkey script for this: http://www.notesandreviews.com/education/downloading-ai-class-transcripts-from-youtube
How do I use this script if I'm behind an HTTP proxy?
OK, I figured out how to use it with an HTTP proxy. See my fork of this gist (https://gist.github.com/1360366) for details.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi,
Thanks for the previous answer. Nevertheless, I have another question: how can I download the captions within the videos?
Thanks again :D