Last active
June 16, 2018 13:34
-
-
Save lamberta/6531309 to your computer and use it in GitHub Desktop.
Download all the lecture videos from a Coursera class index page. Requires phantomjs and curl.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env phantomjs
/**
 * Scrapes a Coursera class index page and downloads the
 * individual lecture mp4s. Requires phantomjs and curl.
 * Usage: ./coursera-slurp [options] index-url
 *
 * If something breaks, it's probably the DOM selector in the
 * remote page. Try fiddling with the 'linkSelector' function.
 */
/* Script-wide state shared by the functions below.
 * parseOpts is a hoisted function declaration, so calling it here is safe.
 */
var page = require('webpage').create();
var spawn = require("child_process").spawn;
var args = require('system').args;
var opts = parseOpts(args); //parsed command-line options
var userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36";
var max_proc = 5; //max concurrent child processes, 0 for unlimited
var return_tally = 0; //keep track of returned "threads"
var linkqueue; //collection of link objects, filled once the index page has loaded
/* Print the usage summary and quit when help was requested. */
if (opts.help) {
    [
        "Usage: coursera-slurp [options] index-url",
        "Options:",
        " -h, --help Print help",
        " -v, --verbose Print verbose output",
        " -D, --dry-run Display remote files found but don't download",
        " -A, --auth=token Supply login token, this is the 'CAUTH' cookie",
        " -C, --cookies=string Use additional cookies"
    ].forEach(function (line) {
        console.log(line);
    });
    phantom.exit(0);
}
/* Hand every cookie given on the command line to PhantomJS, scoped to
 * the coursera.org domain, then set the user agent used for the page load.
 */
if (opts.cookies) {
    //Object.keys only visits own properties, unlike a bare for...in
    Object.keys(opts.cookies).forEach(function (name) {
        phantom.addCookie({
            'name': name, //required
            'value': opts.cookies[name], //required
            'domain': '.coursera.org' //required
        });
    });
}

page.settings.userAgent = userAgent;

if (opts.verbose) {
    console.log("Downloading page: " + opts.url);
}
/* Load the index page, collect the lecture links from it and start
 * the first batch of downloads. Each download started here keeps
 * pulling from 'linkqueue' until it is empty (see downloadLink).
 */
page.open(opts.url, function (status) {
    if (status !== 'success') {
        console.warn('Failed to load url, aborting.');
        phantom.exit(1);
        return; //do not rely on phantom.exit() halting this callback
    }

    //guard against a null/undefined result before copying the array
    linkqueue = (page.evaluate(linkSelector) || []).concat();

    if (linkqueue.length === 0) {
        console.error("No files found!");
        phantom.exit(1);
        return;
    }

    //parseOpts stores the flag as 'dry_run'
    if (opts.verbose || opts.dry_run) {
        console.log("Found files:");
        linkqueue.forEach(function (link) {
            console.log(getLinkURL(link) + " => " + getLinkFilename(link));
        });
    }

    if (opts.dry_run) {
        phantom.exit(0);
        return;
    }

    //off we go ...
    //0 means unlimited. Also never start more workers than there are links:
    //downloadLink exits only once return_tally equals max_proc, and it adds
    //one tally per worker that finds the queue empty, so a max_proc larger
    //than the number of started workers would keep the script alive forever.
    if (max_proc === 0 || max_proc > linkqueue.length) {
        max_proc = linkqueue.length;
    }

    for (var i = 0; i < max_proc; i++) {
        window.setTimeout(downloadLink, 0, linkqueue.shift());
    }
});
/* Report script errors raised inside the loaded page.
 * The 'trace' argument is accepted but not printed.
 */
page.onError = function (msg, trace) {
    console.error("Error in remote page:\n\t", msg);
};
/* Evaluated in webpage, limited cross-pollination with phantomjs:
 * it is passed to page.evaluate, so it may only touch the page's DOM.
 * Keep it ES5 and free of references to this script's variables.
 * @return {Array} Links in page order: [link1, link2, ...]
 *   A link object looks like: {id: 'lectureid', title: 'lecture title', pos: idx}
 */
function linkSelector () {
    var anchors = document.querySelectorAll('.lecture-link');

    //querySelectorAll returns an array-like, so borrow Array's map
    return Array.prototype.map.call(anchors, function (elem, idx) {
        return {
            id: elem.getAttribute('data-lecture-id'), //url-id
            title: elem.innerText.trim(), //link title
            pos: idx //position in page
        };
    });
}
/* Downloads the given link by spawning a curl child process.
 * On finish, this "thread" either downloads the next queued link or,
 * when the queue is empty, adds itself to 'return_tally'; the worker
 * that brings the tally up to 'max_proc' closes the application.
 * @param {Object} link Link object as produced by linkSelector.
 */
function downloadLink (link) {
    var filename = getLinkFilename(link),
        downloadurl = getLinkURL(link),
        curlOpts = ['--location', '--user-agent', userAgent, downloadurl, '--output', filename];

    //forward the cookies PhantomJS holds so curl is authenticated as well
    if (Array.isArray(phantom.cookies) && phantom.cookies.length > 0) {
        var cookieStr = phantom.cookies.map(function (cookie) {
            return cookie.name + "=" + cookie.value;
        }).join("; ");
        curlOpts.unshift('--cookie', cookieStr);
    }

    if (opts.verbose) {
        curlOpts.unshift('--verbose');
        console.log("curl options: " + curlOpts.join(' '));
    }

    //download
    var proc = spawn('curl', curlOpts);
    console.log("Downloading " + filename);

    //on finish, use this "thread" to download the next link or ring finished bell
    proc.on('exit', function (status) {
        if (opts.verbose) {
            console.log("Download finished for " + filename + ", status: " + status);
        }
        //always surface a failed transfer, not just in verbose mode
        if (status !== 0) {
            console.error("Download failed for " + filename + ", curl exit status: " + status);
        }

        if (linkqueue.length > 0) {
            downloadLink(linkqueue.shift());
            return;
        }

        return_tally += 1;
        if (return_tally === max_proc) {
            console.log("All downloads complete!");
            phantom.exit();
        }
    });

    if (opts.verbose) {
        var logOut = function (data) { console.log(data); };
        proc.stdout.on('data', logOut);
        proc.stderr.on('data', logOut);
    }
}
/* Construct download link from Link id.
 * Uses the index URL from the command line (opts.url) as the base,
 * dropping one trailing slash if present.
 * @param {Object} link Link object; only 'id' is read.
 * @return {string} URL of file download.
 */
function getLinkURL (link) {
    var base = opts.url.replace(/\/$/, '');
    return base + "/download.mp4?lecture_id=" + link.id;
}
/* Create an output filename based on the Link title.
 * Whitespace becomes '-', and every character that is not a word
 * character or '-' is dropped. When the link has a numeric 'pos',
 * its 1-based position is prepended, zero-padded to three digits.
 * @param {Object} link Link object; 'title' and 'pos' are read.
 * @return {string} Local filename, in form: '001-First-Lecture.mp4'
 */
function getLinkFilename (link) {
    var slug = link.title.replace(/\s/g, '-').replace(/[^\w-]/g, ''),
        filename = slug + '.mp4';

    if (typeof link.pos === 'number') {
        var ordinal = link.pos + 1,
            prefix = (ordinal < 100) ? ("00" + ordinal).slice(-3) : String(ordinal); //pad
        filename = prefix + '-' + filename;
    }
    return filename;
}
/* Parse command-line options.
 * Options that take a value accept both spellings shown in the help
 * text: a separate argument ('-A token', '--auth token') or the inline
 * long form ('--auth=token', '--cookies=a=1; b=2').
 * @param {Array} args Argument list; args[0] is the script name and the
 *   last entry is taken as the index url.
 * @return {Object} Options: url, plus help, verbose, dry_run (each true when
 *   set) and cookies (name => value map) when any were supplied.
 */
function parseOpts (args) {
    var opts = {
        url: args[args.length-1] //url should be last
    };

    //store one cookie, creating the map on first use so earlier entries survive
    function setCookie (name, value) {
        if (!opts.cookies) { opts.cookies = {}; }
        opts.cookies[name] = value;
    }

    //parse a 'name=value; name=value' cookie string
    function addCookies (str) {
        str.split(';').forEach(function (entry) {
            var pair = entry.trim(),
                eq = pair.indexOf('=');
            if (pair === '') { return; } //e.g. trailing ';'
            //split on the first '=' only, the value may itself contain '='
            if (eq === -1) {
                setCookie(pair, '');
            } else {
                setCookie(pair.slice(0, eq), pair.slice(eq + 1));
            }
        });
    }

    if (args.length < 2) {
        opts.help = true;
        return opts;
    }

    for (var i = 1, len = args.length; i < len; i++) {
        var flag = args[i],
            inline = null, //value from the '--name=value' form, if any
            eq = flag.indexOf('='),
            value;

        if (flag.slice(0, 2) === '--' && eq !== -1) {
            inline = flag.slice(eq + 1);
            flag = flag.slice(0, eq);
        }

        switch (flag) {
        case '-h': case '--help':
            opts.help = true;
            break;
        case '-D': case '--dry-run':
            opts.dry_run = true;
            break;
        case '-v': case '--verbose':
            opts.verbose = true;
            break;
        case '-C': case '--cookies':
            //without an inline value, consume the next argument
            value = (inline !== null) ? inline : args[++i];
            if (typeof value === 'string') { addCookies(value); }
            break;
        case '-A': case '--auth':
            value = (inline !== null) ? inline : args[++i];
            if (typeof value === 'string') { setCookie('CAUTH', value); }
            break;
        }
    }
    return opts;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I tried this script using
phantomjs coursera-slurp.js --auth=myCAUTHkey https://class.coursera.org/algs4partII-003/lecture
and it returns no files. However, executing linkSelector()
in the browser console returns 12 links. My PhantomJS version is 1.9.7. What am I missing here? Any help would be great.