Last active
August 29, 2015 14:18
-
-
Save deepakjois/439e1eb8697058735ef4 to your computer and use it in GitHub Desktop.
Coursera Lectures (and Subtitles) Download
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# AUTHOR: | |
# [email protected] | |
# (w/ credit to: http://www.brownfort.com/2014/09/scrap-websites-ruby/) | |
# | |
# DESCRIPTION: | |
# Dumps a list of direct, unprotected CDN links to Coursera | |
# lecture videos, and downloads English subtitles (.txt,.srt) for them. | |
# | |
# USAGE: | |
# ruby fetchlinks.rb <email> <password> <coursename> <section>
# | |
# <coursename> is the name of the course as it appears in the URL,
# e.g. 'configuringworld-002', 'cryptography-002', 'humanlanguage-001',
# etc.
# | |
# <section> is a 0-based integer index, obtained by counting the sections
# from the top of the Video Lectures download page.
# For example, to download the videos in the first section, which is
# usually (but not always) a Welcome or Getting Started section,
# use 0.
# | |
# | |
require 'uri' | |
require 'nokogiri' | |
require 'typhoeus' | |
# Logs in to Coursera and builds the header set reused by later requests.
#
# Sends a POST to the accounts login endpoint with a fixed, browser-like set
# of headers. When the response code is 200, the name=value pair of every
# Set-Cookie response header is appended to the "Cookie" request header.
#
# @param email [String] account e-mail address
# @param password [String] account password
# @return [Array] two elements: the Typhoeus response and the headers Hash
def login(email, password)
  login_url = "https://accounts.coursera.org/api/v1/login"
  # NOTE(review): Typhoeus documents `params:` as URL query parameters and
  # `body:` as the request body, so the credentials presumably travel in the
  # query string here. Left unchanged because the endpoint's expectations
  # cannot be confirmed from this file -- TODO confirm and consider `body:`.
  params = { email: email,
             password: password,
             webrequest: true }
  headers = { "Host" => "accounts.coursera.org",
              "Connection" => "keep-alive",
              "Origin" => "https://accounts.coursera.org",
              "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36",
              "Content-Type" => "application/x-www-form-urlencoded",
              "Accept" => "*/*",
              "X-Requested-With" => "XMLHttpRequest",
              "Referer" => "https://accounts.coursera.org/signin?post_redirect=https://www.coursera.org/",
              "X-CSRF2-Token" => "3dNSKBZddpG2w5RTNLYB9grd",
              "X-CSRF2-Cookie" => "csrf2_token_iKSpqIdU",
              "X-CSRFToken" => "hH7MfdwlVA1UGBEjqM2MWC3K",
              "Cookie" => "csrftoken=hH7MfdwlVA1UGBEjqM2MWC3K; csrf2_token_iKSpqIdU=3dNSKBZddpG2w5RTNLYB9grd" }

  response = Typhoeus::Request.new(
    login_url,
    method: :post,
    params: params,
    headers: headers
  ).run

  if response.response_code == 200
    cookies = extract_set_cookie_pairs(response.response_headers)
    headers["Cookie"] = ([headers["Cookie"]] + cookies).join("; ") unless cookies.empty?
  end

  [response, headers]
end

# Extracts the leading "name=value" pair from every Set-Cookie line of a raw
# response-header string.
#
# The header name is compared case-insensitively, and each line is split on
# the first ":" only, so a cookie value that itself contains ":" is kept whole.
#
# @param raw_headers [String, nil] raw header text, one header per line
# @return [Array<String>] cookie pairs in the order they appear
def extract_set_cookie_pairs(raw_headers)
  raw_headers.to_s.each_line.map do |line|
    name, value = line.split(":", 2)
    next unless value && name.strip.casecmp?("Set-Cookie")

    pair = value.split(";").first.to_s.strip
    pair unless pair.empty?
  end.compact
end
# Fetches the lecture page of a course and returns the links of one section.
#
# Sets the "Referer" and "Host" entries of the caller's +headers+ Hash in
# place before issuing the GET, so those values stay set for any later request
# made with the same Hash.
#
# @param course [String] course name as used in the class.coursera.org URL
# @param week [Integer] index into the ".course-item-list-section-list" nodes
# @param headers [Hash] request headers; modified in place
# @return [Array] two elements: the Typhoeus response and the section's "a"
#   nodes; the second element is nil when the response code is not 200 and an
#   empty Array when the page has no section at index +week+
def fetch_links(course, week, headers)
  url = "https://class.coursera.org/#{course}/lecture"
  headers["Referer"] = "https://class.coursera.org/#{course}"
  headers["Host"] = "class.coursera.org"

  response = Typhoeus::Request.new(
    url,
    method: :get,
    headers: headers
  ).run
  return [response, nil] unless response.response_code == 200

  sections = Nokogiri::HTML(response.response_body).css(".course-item-list-section-list")
  section = sections[week]
  if section.nil?
    # An out-of-range index used to raise NoMethodError on nil; report it instead.
    STDERR.puts "Section #{week} not found; the lecture page lists #{sections.length} section(s)."
    return [response, []]
  end

  [response, section.css("a")]
end
# Handles one lecture-page link.
#
# * A href containing '/lecture/download.mp4' is requested without following
#   redirects, and the "Location" response header is printed to standard
#   output.
# * A href containing '/lecture/subtitles' is downloaded and written to a file
#   in the current directory, named after the Content-Disposition header.
# * Any other link (including one with no href) is ignored.
#
# Progress and problems are reported on STDERR.
#
# NOTE(review): both requests pass ssl_verifypeer: false, which turns off TLS
# certificate verification while the session cookies are being sent. Kept
# as-is to preserve behaviour -- consider enabling verification.
#
# @param link [#[]] object whose 'href' entry is the link target
# @param headers [Hash] request headers to send
# @return [void]
def resolve_link(link, headers)
  href = link['href'].to_s

  if href.include?('/lecture/download.mp4')
    response = Typhoeus::Request.get(href, followlocation: false, headers: headers, ssl_verifypeer: false)
    location = response.headers_hash["Location"]
    if location
      puts location
    else
      STDERR.puts "No redirect location for #{href} (HTTP #{response.response_code})"
    end
  elsif href.include?('/lecture/subtitles')
    response = Typhoeus::Request.get(href, followlocation: true, headers: headers, ssl_verifypeer: false)
    if response.response_code == 200
      filename = subtitle_filename(response.headers_hash["Content-Disposition"], href)
      File.open(filename, 'w') { |file| file.write(response.response_body) }
      STDERR.puts "Done saving #{filename}"
    else
      # No filename is known on this path, so identify the failure by its URL.
      STDERR.puts "Problem saving #{href}"
      STDERR.puts response.response_code
      STDERR.puts response.response_body
    end
  end
end

# Chooses the local file name for a downloaded subtitle.
#
# Uses the percent-decoded "filename=" value of the Content-Disposition header
# with "/" replaced by "-". When the header is missing or has no filename, a
# name is derived from the last path segment of the URL instead.
#
# @param content_disposition [String, nil] Content-Disposition header value
# @param href [String] URL the subtitle was fetched from
# @return [String] file name containing no "/"
def subtitle_filename(content_disposition, href)
  match = content_disposition.to_s.match(/filename=(\"?)(.+)\1/)
  name =
    if match
      # URI.unescape was removed in Ruby 3.0; the RFC 2396 parser still
      # provides the same percent-decoding.
      URI::RFC2396_Parser.new.unescape(match[2]).gsub("/", "-")
    else
      href.split("/").last.to_s.gsub(/[^\w.\-]+/, "-")
    end
  name.empty? ? "subtitles.txt" : name
end
# Logs in, fetches the links of one section and resolves each of them,
# reporting progress and failures on STDERR.
#
# @param email [String] account e-mail address
# @param password [String] account password
# @param course [String] course name
# @param week [Integer] 0-based section index
# @return [void]
def download_section(email, password, course, week)
  STDERR.puts "Logging in."
  response, headers = login(email, password)
  unless response.response_code == 200
    STDERR.puts "Problem Logging in."
    return
  end
  STDERR.puts "Login done."

  STDERR.puts "Fetching links."
  response, links = fetch_links(course, week, headers)
  unless response.response_code == 200
    STDERR.puts "Problem fetching links."
    return
  end
  STDERR.puts "Fetching links done."

  links.each { |link| resolve_link(link, headers) }
end

# Script entry point: expects email, password, course name and section index
# as the first four command-line arguments.
if ARGV.size > 3
  email, password, course, section_arg = ARGV
  # String#to_i would silently turn a mistyped section into 0 and download the
  # wrong section; Integer() rejects anything that is not a decimal number.
  week = begin
    Integer(section_arg, 10)
  rescue ArgumentError
    nil
  end

  if week.nil? || week < 0
    STDERR.puts "Section must be a non-negative integer, got #{section_arg.inspect}."
  else
    download_section(email, password, course, week)
  end
else
  STDERR.puts "Please provide email, password, course name and week as first, second, third and fourth argument respectively."
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment