Created
January 24, 2016 02:32
-
-
Save rjof/b94fb1ca4aa285c3d452 to your computer and use it in GitHub Desktop.
udemy.com video extraction tool. It only works if you have purchased the course and have obtained a valid cookie.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import requests | |
import json | |
import urlparse | |
from sys import stderr | |
def connect(url, timeout=30):
    """Fetch ``url`` with the configured auth cookies and return its body.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. A timeout
            is handled like any other request failure.

    Returns:
        The raw response body (``r.content``). A non-OK status code is
        reported on stderr, but the body is still returned to the caller.

    Raises:
        SystemExit: With exit code 1 when the request itself fails.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:30.0)"
                      " Gecko/20100101 Firefox/30.0",
    }

    # Enter your cookie information below.
    # Depending on whether you use Google or Facebook to sign in, you might
    # have to add certain key: value pairs.
    # Example of a Google login cookie:
    auth_cookies = {
        "PHPSESSID": "",
        "client_id": "",
        "locale": "en_US",
        "__udmyvstr": "",
        "__udmyvst": "",
        "lastLoggedInDate": "",
        "GCSCE_0000000000000000000000000000000000000000_S3":
            "C=000000000000-00000000000000000000000000000000.apps.googl"
            "eusercontent.com:S=0000000000000000000000000000000000000000"
            "..c000:I=0000000000:X=0000000000",
        "G_AUTHUSER_S3": "0",
        "G_USERSTATE_S3": "113824146029390853931=1",
        "access_token": "0000000000000000000000000000000000000000",
    }

    # Only the request itself sits inside the try block, so unrelated errors
    # are never mistaken for connection failures.
    try:
        r = requests.get(url, headers=headers, cookies=auth_cookies,
                         timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Format the exception itself: ``e.message`` is not available on
        # every exception and would raise AttributeError while reporting.
        stderr.write("Failed to connect to website. {0}\n".format(e))
        stderr.flush()
        # SystemExit does not depend on the ``exit`` helper that the site
        # module injects.
        raise SystemExit(1)

    if r.status_code != requests.codes.ok:
        stderr.write("Failed to establish connection to {0}! (Status code {1})\n"
                     .format(url, r.status_code))
        stderr.flush()

    return r.content
def _best_quality_link(sources, preferred=("720p", "460p", "360p")):
    """Return the download link for the most preferred quality, or None.

    ``sources`` is the list of source dicts taken from the embed page; each
    one is read for its "label" and "file" keys. ``preferred`` lists the
    accepted labels, best first.
    """
    files_by_label = {}
    for source in sources or ():
        label = source.get("label")
        file_url = source.get("file")
        # Skip entries without a file so no "None&..." link is ever built;
        # keep the first file seen for each label.
        if file_url and label not in files_by_label:
            files_by_label[label] = file_url

    # Walk the labels in preference order so the best available one wins,
    # regardless of the order in which the page lists its sources.
    for label in preferred:
        if label in files_by_label:
            return "{0}&filedownload=attachment".format(files_by_label[label])
    return None


def _extract_embed_link(embed_source):
    """Return the best-quality video link found in an embed page, or None."""
    match = re.search(r'\[{"sources".*\}]', embed_source)
    if not match:
        return None
    try:
        # Strip the surrounding "[" and "]" so the remaining text is the
        # single JSON object that holds the "sources" list.
        video_blob = json.loads(match.group()[1:-1])
    except ValueError as e:
        stderr.write("Failed to parse the embedded video data. {0}\n".format(e))
        stderr.flush()
        return None
    return _best_quality_link(video_blob.get("sources"))


def get_lecture_links(url):
    """Print the title, file name and download link of every lecture.

    The course page at ``url`` is fetched to obtain the course identifier,
    then the curriculum API is queried. Each curriculum entry is printed as
    either a section header or a chapter with its link. When the curriculum
    data holds no download URL for an entry, the lecture's embed page is
    fetched and searched for the video sources instead.

    Args:
        url: Address of the course landing page.

    Returns:
        None. Results go to stdout; entries without an id go to stderr.
    """
    # Obtain the course identifier and build the API link.
    content = connect(url)
    course_id = get_course_id(content)
    api_url = "https://www.udemy.com/api-1.1/courses/{0}/curriculum".format(course_id)
    jdata = json.loads(connect(api_url))

    section = 1
    chapter = 1
    for j in jdata:
        type_text = j.get("typeText")   # Can be either a section or a lecture.
        title = j.get("title", "N/A")   # The title of the lecture/section.
        asset = j.get("asset")          # Video link, extra downloads, etc.

        # Without an asset key the entry is most likely a section.
        if not asset:
            print("Section {0}.\t{1}\n".format(section, title))
            section += 1
            continue

        # Obtain the file title of the current entry.
        file_title = asset.get("title", "N/A")

        # Only a few video links are stored in the curriculum data itself;
        # the rest must be found by visiting each lecture's embed page.
        if "downloadUrl" in asset:
            link = asset["downloadUrl"].get("download", "Not found")
        else:
            link = "Not found"
            lecture_id = j.get("id")
            if not lecture_id:
                # Every entry should have an id; report the ones that
                # do not and move on.
                stderr.write("Chapter {0}.\t{1}\nFile name: {2}\nLink: {3} ---- FAILED\n"
                             .format(chapter, title, file_title, link))
                stderr.flush()
                continue

            # Build the embed link and fetch its source, which contains the
            # file link we are looking for.
            embed_url = "https://www.udemy.com/embed/{0}/".format(lecture_id)
            embed_source = connect(embed_url)
            link = _extract_embed_link(embed_source) or link

        print("Chapter {0}.\t{1}\nFile name: {2}\nLink: {3}\n"
              .format(chapter, title, file_title, link))

        if type_text == "Section":
            section += 1
        elif type_text == "Lecture":
            chapter += 1
def get_course_id(source):
    """Return the course identifier found in a course page.

    The identifier is the run of digits in the first
    ``/courses/<digits>/visible-instructors`` path that appears in
    ``source``.

    Args:
        source: Text of the course page to search.

    Returns:
        The identifier as a string of digits.

    Raises:
        SystemExit: With exit code 1 when no identifier is found, because
            the file links cannot be reached without it.
    """
    match = re.search(r"/courses/(\d+)/visible-instructors", source)
    if match is None:
        stderr.write("Failed to obtain the course identifier, without this id"
                     " we can't continue execution.\n")
        stderr.flush()
        # SystemExit does not depend on the ``exit`` helper that the site
        # module injects.
        raise SystemExit(1)
    return match.group(1)
if __name__ == "__main__":
    import sys

    # Use the course URL given on the command line when there is one;
    # otherwise fall back to the example course, so running the script
    # without arguments behaves as it always has.
    default_url = "https://www.udemy.com/learn-the-basics-of-ethical-hacking-and-penetration-testing/"
    get_lecture_links(sys.argv[1] if len(sys.argv) > 1 else default_url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
How do I get the key ID and the ID of a video?