Skip to content

Instantly share code, notes, and snippets.

@rjof
Created January 24, 2016 02:32
Show Gist options
  • Save rjof/b94fb1ca4aa285c3d452 to your computer and use it in GitHub Desktop.
Save rjof/b94fb1ca4aa285c3d452 to your computer and use it in GitHub Desktop.
udemy.com video extraction tool. Works only if you have bought the course and have obtained a valid cookie.
import re
import requests
import json
import urlparse
from sys import stderr
def connect(url, timeout=30):
    """Fetch *url* using the browser-like headers and login cookies below.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up; handed
            unchanged to ``requests.get``.

    Returns:
        The raw response body (``r.content``). A non-OK status code is
        reported on stderr, but the body is still returned to the caller.

    Raises:
        SystemExit: With exit status 1 when the request itself fails.
    """
    headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:30.0)"
                             " Gecko/20100101 Firefox/30.0"}

    # Enter your cookie information below.
    # Depending on whether you use Google or Facebook to sign in, you might
    # have to add certain key: value pairs.
    # Example of a Google login cookie:
    auth_cookies = {
        "PHPSESSID": "",
        "client_id": "",
        "locale": "en_US",
        "__udmyvstr": "",
        "__udmyvst": "",
        "lastLoggedInDate": "",
        "GCSCE_0000000000000000000000000000000000000000_S3":
            "C=000000000000-00000000000000000000000000000000.apps.googl"
            "eusercontent.com:S=0000000000000000000000000000000000000000"
            "..c000:I=0000000000:X=0000000000",
        "G_AUTHUSER_S3": "0",
        "G_USERSTATE_S3": "113824146029390853931=1",
        "access_token": "0000000000000000000000000000000000000000",
    }

    try:
        # Without a timeout an unresponsive server would block forever.
        r = requests.get(url, headers=headers, cookies=auth_cookies,
                         timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Format the exception itself: ``e.message`` is not reliably present.
        stderr.write("Failed to connect to website. {0}\n".format(e))
        stderr.flush()
        # Same effect as exit(1), without relying on the ``site`` helper.
        raise SystemExit(1)

    if r.status_code != requests.codes.ok:
        # Only a warning: the (error) body is still returned below.
        stderr.write("Failed to establish connection to {0}! (Status code {1})\n"
                     .format(url, r.status_code))
    return r.content
def _best_quality_link(embed_source, qualities=("720p", "460p", "360p")):
    """Return the download link for the best available quality, or None.

    Looks for the ``[{"sources" ...}]`` blob in *embed_source*, parses it as
    JSON and picks the first label from *qualities* (ordered best first)
    that one of the listed sources carries. Returns None when the blob is
    missing, cannot be parsed, or offers none of the wanted labels.
    """
    match = re.search(r'\[{"sources".*\}]', embed_source)
    if not match:
        return None
    try:
        # Drop the enclosing "[" and "]" so a single JSON object remains.
        video_blob = json.loads(match.group()[1:-1])
    except ValueError:
        # The greedy pattern may capture more than the JSON blob.
        return None
    if not isinstance(video_blob, dict):
        return None

    # Index sources by label; the first occurrence of a label wins.
    by_label = {}
    for source in video_blob.get("sources") or []:
        by_label.setdefault(source.get("label"), source)

    # Walk the preference order so the best quality is chosen, not merely
    # the first source that happens to carry any accepted label.
    for q in qualities:
        if q in by_label:
            return "{0}&filedownload=attachment".format(by_label[q].get("file"))
    return None


def get_lecture_links(url):
    """Print the sections and lectures of a course with their file links.

    Fetches the course page at *url*, derives the course id from it, loads
    the curriculum from the API and prints one entry per item to stdout.
    Lectures without an id are reported on stderr instead.

    Args:
        url: Address of the course page.
    """
    # Obtain the course identifier and build the API link.
    content = connect(url)
    course_id = get_course_id(content)
    api_url = "https://www.udemy.com/api-1.1/courses/{0}/curriculum".format(course_id)
    jdata = json.loads(connect(api_url))

    section = 1
    chapter = 1
    for j in jdata:
        type_text = j.get("typeText")  # Can be either a section or a lecture.
        title = j.get("title", "N/A")  # The title of the lecture/section.
        asset = j.get("asset")  # Video link, extra downloads, etc.

        if not asset:
            # No asset in the data blob: it is most likely a section.
            print("Section {0}.\t{1}\n".format(section, title))
            section += 1
            continue

        # Obtain the file title of the current data blob.
        file_title = asset.get("title", "N/A")

        # Udemy only stores a few video links in the JSON file (handled by
        # the ``if`` branch); the others must be obtained by visiting each
        # video's embed page and extracting its location (``else`` branch).
        if "downloadUrl" in asset:
            link = asset["downloadUrl"].get("download", "Not found")
        else:
            link = "Not found"
            lecture_id = j.get("id")
            if not lecture_id:
                # Every data blob should have the id key, but just in case
                # it can't be found, report it and move on.
                stderr.write("Chapter {0}.\t{1}\nFile name: {2}\nLink: {3} ---- FAILED\n"
                             .format(chapter, title, file_title, link))
                stderr.flush()
                continue

            # Build the embed link and fetch its source, which contains the
            # file link we are looking for.
            embed_url = "https://www.udemy.com/embed/{0}/".format(lecture_id)
            embed_source = connect(embed_url)
            link = _best_quality_link(embed_source) or link

        print("Chapter {0}.\t{1}\nFile name: {2}\nLink: {3}\n"
              .format(chapter, title, file_title, link))

        if type_text == "Section":
            section += 1
        elif type_text == "Lecture":
            chapter += 1
def get_course_id(source):
    """Return the course identifier found in *source*.

    The id is the run of digits in the first
    ``/courses/<digits>/visible-instructors`` occurrence.

    Args:
        source: Text of the course page.

    Returns:
        The course id as a string of digits.

    Raises:
        SystemExit: With exit status 1 (after a message on stderr) when no
            such path is present in *source*.
    """
    # Raw string so that ``\d`` reaches the regex engine untouched.
    match = re.search(r"/courses/(\d+)/visible-instructors", source)
    if match is None:
        stderr.write("Failed to obtain the course identifier, without this id"
                     " we can't continue execution.\n")
        stderr.flush()
        # Same effect as exit(1), without relying on the ``site`` helper.
        raise SystemExit(1)
    return match.group(1)
if __name__ == "__main__":
    # Script entry point: list the lecture links of this example course.
    course_url = "https://www.udemy.com/learn-the-basics-of-ethical-hacking-and-penetration-testing/"
    get_lecture_links(course_url)
@hieutrung107
Copy link

How do I get the key id and the id of a video?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment