rjof · January 24, 2016 02:32 · hieutrung107 · Oct 16, 2023
diff --git a/udemy.py b/udemy.py
 import re
 import requests
 import json
 import urlparse
 from sys import stderr


 def connect(url, timeout=30):
    """ Fetch the passed URL and return the raw response body.

    The request is sent with a desktop Firefox User-Agent and the
    authentication cookies configured below.

    Args:
        url: Address to request.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``
            argument, so that a stalled server cannot block the script
            forever.

    Returns:
        ``r.content`` of the response.  A non-OK status code is reported on
        stderr, but the body is still returned to the caller.

    Raises:
        SystemExit: with status 1 when requests raises a RequestException.
    """
    headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:30.0)"
                             " Gecko/20100101 Firefox/30.0"}

    # Enter your cookie information below.
    # Depending on whether you use Google or Facebook to sign in, you might
    # have to add certain key: value pairs.

    # Example of a Google login cookie:
    auth_cookies = {
        "PHPSESSID": "",
        "client_id": "",
        "locale": "en_US",
        # NOTE(review): this key starts with a space - confirm it is intended.
        " __udmyvstr": "",
        "__udmyvst": "",
        "lastLoggedInDate": "",
        "GCSCE_0000000000000000000000000000000000000000_S3":
            "C=000000000000-00000000000000000000000000000000.apps.googl"
            "eusercontent.com:S=0000000000000000000000000000000000000000"
            "..c000:I=0000000000:X=0000000000",
        "G_AUTHUSER_S3": "0",
        "G_USERSTATE_S3": "113824146029390853931=1",
        "access_token": "0000000000000000000000000000000000000000",
    }
    try:
        r = requests.get(url, headers=headers, cookies=auth_cookies,
                         timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Format the exception itself: the ``message`` attribute is deprecated
        # on Python 2.6+ and does not exist on Python 3.
        stderr.write("Failed to connect to website. {0}\n".format(e))
        stderr.flush()
        # SystemExit is what the interactive ``exit`` helper raises, without
        # depending on the site module having installed that helper.
        raise SystemExit(1)

    if r.status_code != requests.codes.ok:
        # Best effort: report the problem but still hand back the body.
        stderr.write("Failed to establish connection to {0}! (Status code {1})\n"
                     .format(url, r.status_code))
        stderr.flush()
    return r.content

 def get_lecture_links(url):
    """ Print the title and file link of every item in a course curriculum.

    The course page at *url* is fetched to obtain the course identifier, the
    curriculum is then requested from the api-1.1 endpoint and walked item by
    item.  Items without an asset are printed as sections.  For the others
    the link is taken from the asset's ``downloadUrl`` when present,
    otherwise it is extracted from the lecture's embed page.

    Args:
        url: Address of the course landing page.

    Returns:
        None.  Results are printed to stdout; items without an ``id`` are
        reported on stderr and skipped.
    """
    # obtain the course identifier and build the api link
    content = connect(url)
    course_id = get_course_id(content)
    api_url = "https://www.udemy.com/api-1.1/courses/{0}/curriculum".format(course_id)
    jdata = json.loads(connect(api_url))

    # Accepted source labels, most preferred first.
    quality = ("720p", "460p", "360p")

    section = 1
    chapter = 1
    for j in jdata:
        type_text = j.get("typeText")  # Can be either a section or a lecture.
        title = j.get("title", "N/A")  # The title of the lecture/section.
        asset = j.get("asset")  # Stores information like the video link, extra downloads, etc.
        if not asset:
            # When no asset is found in the data blob it's most likely a section.
            # A single parenthesised argument prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print("Section {0}.\t{1}\n".format(section, title))
            section += 1
            continue

        # obtain the file title of the current data blob
        file_title = asset.get("title", "N/A")

        # Udemy only stores a few video links in the json file (handled by the
        # if branch); the other videos must be obtained by visiting each
        # lecture's embed page and extracting the location (else branch).
        if "downloadUrl" in asset:
            link = asset["downloadUrl"].get("download", "Not found")
        else:
            link = "Not found"
            lecture_id = j.get("id")
            if not lecture_id:
                # Every data blob should have the id key, but just in case it
                # can't be found, report the item and move on.
                stderr.write("Chapter {0}.\t{1}\nFile name: {2}\nLink: {3} ---- FAILED\n"
                             .format(chapter, title, file_title, link))
                stderr.flush()
                continue
            # produce the embed link and obtain the source, which contains the
            # file link we are looking for.
            embed_url = "https://www.udemy.com/embed/{0}/".format(lecture_id)
            embed_source = connect(embed_url)
            match = re.search(r'\[{"sources".*\}]', embed_source)
            if match:
                # Strip the enclosing brackets so a single JSON object remains.
                video_blob = json.loads(match.group()[1:-1])
                # Index the sources by label, keeping the first one per label.
                by_label = {}
                for source in video_blob.get("sources") or []:
                    by_label.setdefault(source.get("label"), source)
                # Walk the labels in preference order so the best available
                # quality wins regardless of the order the sources are listed.
                for q in quality:
                    if q in by_label:
                        link = "{0}&filedownload=attachment".format(by_label[q].get("file"))
                        break

        print("Chapter {0}.\t{1}\nFile name: {2}\nLink: {3}\n"
              .format(chapter, title, file_title, link))

        if type_text == "Section":
            section += 1
        elif type_text == "Lecture":
            chapter += 1

 def get_course_id(source):
    """ Obtain the course identifier, which is required to access the file links.

    Args:
        source: Text of the course page; it is searched for the first
            ``/courses/<digits>/visible-instructors`` occurrence.

    Returns:
        The digits of the first occurrence, as a string.

    Raises:
        SystemExit: with status 1, after writing a message to stderr, when
            the pattern is not present in *source*.
    """
    # Raw string keeps ``\d`` a regex escape instead of a string escape.
    match = re.search(r"/courses/(\d+)/visible-instructors", source)
    if match is None:
        stderr.write("Failed to obtain the course identifier, without this id"
                     " we can't continue execution.\n")
        stderr.flush()
        raise SystemExit(1)
    return match.group(1)


 if __name__ == "__main__":
    import sys

    # The course URL may be given as the first command line argument; without
    # one the script falls back to the course it was originally written for.
    default_url = "https://www.udemy.com/learn-the-basics-of-ethical-hacking-and-penetration-testing/"
    get_lecture_links(sys.argv[1] if len(sys.argv) > 1 else default_url)
	import re
	import requests
	import json
	import urlparse
	from sys import stderr


	def connect(url, timeout=30):
		""" Fetch the passed URL and return the raw response body.

		The request is sent with a desktop Firefox User-Agent and the
		authentication cookies configured below.

		Args:
			url: Address to request.
			timeout: Value forwarded to ``requests.get`` as its ``timeout``
				argument, so that a stalled server cannot block the script
				forever.

		Returns:
			``r.content`` of the response.  A non-OK status code is reported
			on stderr, but the body is still returned to the caller.

		Raises:
			SystemExit: with status 1 when requests raises a RequestException.
		"""
		headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:30.0)"
			" Gecko/20100101 Firefox/30.0"}

		# Enter your cookie information below.
		# Depending on whether you use Google or Facebook to sign in, you might
		# have to add certain key: value pairs.

		# Example of a Google login cookie:
		auth_cookies = {
			"PHPSESSID": "",
			"client_id": "",
			"locale": "en_US",
			# NOTE(review): this key starts with a space - confirm it is intended.
			" __udmyvstr": "",
			"__udmyvst": "",
			"lastLoggedInDate": "",
			"GCSCE_0000000000000000000000000000000000000000_S3":
				"C=000000000000-00000000000000000000000000000000.apps.googl"
				"eusercontent.com:S=0000000000000000000000000000000000000000"
				"..c000:I=0000000000:X=0000000000",
			"G_AUTHUSER_S3": "0",
			"G_USERSTATE_S3": "113824146029390853931=1",
			"access_token": "0000000000000000000000000000000000000000",
		}
		try:
			r = requests.get(url, headers=headers, cookies=auth_cookies,
				timeout=timeout)
		except requests.exceptions.RequestException as e:
			# Format the exception itself: the ``message`` attribute is
			# deprecated on Python 2.6+ and does not exist on Python 3.
			stderr.write("Failed to connect to website. {0}\n".format(e))
			stderr.flush()
			# SystemExit is what the interactive ``exit`` helper raises, without
			# depending on the site module having installed that helper.
			raise SystemExit(1)

		if r.status_code != requests.codes.ok:
			# Best effort: report the problem but still hand back the body.
			stderr.write("Failed to establish connection to {0}! (Status code {1})\n"
				.format(url, r.status_code))
			stderr.flush()
		return r.content

	def get_lecture_links(url):
		""" Print the title and file link of every item in a course curriculum.

		The course page at *url* is fetched to obtain the course identifier,
		the curriculum is then requested from the api-1.1 endpoint and walked
		item by item.  Items without an asset are printed as sections.  For
		the others the link is taken from the asset's ``downloadUrl`` when
		present, otherwise it is extracted from the lecture's embed page.

		Args:
			url: Address of the course landing page.

		Returns:
			None.  Results are printed to stdout; items without an ``id`` are
			reported on stderr and skipped.
		"""
		# obtain the course identifier and build the api link
		content = connect(url)
		course_id = get_course_id(content)
		api_url = "https://www.udemy.com/api-1.1/courses/{0}/curriculum".format(course_id)
		jdata = json.loads(connect(api_url))

		# Accepted source labels, most preferred first.
		quality = ("720p", "460p", "360p")

		section = 1
		chapter = 1
		for j in jdata:
			type_text = j.get("typeText")  # Can be either a section or a lecture.
			title = j.get("title", "N/A")  # The title of the lecture/section.
			asset = j.get("asset")  # Stores information like the video link, extra downloads, etc.
			if not asset:
				# When no asset is found in the data blob it's most likely a section.
				# A single parenthesised argument prints the same text under the
				# Python 2 print statement and the Python 3 print function.
				print("Section {0}.\t{1}\n".format(section, title))
				section += 1
				continue

			# obtain the file title of the current data blob
			file_title = asset.get("title", "N/A")

			# Udemy only stores a few video links in the json file (handled by
			# the if branch); the other videos must be obtained by visiting each
			# lecture's embed page and extracting the location (else branch).
			if "downloadUrl" in asset:
				link = asset["downloadUrl"].get("download", "Not found")
			else:
				link = "Not found"
				lecture_id = j.get("id")
				if not lecture_id:
					# Every data blob should have the id key, but just in case
					# it can't be found, report the item and move on.
					stderr.write("Chapter {0}.\t{1}\nFile name: {2}\nLink: {3} ---- FAILED\n"
						.format(chapter, title, file_title, link))
					stderr.flush()
					continue
				# produce the embed link and obtain the source, which contains
				# the file link we are looking for.
				embed_url = "https://www.udemy.com/embed/{0}/".format(lecture_id)
				embed_source = connect(embed_url)
				match = re.search(r'\[{"sources".*\}]', embed_source)
				if match:
					# Strip the enclosing brackets so a single JSON object remains.
					video_blob = json.loads(match.group()[1:-1])
					# Index the sources by label, keeping the first one per label.
					by_label = {}
					for source in video_blob.get("sources") or []:
						by_label.setdefault(source.get("label"), source)
					# Walk the labels in preference order so the best available
					# quality wins regardless of the order the sources are listed.
					for q in quality:
						if q in by_label:
							link = "{0}&filedownload=attachment".format(by_label[q].get("file"))
							break

			print("Chapter {0}.\t{1}\nFile name: {2}\nLink: {3}\n"
				.format(chapter, title, file_title, link))

			if type_text == "Section":
				section += 1
			elif type_text == "Lecture":
				chapter += 1

	def get_course_id(source):
		""" Obtain the course identifier, which is required to access the file links.

		Args:
			source: Text of the course page; it is searched for the first
				``/courses/<digits>/visible-instructors`` occurrence.

		Returns:
			The digits of the first occurrence, as a string.

		Raises:
			SystemExit: with status 1, after writing a message to stderr,
				when the pattern is not present in *source*.
		"""
		# Raw string keeps ``\d`` a regex escape instead of a string escape.
		match = re.search(r"/courses/(\d+)/visible-instructors", source)
		if match is None:
			stderr.write("Failed to obtain the course identifier, without this id"
				" we can't continue execution.\n")
			stderr.flush()
			raise SystemExit(1)
		return match.group(1)


	if __name__ == "__main__":
		import sys

		# The course URL may be given as the first command line argument;
		# without one the script falls back to the course it was originally
		# written for.
		default_url = "https://www.udemy.com/learn-the-basics-of-ethical-hacking-and-penetration-testing/"
		get_lecture_links(sys.argv[1] if len(sys.argv) > 1 else default_url)