urielm · July 18, 2013 21:33
diff --git a/gdc-downloader.py b/gdc-downloader.py
 # GDC Vault videos can't be watched on mobile devices and this is a very sad thing indeed!
 # This script is designed to circumvent this by downloading the lecture and slideshow
 # videos which can then be re-encoded into whatever format you wish.

 # Note: this code is rather flimsy and was written as fast as possible for my own personal use.
 # The code only works for the most recent GDC Vault videos, since they all use the same player
 # format. If the XML format used to run the player is changed (as it has in the past), the code
 # will have to be reconfigured. In the past, I was able to feed a wget-compatible cookies.txt
 # file into the wget call, but I can't get it to trigger anymore. So for now, the way I download
 # each video is I look at the source for the player page, find the player.html URL, and feed
 # it into the script with the -f flag. Ugly and slow, but hey, it works.

 # I generally hate reinventing the wheel and it does look like youtube-dl does some of the same
 # stuff I'm doing, but I couldn't get it to work with the GDC URLs. So off to Python land we go!!!

 # Usage is as follows:
 #
 #   With cookies.txt: gdc-downloader.py "[GDC video URL]" [output dir]
 #   Without cookies.txt: gdc-downloader.py -f "[GDC player.html URL]" [output dir]
 #
 # A GDC video URL looks like this:
 #   http://www.gdcvault.com/play/1015662/Creative-Panic-How-Agility-Turned
 #
 # A GDC player.html URL looks like this:
 #   http://evt.dispeak.com/ubm/gdc/sf12/player.html?xmlURL=xml/201203238_1331124629609NXXJ.xml&token=1234567890
 #
 # The output dir should be the name of your video. For example, supplying TestDir/GDCVid will create
 # TestDir/GDCVid/GDCVid.xml, TestDir/GDCVid/GDCVid-slide.flv, etc.

 # You need to have wget and rtmpdump installed in order for this script to work. I recommend macports.

 #############
 # Constants #
 #############

 # Cookie jar handed to wget via --load-cookies.
 cookies_filename = "cookies.txt"

 # Both player expressions expose the same capture groups:
 #   1 = base URL up to (not including) "player.html"
 #   2 = the literal "player.html"
 #   3 = everything after "player.html" (the query string)
 #   4 = the value of the xmlURL parameter
 player_regular_expression = r"^.*\"(.*?)(player\.html)(.*?xmlURL=(.*?)[&].*?)\".*$"
 # The xmlURL value runs up to the next "&" or to the end of the URL, so a
 # player URL whose last parameter is xmlURL is accepted as well.
 player_regular_expression_force = r"^(.*?)(player\.html)(.*?xmlURL=([^&]*).*?)$" # same as above but parses URL directly
 # Matches a page that contains a quoted string mentioning login.php.
 login_regular_expression = r"^.*\"(.*?login\.php.*?)\".*$"
 # Group 1 is the quoted value following "src" after the player page's SWF embed comment.
 swf_name_regular_expression = r"^.*embed the Flash Content SWF when all tests are passed.*?\"src\".*?\"(.*?)\".*$"

 # DEPRECATED: This URL was retrieved from the player SWF, and may change in the future.
 # rtmp_url = "rtmp://fms.digitallyspeaking.com/cfx/st/ondemand/fcs/ident"

 ########
 # Code #
 ########

 import sys
 import os
 import subprocess
 import re
 from xml.dom import minidom

 def error(message):
    """Print an error line tagged with the script name, then terminate.

    message -- text appended to the "[gdc-downloader] Error: " prefix.

    Never returns: sys.exit(1) raises SystemExit with exit status 1.
    """
    # A parenthesised single expression prints the same text whether this is
    # parsed as a Python 2 print statement or a Python 3 print() call.
    print("[gdc-downloader] Error: " + message)
    sys.exit(1)

 def message(msg):
    """Print an informational line tagged with the script name.

    msg -- text appended to the "[gdc-downloader] Message: " prefix.
    """
    # A parenthesised single expression prints the same text whether this is
    # parsed as a Python 2 print statement or a Python 3 print() call.
    print("[gdc-downloader] Message: " + msg)

 def check_dependencies(force):
    """Abort early when something the requested mode needs is missing.

    force -- when true the cookie file check is skipped.

    Calls error() (which exits the process) if the cookie file named by
    cookies_filename cannot be opened.
    """
    if not force:
        try:
            # Opening the file, rather than testing for existence, also
            # proves it is readable; the handle is released immediately.
            with open(cookies_filename):
                pass
        except (IOError, OSError):
            error("cookies not found in " + cookies_filename)

    # TODO: check wget, rtmpdump

 def dump_to_file(data, dest):
    """Write data to the file dest, creating the parent directory if needed.

    data -- text or byte string to store.
    dest -- path of the file to (over)write.

    Raises OSError/IOError if the directory or the file cannot be created.
    """
    dest_dir = os.path.abspath(os.path.split(dest)[0])
    try:
        os.makedirs(dest_dir)
    except OSError:
        # An already existing directory is fine; anything else is a real error.
        if not os.path.isdir(dest_dir):
            raise

    # Byte strings that are not also str (Python 3 bytes, e.g. raw subprocess
    # output) need a binary handle; on Python 2 bytes is str, so "w" is kept.
    mode = "w" if isinstance(data, str) else "wb"
    with open(dest, mode) as debug_file:
        debug_file.write(data)

 def download_url(url):
    """Fetch url with wget and return what wget wrote to standard output.

    url -- address to download.

    The cookie file named by cookies_filename is passed to wget through
    --load-cookies. Calls error() (which exits the process) when wget cannot
    be started or finishes with a non-zero exit status.
    """
    args = ["wget", "-qO-", "--load-cookies", cookies_filename, url]

    try:
        process = subprocess.Popen(args, stdout=subprocess.PIPE)
    except OSError:
        error("wget error with url " + url)

    output = process.communicate()[0]

    # Without this check a failed download is only noticed later, as a
    # misleading "not found" or XML parsing failure on empty data.
    if process.returncode != 0:
        error("wget error with url " + url)

    return output

 def retrieve_data_from_base_url(url, force, xml_dest):
    """Work out the player, SWF and XML URLs for a talk.

    url      -- the video page URL, or the player.html URL when force is true.
    force    -- true to parse url itself instead of downloading the page.
    xml_dest -- file that receives the text the player URL was searched in.

    Returns a dict with the keys "player_url", "swf_url" and "xml_url".
    Calls error() (which exits the process) when a login page was downloaded
    or when the player URL or SWF name cannot be found.
    """
    if force:
        # The player URL was supplied directly, so it is the text to parse.
        html = url
        player_results = re.match(player_regular_expression_force, html)
    else:
        # I haven't tested this code in a while, so it might not work
        html = download_url(url).replace('\n', '').replace('\r', '')

        if re.match(login_regular_expression, html):
            error("downloaded login page -- check your cookies")

        player_results = re.match(player_regular_expression, html)

    if not player_results:
        error("player URL not found")

    dump_to_file(html, xml_dest)

    base_url, player_url, player_arguments, xml_url = player_results.group(1, 2, 3, 4)
    full_player_url = base_url + player_url
    full_xml_url = base_url + xml_url

    message("player url is " + full_player_url)
    message("player arguments are " + player_arguments)
    message("xml url is " + full_xml_url)

    # The player page itself names the SWF; line breaks are removed so the
    # single-line expression can span the whole document.
    player_html = download_url(full_player_url)
    player_html = player_html.replace('\n', '').replace('\r', '')

    swf_name_results = re.match(swf_name_regular_expression, player_html)
    if not swf_name_results:
        error("SWF URL not found")

    swf_url = base_url + swf_name_results.group(1) + ".swf"
    message("swf url is " + swf_url)

    return {
        "player_url": full_player_url + player_arguments,
        "swf_url": swf_url,
        "xml_url": full_xml_url,
    }

 def parse_xml_from_url(url, xml_dest):
    """Download the player XML and pull out the stream information.

    url      -- address of the XML file.
    xml_dest -- file that receives a copy of the downloaded XML.

    Returns a dict with the keys "akamai" (the rtmp:// URL built from the
    akamaiHost element), "speaker" and "slide" (video names with ".flv"
    removed) and "audio" (a dict mapping each audio code to its url, also
    with ".flv" removed).

    Calls error() (which exits the process) when the akamaiHost,
    speakerVideo or slideVideo element is missing or has no content.
    """
    xml = download_url(url)

    dump_to_file(xml, xml_dest)

    parsed_xml = minidom.parseString(xml)

    def first_text(tag_name):
        # Value of the first child of the first <tag_name> element, or None
        # when the element is absent or empty.
        nodes = parsed_xml.getElementsByTagName(tag_name)
        if not nodes or nodes[0].firstChild is None:
            return None
        return nodes[0].firstChild.nodeValue

    host = first_text("akamaiHost")
    speaker = first_text("speakerVideo")
    slide = first_text("slideVideo")

    if not host or not speaker or not slide:
        error("xml missing properties")

    akamai_host = "rtmp://" + host + "/fcs/ident"
    speaker_video = speaker.replace(".flv", "")
    slide_video = slide.replace(".flv", "")

    message("akamai host is " + akamai_host)
    message("speaker video is " + speaker_video)
    message("slide video is " + slide_video)

    # some of the xml files contain extra audio tracks; we want those, don't we?
    audio_metadata = {}
    audios = parsed_xml.getElementsByTagName("audios")
    if audios:
        for audio_node in audios[0].getElementsByTagName("audio"):
            # getAttribute returns "" for a missing attribute.
            code = audio_node.getAttribute("code")
            audio_url = audio_node.getAttribute("url").replace(".flv", "")
            # A track without both a code and a url cannot be downloaded.
            if not code or not audio_url:
                continue
            audio_metadata[code] = audio_url
            message("audio " + code + " is " + audio_url)

    return {
        "akamai": akamai_host,
        "speaker": speaker_video,
        "slide": slide_video,
        "audio": audio_metadata,
    }

 def download_video(rtmp, playpath, swf_url, page_url, filename):
    """Run rtmpdump to save one stream to filename.

    rtmp, playpath, swf_url, page_url and filename are passed unchanged as
    the values of rtmpdump's --rtmp, --playpath, --swfUrl, --pageUrl and
    --flv options.

    Returns None. Calls error() (which exits the process) when rtmpdump
    cannot be started; a non-zero exit status is reported with message() so
    the remaining downloads still run.
    """
    args = ["rtmpdump", "--rtmp", rtmp, "--playpath", playpath, "--swfUrl", swf_url, "--pageUrl", page_url, "--flv", filename]

    try:
        status = subprocess.call(args, stdin=None)
    except OSError:
        error("rtmpdump error")

    if status != 0:
        message("rtmpdump exited with status " + str(status) + " for " + filename)

    return None

 def download_gdc_video_at_url(url, dest="", force=False):
    """Download the slide, speaker and extra audio streams of one talk.

    url   -- the video page URL, or the player.html URL when force is true.
    dest  -- output directory; its last component is also used as the file
             name prefix ("GDCVideo" when no name can be derived).
    force -- true when url is a player.html URL.

    Files are written inside dest as <name>-player-url.txt, <name>.xml,
    <name>-slide.flv, <name>-speaker.flv and <name>-audio-<code>.flv.
    """
    dest_path = os.path.abspath(dest)
    # abspath has already dropped any trailing separator, so "Dir/Vid/" names
    # the files "Vid" just like "Dir/Vid" does.
    dest_name = os.path.basename(dest_path) if dest else ""
    if not dest_name:
        dest_name = "GDCVideo"

    def output_file(suffix):
        # Full path of an output file sharing the common name prefix.
        return os.path.join(dest_path, dest_name + suffix)

    # Step 0: Check dependencies.
    check_dependencies(force)

    # Step 1: Extract the following from the URL: player URL, SWF URL, XML URL.
    data = retrieve_data_from_base_url(url, force, output_file("-player-url.txt"))

    # Step 2: Parse the XML and extract the speaker video URL, slide video URL, and metadata.
    metadata = parse_xml_from_url(data["xml_url"], output_file(".xml"))

    # Step 3: Download the videos.
    download_video(metadata["akamai"], metadata["slide"], data["swf_url"], data["player_url"], output_file("-slide.flv"))
    download_video(metadata["akamai"], metadata["speaker"], data["swf_url"], data["player_url"], output_file("-speaker.flv"))
    for code in metadata["audio"]:
        download_video(metadata["akamai"], metadata["audio"][code], data["swf_url"], data["player_url"], output_file("-audio-" + code + ".flv"))

    message("All done!")

 def parse_arguments(argv):
    """Split a command line into (url, dest, force).

    argv -- the full argument list, program name first (as in sys.argv).

    Returns None when the number of arguments left after the optional
    leading "-f" flag is not one (URL) or two (URL and output dir).
    """
    force = len(argv) >= 2 and argv[1] == "-f"
    # Counting only what follows the flag means "-f" on its own is rejected
    # instead of indexing past the end of the list.
    positional = argv[2:] if force else argv[1:]
    if len(positional) == 1:
        return positional[0], "", force
    if len(positional) == 2:
        return positional[0], positional[1], force
    return None

 if __name__ == "__main__":
    arguments = parse_arguments(sys.argv)
    if arguments is None:
        error("invalid number of arguments")
    try:
        download_gdc_video_at_url(*arguments)
    except KeyboardInterrupt:
        error("program interrupted")
	# GDC Vault videos can't be watched on mobile devices and this is a very sad thing indeed!
	# This script is designed to circumvent this by downloading the lecture and slideshow
	# videos which can then be re-encoded into whatever format you wish.

	# Note: this code is rather flimsy and was written as fast as possible for my own personal use.
	# The code only works for the most recent GDC Vault videos, since they all use the same player
	# format. If the XML format used to run the player is changed (as it has in the past), the code
	# will have to be reconfigured. In the past, I was able to feed a wget-compatible cookies.txt
	# file into the wget call, but I can't get it to trigger anymore. So for now, the way I download
	# each video is I look at the source for the player page, find the player.html URL, and feed
	# it into the script with the -f flag. Ugly and slow, but hey, it works.

	# I generally hate reinventing the wheel and it does look like youtube-dl does some of the same
	# stuff I'm doing, but I couldn't get it to work with the GDC URLs. So off to Python land we go!!!

	# Usage is as follows:
	#
	# With cookies.txt: gdc-downloader.py "[GDC video URL]" [output dir]
	# Without cookies.txt: gdc-downloader.py -f "[GDC player.html URL]" [output dir]
	#
	# A GDC video URL looks like this:
	# http://www.gdcvault.com/play/1015662/Creative-Panic-How-Agility-Turned
	#
	# A GDC player.html URL looks like this:
	# http://evt.dispeak.com/ubm/gdc/sf12/player.html?xmlURL=xml/201203238_1331124629609NXXJ.xml&token=1234567890
	#
	# The output dir should be the name of your video. For example, supplying TestDir/GDCVid will create
	# TestDir/GDCVid/GDCVid.xml, TestDir/GDCVid/GDCVid-slide.flv, etc.

	# You need to have wget and rtmpdump installed in order for this script to work. I recommend macports.

	#############
	# Constants #
	#############

	# Cookie jar handed to wget via --load-cookies.
	cookies_filename = "cookies.txt"

	# The "*" quantifiers had been stripped from these expressions, leaving
	# patterns that could not match a real page or URL; they are restored here.
	# Both player expressions expose the same capture groups:
	#   1 = base URL up to (not including) "player.html"
	#   2 = the literal "player.html"
	#   3 = everything after "player.html" (the query string)
	#   4 = the value of the xmlURL parameter
	player_regular_expression = r"^.*\"(.*?)(player\.html)(.*?xmlURL=(.*?)[&].*?)\".*$"
	# The xmlURL value runs up to the next "&" or to the end of the URL, so a
	# player URL whose last parameter is xmlURL is accepted as well.
	player_regular_expression_force = r"^(.*?)(player\.html)(.*?xmlURL=([^&]*).*?)$" # same as above but parses URL directly
	# Matches a page that contains a quoted string mentioning login.php.
	login_regular_expression = r"^.*\"(.*?login\.php.*?)\".*$"
	# Group 1 is the quoted value following "src" after the player page's SWF embed comment.
	swf_name_regular_expression = r"^.*embed the Flash Content SWF when all tests are passed.*?\"src\".*?\"(.*?)\".*$"

	# DEPRECATED: This URL was retrieved from the player SWF, and may change in the future.
	# rtmp_url = "rtmp://fms.digitallyspeaking.com/cfx/st/ondemand/fcs/ident"

	########
	# Code #
	########

	import sys
	import os
	import subprocess
	import re
	from xml.dom import minidom

	def error(message):
		"""Print an error line tagged with the script name, then terminate.

		message -- text appended to the "[gdc-downloader] Error: " prefix.

		Never returns: sys.exit(1) raises SystemExit with exit status 1.
		"""
		# A parenthesised single expression prints the same text whether this is
		# parsed as a Python 2 print statement or a Python 3 print() call.
		print("[gdc-downloader] Error: " + message)
		sys.exit(1)

	def message(msg):
		"""Print an informational line tagged with the script name.

		msg -- text appended to the "[gdc-downloader] Message: " prefix.
		"""
		# A parenthesised single expression prints the same text whether this is
		# parsed as a Python 2 print statement or a Python 3 print() call.
		print("[gdc-downloader] Message: " + msg)

	def check_dependencies(force):
		"""Abort early when something the requested mode needs is missing.

		force -- when true the cookie file check is skipped.

		Calls error() (which exits the process) if the cookie file named by
		cookies_filename cannot be opened.
		"""
		if not force:
			try:
				# Opening the file, rather than testing for existence, also
				# proves it is readable; the handle is released immediately.
				with open(cookies_filename):
					pass
			except (IOError, OSError):
				error("cookies not found in " + cookies_filename)

		# TODO: check wget, rtmpdump

	def dump_to_file(data, dest):
		"""Write data to the file dest, creating the parent directory if needed.

		data -- text or byte string to store.
		dest -- path of the file to (over)write.

		Raises OSError/IOError if the directory or the file cannot be created.
		"""
		dest_dir = os.path.abspath(os.path.split(dest)[0])
		try:
			os.makedirs(dest_dir)
		except OSError:
			# An already existing directory is fine; anything else is a real error.
			if not os.path.isdir(dest_dir):
				raise

		# Byte strings that are not also str (Python 3 bytes, e.g. raw subprocess
		# output) need a binary handle; on Python 2 bytes is str, so "w" is kept.
		mode = "w" if isinstance(data, str) else "wb"
		with open(dest, mode) as debug_file:
			debug_file.write(data)

	def download_url(url):
		"""Fetch url with wget and return what wget wrote to standard output.

		url -- address to download.

		The cookie file named by cookies_filename is passed to wget through
		--load-cookies. Calls error() (which exits the process) when wget cannot
		be started or finishes with a non-zero exit status.
		"""
		args = ["wget", "-qO-", "--load-cookies", cookies_filename, url]

		try:
			process = subprocess.Popen(args, stdout=subprocess.PIPE)
		except OSError:
			error("wget error with url " + url)

		output = process.communicate()[0]

		# Without this check a failed download is only noticed later, as a
		# misleading "not found" or XML parsing failure on empty data.
		if process.returncode != 0:
			error("wget error with url " + url)

		return output

	def retrieve_data_from_base_url(url, force, xml_dest):
		"""Work out the player, SWF and XML URLs for a talk.

		url      -- the video page URL, or the player.html URL when force is true.
		force    -- true to parse url itself instead of downloading the page.
		xml_dest -- file that receives the text the player URL was searched in.

		Returns a dict with the keys "player_url", "swf_url" and "xml_url".
		Calls error() (which exits the process) when a login page was downloaded
		or when the player URL or SWF name cannot be found.
		"""
		if force:
			# The player URL was supplied directly, so it is the text to parse.
			html = url
			player_results = re.match(player_regular_expression_force, html)
		else:
			# I haven't tested this code in a while, so it might not work
			html = download_url(url).replace('\n', '').replace('\r', '')

			if re.match(login_regular_expression, html):
				error("downloaded login page -- check your cookies")

			player_results = re.match(player_regular_expression, html)

		if not player_results:
			error("player URL not found")

		dump_to_file(html, xml_dest)

		base_url, player_url, player_arguments, xml_url = player_results.group(1, 2, 3, 4)
		full_player_url = base_url + player_url
		full_xml_url = base_url + xml_url

		message("player url is " + full_player_url)
		message("player arguments are " + player_arguments)
		message("xml url is " + full_xml_url)

		# The player page itself names the SWF; line breaks are removed so the
		# single-line expression can span the whole document.
		player_html = download_url(full_player_url)
		player_html = player_html.replace('\n', '').replace('\r', '')

		swf_name_results = re.match(swf_name_regular_expression, player_html)
		if not swf_name_results:
			error("SWF URL not found")

		swf_url = base_url + swf_name_results.group(1) + ".swf"
		message("swf url is " + swf_url)

		return {
			"player_url": full_player_url + player_arguments,
			"swf_url": swf_url,
			"xml_url": full_xml_url,
		}

	def parse_xml_from_url(url, xml_dest):
		"""Download the player XML and pull out the stream information.

		url      -- address of the XML file.
		xml_dest -- file that receives a copy of the downloaded XML.

		Returns a dict with the keys "akamai" (the rtmp:// URL built from the
		akamaiHost element), "speaker" and "slide" (video names with ".flv"
		removed) and "audio" (a dict mapping each audio code to its url, also
		with ".flv" removed).

		Calls error() (which exits the process) when the akamaiHost,
		speakerVideo or slideVideo element is missing or has no content.
		"""
		xml = download_url(url)

		dump_to_file(xml, xml_dest)

		parsed_xml = minidom.parseString(xml)

		def first_text(tag_name):
			# Value of the first child of the first <tag_name> element, or None
			# when the element is absent or empty.
			nodes = parsed_xml.getElementsByTagName(tag_name)
			if not nodes or nodes[0].firstChild is None:
				return None
			return nodes[0].firstChild.nodeValue

		host = first_text("akamaiHost")
		speaker = first_text("speakerVideo")
		slide = first_text("slideVideo")

		if not host or not speaker or not slide:
			error("xml missing properties")

		akamai_host = "rtmp://" + host + "/fcs/ident"
		speaker_video = speaker.replace(".flv", "")
		slide_video = slide.replace(".flv", "")

		message("akamai host is " + akamai_host)
		message("speaker video is " + speaker_video)
		message("slide video is " + slide_video)

		# some of the xml files contain extra audio tracks; we want those, don't we?
		audio_metadata = {}
		audios = parsed_xml.getElementsByTagName("audios")
		if audios:
			for audio_node in audios[0].getElementsByTagName("audio"):
				# getAttribute returns "" for a missing attribute.
				code = audio_node.getAttribute("code")
				audio_url = audio_node.getAttribute("url").replace(".flv", "")
				# A track without both a code and a url cannot be downloaded.
				if not code or not audio_url:
					continue
				audio_metadata[code] = audio_url
				message("audio " + code + " is " + audio_url)

		return {
			"akamai": akamai_host,
			"speaker": speaker_video,
			"slide": slide_video,
			"audio": audio_metadata,
		}

	def download_video(rtmp, playpath, swf_url, page_url, filename):
		"""Run rtmpdump to save one stream to filename.

		rtmp, playpath, swf_url, page_url and filename are passed unchanged as
		the values of rtmpdump's --rtmp, --playpath, --swfUrl, --pageUrl and
		--flv options.

		Returns None. Calls error() (which exits the process) when rtmpdump
		cannot be started; a non-zero exit status is reported with message() so
		the remaining downloads still run.
		"""
		args = ["rtmpdump", "--rtmp", rtmp, "--playpath", playpath, "--swfUrl", swf_url, "--pageUrl", page_url, "--flv", filename]

		try:
			status = subprocess.call(args, stdin=None)
		except OSError:
			error("rtmpdump error")

		if status != 0:
			message("rtmpdump exited with status " + str(status) + " for " + filename)

		return None

	def download_gdc_video_at_url(url, dest="", force=False):
		"""Download the slide, speaker and extra audio streams of one talk.

		url   -- the video page URL, or the player.html URL when force is true.
		dest  -- output directory; its last component is also used as the file
		         name prefix ("GDCVideo" when no name can be derived).
		force -- true when url is a player.html URL.

		Files are written inside dest as <name>-player-url.txt, <name>.xml,
		<name>-slide.flv, <name>-speaker.flv and <name>-audio-<code>.flv.
		"""
		dest_path = os.path.abspath(dest)
		# abspath has already dropped any trailing separator, so "Dir/Vid/" names
		# the files "Vid" just like "Dir/Vid" does.
		dest_name = os.path.basename(dest_path) if dest else ""
		if not dest_name:
			dest_name = "GDCVideo"

		def output_file(suffix):
			# Full path of an output file sharing the common name prefix.
			return os.path.join(dest_path, dest_name + suffix)

		# Step 0: Check dependencies.
		check_dependencies(force)

		# Step 1: Extract the following from the URL: player URL, SWF URL, XML URL.
		data = retrieve_data_from_base_url(url, force, output_file("-player-url.txt"))

		# Step 2: Parse the XML and extract the speaker video URL, slide video URL, and metadata.
		metadata = parse_xml_from_url(data["xml_url"], output_file(".xml"))

		# Step 3: Download the videos.
		download_video(metadata["akamai"], metadata["slide"], data["swf_url"], data["player_url"], output_file("-slide.flv"))
		download_video(metadata["akamai"], metadata["speaker"], data["swf_url"], data["player_url"], output_file("-speaker.flv"))
		for code in metadata["audio"]:
			download_video(metadata["akamai"], metadata["audio"][code], data["swf_url"], data["player_url"], output_file("-audio-" + code + ".flv"))

		message("All done!")

	def parse_arguments(argv):
		"""Split a command line into (url, dest, force).

		argv -- the full argument list, program name first (as in sys.argv).

		Returns None when the number of arguments left after the optional
		leading "-f" flag is not one (URL) or two (URL and output dir).
		"""
		force = len(argv) >= 2 and argv[1] == "-f"
		# Counting only what follows the flag means "-f" on its own is rejected
		# instead of indexing past the end of the list.
		positional = argv[2:] if force else argv[1:]
		if len(positional) == 1:
			return positional[0], "", force
		if len(positional) == 2:
			return positional[0], positional[1], force
		return None

	if __name__ == "__main__":
		arguments = parse_arguments(sys.argv)
		if arguments is None:
			error("invalid number of arguments")
		try:
			download_gdc_video_at_url(*arguments)
		except KeyboardInterrupt:
			error("program interrupted")