-
-
Save urielm/6033298 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# GDC Vault videos can't be watched on mobile devices and this is a very sad thing indeed! | |
# This script is designed to circumvent this by downloading the lecture and slideshow | |
# videos which can then be re-encoded into whatever format you wish. | |
# Note: this code is rather flimsy and was written as fast as possible for my own personal use. | |
# The code only works for the most recent GDC Vault videos, since they all use the same player | |
# format. If the XML format used to run the player is changed (as it has in the past), the code | |
# will have to be reconfigured. In the past, I was able to feed a wget-compatible cookies.txt | |
# file into the wget call, but I can't get it to trigger anymore. So for now, the way I download | |
# each video is I look at the source for the player page, find the player.html URL, and feed | |
# it into the script with the -f flag. Ugly and slow, but hey, it works. | |
# I generally hate reinventing the wheel and it does look like youtube-dl does some of the same | |
# stuff I'm doing, but I couldn't get it to work with the GDC URLs. So off to Python land we go!!! | |
# Usage is as follows: | |
# | |
# With cookies.txt: gdc-downloader.py "[GDC video URL]" [output dir] | |
# Without cookies.txt: gdc-downloader.py -f "[GDC player.html URL]" [output dir] | |
# | |
# A GDC video URL looks like this: | |
# http://www.gdcvault.com/play/1015662/Creative-Panic-How-Agility-Turned | |
# | |
# A GDC player.html URL looks like this: | |
# http://evt.dispeak.com/ubm/gdc/sf12/player.html?xmlURL=xml/201203238_1331124629609NXXJ.xml&token=1234567890 | |
# | |
# The output dir should be the name of your video. For example, supplying TestDir/GDCVid will create
# TestDir/GDCVid/GDCVid.xml, TestDir/GDCVid/GDCVid-slide.flv, etc.
# You need to have wget and rtmpdump installed in order for this script to work. I recommend macports. | |
############# | |
# Constants # | |
############# | |
# wget-compatible cookies file, handed to wget through --load-cookies.
cookies_filename = "cookies.txt"

# Finds a double-quoted player.html URL inside the newline-stripped video page.
# Groups: 1 = base URL (everything before "player.html"), 2 = "player.html",
# 3 = the remaining arguments, 4 = the value of the xmlURL argument.
# Note that the xmlURL value must be followed by "&" for the pattern to match.
player_regular_expression = r"^.*\"(.*?)(player\.html)(.*?xmlURL=(.*?)[&].*?)\".*$"
player_regular_expression_force = r"^(.*?)(player\.html)(.*?xmlURL=(.*?)[&].*?)$"  # same as above but parses URL directly

# Matches when the downloaded page contains a quoted URL mentioning login.php.
login_regular_expression = r"^.*\"(.*?login\.php.*?)\".*$"

# Group 1 is the first quoted string following "src" after the Flash-embed marker
# text in player.html; the script appends ".swf" to it to build the SWF URL.
swf_name_regular_expression = r"^.*embed the Flash Content SWF when all tests are passed.*?\"src\".*?\"(.*?)\".*$"

# DEPRECATED: This URL was retrieved from the player SWF, and may change in the future.
# rtmp_url = "rtmp://fms.digitallyspeaking.com/cfx/st/ondemand/fcs/ident"
######## | |
# Code # | |
######## | |
import sys | |
import os | |
import subprocess | |
import re | |
from xml.dom import minidom | |
def error(message):
    """Print a fatal error line and terminate the script with exit status 1.

    message -- text appended after the "[gdc-downloader] Error: " prefix.

    Never returns: sys.exit(1) raises SystemExit.
    """
    # A parenthesised single-argument print is valid on both Python 2 and 3,
    # whereas the bare print statement is a syntax error on Python 3.
    print("[gdc-downloader] Error: " + message)
    sys.exit(1)
def message(msg):
    """Print an informational line prefixed with "[gdc-downloader] Message: ".

    msg -- text to append after the prefix.
    """
    # Function-call form of print so the module also parses on Python 3.
    print("[gdc-downloader] Message: " + msg)
def check_dependencies(force):
    """Abort early when something the download needs is missing.

    force -- when true the cookies file is not consulted, so nothing is checked.

    When force is false, the cookies file named by cookies_filename must be
    openable; otherwise error() is called, which exits the script.
    """
    # TODO: check wget, rtmpdump
    if force:
        return
    try:
        # Opening is only a readability probe; the context manager closes the
        # handle again immediately.
        with open(cookies_filename):
            pass
    except (IOError, OSError):
        error("cookies not found in " + cookies_filename)
def dump_to_file(data, dest):
    """Write data to the file dest, creating the parent directory if needed.

    data -- the payload; either a native str or a bytes object.
    dest -- path of the file to (over)write.
    """
    dest_dir = os.path.abspath(os.path.split(dest)[0])
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    # A native str keeps the original text mode. Anything else (bytes on
    # Python 3, e.g. raw subprocess output) needs a binary handle or the
    # write raises TypeError.
    mode = "w" if isinstance(data, str) else "wb"
    # The with-block guarantees the handle is closed even if write() raises.
    with open(dest, mode) as out_file:
        out_file.write(data)
def download_url(url):
    """Fetch url with wget (using the cookies file) and return the body as text.

    url -- address handed to wget as its final argument.

    Returns whatever wget wrote to standard output, as a native str.
    Calls error() (which exits) when the wget process cannot be started.
    wget's exit status is not inspected, so a failed download shows up as an
    empty or partial string.
    """
    args = ["wget", "-qO-", "--load-cookies", cookies_filename, url]
    try:
        process = subprocess.Popen(args, stdout=subprocess.PIPE)
    except OSError:
        # Raised when the wget executable cannot be launched.
        error("wget error with url " + url)
    output = process.communicate()[0]
    # On Python 3 the pipe yields bytes, but callers use str methods such as
    # .replace('\n', ''), so decode here. On Python 2 the value is already str.
    if not isinstance(output, str):
        output = output.decode("utf-8", "replace")
    return output
def retrieve_data_from_base_url(url, force, xml_dest):
    """Work out the player, SWF and XML URLs for a video.

    url      -- the GDC video page URL, or the player.html URL when force is set.
    force    -- when true, url is parsed directly instead of being downloaded.
    xml_dest -- file that receives the text the player URL was extracted from.

    Returns a dict with the keys "player_url", "swf_url" and "xml_url".
    Calls error() (which exits) when a login page is detected or when the
    player URL or the SWF name cannot be found.
    """
    if force:
        # The caller already supplied the player.html URL, so parse it as-is.
        html = url
        player_results = re.match(player_regular_expression_force, html)
    else:
        # I haven't tested this code in a while, so it might not work
        html = download_url(url).replace('\n', '').replace('\r', '')
        if re.match(login_regular_expression, html):
            error("downloaded login page -- check your cookies")
        player_results = re.match(player_regular_expression, html)

    if not player_results:
        error("player URL not found")

    dump_to_file(html, xml_dest)

    # Both player patterns define exactly four groups, in this order.
    base_url, player_url, player_arguments, xml_url = player_results.groups()
    full_player_url = base_url + player_url
    full_xml_url = base_url + xml_url

    message("player url is " + full_player_url)
    message("player arguments are " + player_arguments)
    message("xml url is " + full_xml_url)

    # The SWF name is embedded in the player page itself.
    player_html = download_url(full_player_url).replace('\n', '').replace('\r', '')
    swf_name_results = re.match(swf_name_regular_expression, player_html)
    if not swf_name_results:
        error("SWF URL not found")

    swf_url = base_url + swf_name_results.group(1) + ".swf"
    message("swf url is " + swf_url)

    return {
        "player_url": full_player_url + player_arguments,
        "swf_url": swf_url,
        "xml_url": full_xml_url,
    }
def _first_element_text(parsed_xml, tag_name):
    """Return the nodeValue of the first child of the first <tag_name>, or None."""
    elements = parsed_xml.getElementsByTagName(tag_name)
    if not elements or elements[0].firstChild is None:
        return None
    return elements[0].firstChild.nodeValue


def parse_xml_from_url(url, xml_dest):
    """Download the player XML and extract the stream locations from it.

    url      -- address of the player XML document.
    xml_dest -- file that receives a copy of the downloaded XML.

    Returns a dict with the keys:
      "akamai"  -- RTMP URL built from the <akamaiHost> element
      "speaker" -- <speakerVideo> value with ".flv" removed
      "slide"   -- <slideVideo> value with ".flv" removed
      "audio"   -- dict mapping each <audio> "code" attribute to its "url"
                   attribute (".flv" removed)

    Calls error() (which exits) when any of the three required elements is
    missing or has no content.
    """
    xml = download_url(url)
    dump_to_file(xml, xml_dest)
    parsed_xml = minidom.parseString(xml)

    host = _first_element_text(parsed_xml, "akamaiHost")
    speaker = _first_element_text(parsed_xml, "speakerVideo")
    slide = _first_element_text(parsed_xml, "slideVideo")
    # An element that exists but is empty has no firstChild; treat it exactly
    # like a missing element instead of failing with AttributeError.
    if not host or not speaker or not slide:
        error("xml missing properties")

    akamai_host = "rtmp://" + host + "/fcs/ident"
    speaker_video = speaker.replace(".flv", "")
    slide_video = slide.replace(".flv", "")
    message("akamai host is " + akamai_host)
    message("speaker video is " + speaker_video)
    message("slide video is " + slide_video)

    # some of the xml files contain extra audio tracks; we want those, don't we?
    audio_metadata = {}
    audios = parsed_xml.getElementsByTagName("audios")
    if audios:
        for audio_node in audios[0].getElementsByTagName("audio"):
            # getAttribute returns "" for an absent attribute.
            code = audio_node.getAttribute("code")
            audio_url = audio_node.getAttribute("url").replace(".flv", "")
            # Skip tracks without a code or without a url: a missing url used
            # to crash the log line below and would give rtmpdump no playpath.
            if not code or not audio_url:
                continue
            audio_metadata[code] = audio_url
            message("audio " + code + " is " + audio_url)

    return {
        "akamai": akamai_host,
        "speaker": speaker_video,
        "slide": slide_video,
        "audio": audio_metadata,
    }
def download_video(rtmp, playpath, swf_url, page_url, filename):
    """Run rtmpdump to save one RTMP stream into filename.

    rtmp     -- value for rtmpdump's --rtmp option.
    playpath -- value for --playpath.
    swf_url  -- value for --swfUrl.
    page_url -- value for --pageUrl.
    filename -- value for --flv, i.e. the output file.

    Always returns None. Calls error() (which exits) when rtmpdump cannot be
    started; a non-zero exit status is reported but does not stop the script,
    so the remaining streams are still attempted.
    """
    args = ["rtmpdump", "--rtmp", rtmp, "--playpath", playpath, "--swfUrl", swf_url, "--pageUrl", page_url, "--flv", filename]
    try:
        status = subprocess.call(args, stdin=None)
    except OSError:
        # Raised when the rtmpdump executable cannot be launched.
        error("rtmpdump error")
    if status != 0:
        # Previously the exit status was stored and never looked at.
        message("rtmpdump exited with status " + str(status) + " for " + filename)
    return None
def download_gdc_video_at_url(url, dest="", force=False):
    """Download the slide, speaker and extra audio streams of one GDC video.

    url   -- the GDC video URL, or the player.html URL when force is set.
    dest  -- output directory; its last path component also becomes the file
             name prefix ("GDCVideo" when that component is empty).
    force -- passed through to the dependency check and the URL extraction.
    """
    dest_path = os.path.abspath(dest)
    dest_name = os.path.split(dest)[1]
    if dest_name == "":
        dest_name = "GDCVideo"

    def output_file(suffix):
        # Every output lives in dest_path and shares the dest_name prefix.
        return os.path.join(dest_path, dest_name + suffix)

    # Step 0: Check dependencies.
    check_dependencies(force)

    # Step 1: Extract the following from the URL: player URL, SWF URL, XML URL.
    data = retrieve_data_from_base_url(url, force, output_file("-player-url.txt"))

    # Step 2: Parse the XML and extract the speaker video URL, slide video URL, and metadata.
    metadata = parse_xml_from_url(data["xml_url"], output_file(".xml"))

    # Step 3: Download the videos.
    rtmp = metadata["akamai"]
    swf_url = data["swf_url"]
    page_url = data["player_url"]
    download_video(rtmp, metadata["slide"], swf_url, page_url, output_file("-slide.flv"))
    download_video(rtmp, metadata["speaker"], swf_url, page_url, output_file("-speaker.flv"))
    for code, playpath in metadata["audio"].items():
        download_video(rtmp, playpath, swf_url, page_url, output_file("-audio-" + code + ".flv"))

    message("All done!")
def _main(argv):
    """Parse the command line in argv and start the download.

    Accepted forms (argv[0] is the script name):
      script URL [output dir]
      script -f PLAYER_URL [output dir]

    Any other argument count ends in error(), which exits. That includes
    "script -f" with no URL, which used to raise IndexError.
    """
    force = len(argv) >= 2 and argv[1] == "-f"
    # Everything after the script name (and after -f, when given).
    positional = argv[2:] if force else argv[1:]
    if len(positional) not in (1, 2):
        error("invalid number of arguments")

    url = positional[0]
    dest = positional[1] if len(positional) == 2 else ""
    try:
        download_gdc_video_at_url(url, dest, force)
    except KeyboardInterrupt:
        error("program interrupted")


if __name__ == "__main__":
    _main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment