Created
October 30, 2010 12:56
-
-
Save Leonidas-from-XIV/655272 to your computer and use it in GitHub Desktop.
zeropunctuation-dl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# A downloader for Zero Punctuation episodes. Grabs them from the internet | |
# and saves them with the appropriate naming into the folder. The name is
# determined automatically from the web site. | |
# Licensed under GPLv3, fwiw. | |
import sys, urllib.request, re, json | |
# the browser that we are going to pretend we are
# (yay for increasing Firefox and Linux marketshare)
user_agent = """Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101027 Firefox/3.6.12"""

# where to find the path to the config file. pretty crude, admittedly.
# The dots are escaped so they only match literal dots in the host name and
# in the ".js" suffix, not arbitrary characters.
config_re = re.compile(r'value="config=(http://www\.themis-media\.com/videos/config/\d*-\w*\.js)')

# extracts the charset from a Content-Type header value. Stops at whitespace,
# ';' or '"' so optional quotes and trailing parameters are not captured, and
# ignores case because header parameter names are case-insensitive.
charset_re = re.compile(r'charset="?([^\s;"]+)', re.IGNORECASE)
def construct_request(url, host, referrer=None):
    """Build a request for *url* that carries browser-like headers.

    Setting User-Agent alone is not enough, so a fuller set of headers
    (grabbed from what Firefox 3.6.12 actually sends) is attached.
    *host* is sent as the Host header; when *referrer* is not None it is
    sent as the Referer header.
    """
    browser_headers = {
        'User-Agent': user_agent,
        'Host': host,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-us,en;q=0.7,de;q=0.3',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Connection': 'close',
    }
    # maybe they also want a referrer? In any case, we can send one
    if referrer is not None:
        browser_headers['Referer'] = referrer
    return urllib.request.Request(url, headers=browser_headers)
def get_config_path(source):
    """Find the URL of the config file on the page.

    *source* is the decoded HTML of the episode page. Returns the first
    URL matched by ``config_re``.

    Raises ValueError when the page contains no such URL, instead of
    failing with an AttributeError on the None that ``search`` returns.
    """
    match = config_re.search(source)
    if match is None:
        raise ValueError("could not find a config file URL on the page")
    return match.group(1)
def download_hook(blocks_done, blocksize, total):
    """Display the download progress as "<bytes done>/<total bytes>".

    Intended as the report hook of ``urllib.request.urlretrieve``, which
    calls it with the number of blocks transferred so far, the block size
    in bytes and the total size of the file.
    """
    done = blocks_done * blocksize
    if total > 0:
        # the last block is usually only partially filled, so the product
        # can exceed the real size; never show more than the total
        done = min(done, total)
        shown_total = total
    else:
        # no usable size was reported (urlretrieve passes -1 in that case)
        shown_total = '?'
    # "\r" rewrites the same line; without a newline the output is not
    # flushed automatically, so flush explicitly to make progress visible
    print("\r {0}/{1}".format(done, shown_total), end='', flush=True)
def main():
    """Download the episode whose page URL is given as the first argument.

    Fetches the episode page, locates the player config file referenced on
    it, reads the video URL and the episode name from that config and saves
    the video as "<name>.mp4" in the current working directory.

    Exits with a usage message when no URL was passed on the command line.
    """
    if len(sys.argv) < 2:
        # a usage hint is friendlier than an IndexError traceback
        sys.exit("Usage: {0} EPISODE_URL".format(sys.argv[0]))
    url = sys.argv[1]

    # load the "intro" page and extract the config file
    with urllib.request.urlopen(construct_request(url, 'www.escapistmagazine.com')) as page:
        bytestream = page.read()
        # the header object does a case-insensitive lookup and returns None
        # when there is no Content-Type or no charset parameter, so fall
        # back to UTF-8 instead of raising KeyError/IndexError
        charset = page.headers.get_content_charset() or 'utf-8'
    # only the config URL is needed from the page, so a few undecodable
    # bytes elsewhere should not abort the download
    content = bytestream.decode(charset, errors='replace')
    config_path = get_config_path(content)

    # grab the config file. this is not that easy, because the page blacklists
    # "incorrect" requests, so we get BadStatusLine exception
    # => take care that we pretend to be a browser good enough.
    with urllib.request.urlopen(construct_request(config_path, 'www.themis-media.com', referrer=url)) as page:
        # the JS file can be parsed as JSON when we change the quotes
        content = page.read().decode('utf-8').replace("'", '"')
    config = json.loads(content)

    # URL to the video
    video = config['playlist'][1]['url']
    # the name of the episode
    name = config['plugins']['viral']['share']['description']
    # destination filename. The name is supplied by the remote server, so
    # replace path separators and NUL bytes to keep the file inside the
    # current directory and the filename valid.
    filename = re.sub(r'[/\\\0]', '_', name) + '.mp4'

    # grab it
    print("Downloading to {0}".format(filename))
    urllib.request.urlretrieve(video, filename, download_hook)
    # finish the progress line that download_hook keeps rewriting
    print()
# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment