Created
November 30, 2012 07:06
-
-
Save blinsay/4174233 to your computer and use it in GitHub Desktop.
GIVE ME ALL OF YOUR GIFS
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os | |
import sys | |
import requests | |
import argparse | |
from urlparse import urlparse | |
from time import ctime, sleep | |
from bs4 import BeautifulSoup | |
# Digit characters used for base-62 encoding, ordered by value: 0-9, a-z, A-Z.
ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"


def base62_encode(num, alphabet=ALPHABET):
    """Encode a non-negative integer as a string in base ``len(alphabet)``.

    Arguments:
    - `num`: The non-negative integer to encode
    - `alphabet`: The digit characters to use, ordered by value

    Returns the encoded string, most significant digit first.  Zero encodes
    as the first character of `alphabet`.

    Raises ValueError if `num` is negative.
    """
    if num < 0:
        # Floor division never reaches zero from a negative number
        # (-1 // base == -1), so the loop below would never terminate.
        raise ValueError("cannot encode a negative number: %r" % (num,))
    if num == 0:
        return alphabet[0]

    base = len(alphabet)
    digits = []
    while num:
        num, rem = divmod(num, base)
        digits.append(alphabet[rem])
    # Digits were produced least significant first.
    digits.reverse()
    return ''.join(digits)
def base62_decode(string, alphabet=ALPHABET):
    """Decode a string written in base ``len(alphabet)`` into its number.

    Arguments:
    - `string`: The encoded string, most significant digit first
    - `alphabet`: The digit characters used for encoding, ordered by value

    An empty string decodes to 0.  A character that does not occur in
    `alphabet` makes ``alphabet.index`` raise ValueError.
    """
    base = len(alphabet)
    last = len(string) - 1
    total = 0
    for position, char in enumerate(string):
        # Each digit is weighted by the base raised to its distance from
        # the right-hand end of the string.
        total += alphabet.index(char) * (base ** (last - position))
    return total
def ensure_dir(path):
    """Create the directory `path`, including missing parents, if needed.

    Does nothing when `path` already is a directory.  Raises OSError when
    the directory cannot be created (for example, `path` exists but is a
    regular file).
    """
    try:
        os.makedirs(path)
    except OSError:
        # makedirs also fails when the directory already exists, which can
        # happen if something else creates it between a check and the call.
        # Only that case is benign; anything else is a real error.
        if not os.path.isdir(path):
            raise
def save_gif(download_dir, save_path, contents):
    """Write downloaded image data to `save_path`, logging progress to stderr.

    Arguments:
    - `download_dir`: Directory that is created (if missing) before writing
    - `save_path`: Path of the file to write
    - `contents`: The raw bytes to store
    """
    sys.stderr.write('[%s] saving to %s...' % (ctime(), save_path))
    ensure_dir(download_dir)
    # Binary mode: text mode translates newline bytes on Windows and rejects
    # bytes outright on Python 3, so image data must not be written with 'w'.
    # The with-statement closes the file; no explicit close() is needed.
    with open(save_path, 'wb') as output:
        output.write(contents)
    sys.stderr.write(" done\n")
def download_gif(img_url, base_dir):
    """
    Download a gif to the given folder.

    The file is stored at ``<base_dir>/<host of img_url>/<file name>``, where
    the file name is the last component of the URL path.  If a file already
    exists at that location nothing is downloaded.  Skips and failed
    requests are reported on stderr.
    """
    parsed = urlparse(img_url)
    gif_name = os.path.basename(parsed.path)
    download_dir = os.path.join(base_dir, parsed.netloc)
    save_path = os.path.join(download_dir, gif_name)

    if os.path.exists(save_path):
        sys.stderr.write("[%s] file %s already exists. skipping\n"
                         % (ctime(), save_path))
        return

    r = requests.get(img_url)
    if r.ok:
        save_gif(download_dir, save_path, r.content)
    else:
        sys.stderr.write("Failed to fetch an image from %s\n" % (img_url,))
def get_gif_url(response):
    """
    Parse the uri of the source gif from a maxgif.com page.

    Returns the ``src`` attribute of the first <img> tag found in
    ``response.content``, or None when that tag has no ``src``.

    Raises ValueError when the page has no <img> tag, or when the tag's
    class list is not exactly ``[u'check-image']``.
    """
    soup = BeautifulSoup(response.content)
    img = soup.img
    # soup.img is None when the page has no <img>, and .get() avoids a
    # KeyError for an <img> without a class attribute.  Either way the page
    # no longer looks as expected, so report it the same way.
    if img is None or img.get('class') != [u'check-image']:
        raise ValueError(
            "Aw sheyt. MaxGif changed something. Image tag was %s" % img)
    # A missing src yields None, which the caller treats as a blank page.
    return img.get('src')
def maxgif_for(n, template='http://maxgif.com/%s'):
    """
    Build the URL of the Nth maxgif page.

    `n` is encoded with base62_encode and substituted into `template`.
    """
    return template % base62_encode(n)
def all_of_maxgif(start=1, template='http://maxgif.com/%s'):
    """
    Yield maxgif page URLs for `start`, `start` + 1, ... without end.

    Each URL is produced by maxgif_for with the given `template`.
    """
    page_number = start
    while True:
        yield maxgif_for(page_number, template)
        page_number += 1
def main(args):
    """Crawl maxgif pages in order, reporting each result on stderr.

    Reads these attributes from `args`:
    - `start`: base-62 encoded page id to begin crawling from
    - `dir`: directory that downloaded gifs are saved under
    - `download`: when true, gifs that are found are saved to disk
    - `sleep_interval`: seconds to wait after each page request

    The loop has no end condition; it runs until interrupted or until an
    exception propagates.
    """
    start = base62_decode(args.start)
    image_dir = args.dir

    for maxgif_url in all_of_maxgif(start=start):
        r = requests.get(maxgif_url)
        if r.ok:
            img_url = get_gif_url(r)
            if not img_url:
                sys.stderr.write('[%s] blank: %s\n' % (ctime(), maxgif_url))
            else:
                sys.stderr.write('[%s] found: %s -> %s\n'
                                 % (ctime(), maxgif_url, img_url))
                if args.download:
                    # Use the directory from args.dir rather than a fixed
                    # 'images' path, so the option actually takes effect.
                    download_gif(img_url, image_dir)
        else:
            sys.stderr.write('error: request to maxgif failed\n')
            # requests exposes the HTTP status as `status_code`; a Response
            # has no `status` attribute.
            sys.stderr.write('%s %s\n' % (r.status_code, r.reason))
        sleep(args.sleep_interval)
if __name__ == '__main__':
    # Command-line entry point: declare the options, then start crawling.
    cli = argparse.ArgumentParser(
        description="All of your .gifs. Give them to me.")
    cli.add_argument(
        '-d', dest='dir', action='store', default='images',
        help='the directory to download .gifs to')
    cli.add_argument(
        '-s', dest='start', action='store', default='001',
        help='the gif number to start crawling from '
             '(b62 encoded. Just copy the end of the URL)')
    cli.add_argument(
        '-x', dest='download', action='store_true', default=False,
        help='save gifs to disk')
    cli.add_argument(
        '-i', dest='sleep_interval', type=float, default=0.2,
        help="wait time between requests to maxgif. BE POLITE, "
             "DON'T BE AN ASSHOLE AND SET IT LOWER THAN 0.1")
    main(cli.parse_args())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment