@blinsay
Created November 30, 2012 07:06
GIVE ME ALL OF YOUR GIFS
#!/usr/bin/env python
import argparse
import os
import sys
from time import ctime, sleep
from urlparse import urlparse

import requests
from bs4 import BeautifulSoup

ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

def base62_encode(num, alphabet=ALPHABET):
    """Encode a number in Base X

    `num`: The number to encode
    `alphabet`: The alphabet to use for encoding
    """
    if num == 0:
        return alphabet[0]
    arr = []
    base = len(alphabet)
    while num:
        rem = num % base
        num = num // base
        arr.append(alphabet[rem])
    arr.reverse()
    return ''.join(arr)

def base62_decode(string, alphabet=ALPHABET):
    """Decode a Base X encoded string into the number

    Arguments:
    - `string`: The encoded string
    - `alphabet`: The alphabet used for encoding
    """
    base = len(alphabet)
    strlen = len(string)
    num = 0
    for idx, char in enumerate(string):
        power = strlen - (idx + 1)
        num += alphabet.index(char) * (base ** power)
    return num
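
# Quick sanity check of the two helpers above: encoding then decoding should be
# the identity on non-negative ids. The value 125 is just an arbitrary example
# (it encodes to '21' with this alphabet); the assert keeps the script runnable.
assert base62_decode(base62_encode(125)) == 125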

def ensure_dir(path):
    if not os.path.isdir(path):
        os.makedirs(path)

def save_gif(download_dir, save_path, contents):
    print >> sys.stderr, '[%s] saving to %s...' % (ctime(), save_path),
    ensure_dir(download_dir)
    # Write in binary mode so gif bytes aren't mangled on platforms that translate newlines.
    with open(save_path, 'wb') as output:
        output.write(contents)
    print >> sys.stderr, " done"

def download_gif(img_url, base_dir):
    """
    Download a gif to base_dir/<hostname>/<filename>.
    """
    _, netloc, path, _, _, _ = urlparse(img_url)
    gif_name = os.path.basename(path)
    download_dir = os.path.join(base_dir, netloc)
    save_path = os.path.join(download_dir, gif_name)
    if os.path.exists(save_path):
        print >> sys.stderr, "[%s] file %s already exists. skipping" % (ctime(), save_path)
    else:
        r = requests.get(img_url)
        if r.ok:
            save_gif(download_dir, save_path, r.content)
        else:
            print >> sys.stderr, "Failed to fetch an image from", img_url

def get_gif_url(response):
    """
    Parse the URL of the source gif from a maxgif.com page.
    """
    soup = BeautifulSoup(response.content)
    img = soup.img
    if img is None:
        # A page with no image at all is treated as blank rather than an error.
        return None
    if img['class'] != [u'check-image']:
        raise ValueError("Aw sheyt. MaxGif changed something. Image tag was %s" % img)
    return img['src']
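
# Note: the class check above assumes maxgif marks the source gif with
# <img class="check-image" src="...">; any other markup is treated as a site change.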

def maxgif_for(n, template='http://maxgif.com/%s'):
    """
    Get the URL of the nth maxgif page.
    """
    encoded = base62_encode(n)
    return template % encoded

def all_of_maxgif(start=1, template='http://maxgif.com/%s'):
    """
    Generate maxgif page URLs forever, starting from page `start`.
    """
    i = start
    while True:
        yield maxgif_for(i, template)
        i += 1
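
# For example (the values follow from base62_encode above): maxgif_for(1) is
# 'http://maxgif.com/1', maxgif_for(62) is 'http://maxgif.com/10', and
# all_of_maxgif(start=62) yields those pages onward, one URL per iteration.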

def main(args):
    start = base62_decode(args.start)
    image_dir = args.dir
    for maxgif_url in all_of_maxgif(start=start):
        r = requests.get(maxgif_url)
        if r.ok:
            img_url = get_gif_url(r)
            if not img_url:
                print >> sys.stderr, '[%s] blank: %s' % (ctime(), maxgif_url)
            else:
                print >> sys.stderr, '[%s] found: %s -> %s' % (ctime(), maxgif_url, img_url)
                if args.download:
                    download_gif(img_url, image_dir)
        else:
            print >> sys.stderr, 'error: request to maxgif failed'
            print >> sys.stderr, r.status_code, r.reason
        sleep(args.sleep_interval)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="All of your .gifs. Give them to me.")
    parser.add_argument('-d', action='store', dest='dir', default='images',
                        help='the directory to download .gifs to')
    parser.add_argument('-s', action='store', default='001', dest='start',
                        help='the gif number to start crawling from (b62 encoded. Just copy the end of the URL)')
    parser.add_argument('-x', action='store_true', default=False, dest='download',
                        help='save gifs to disk')
    parser.add_argument('-i', type=float, default=0.2, dest='sleep_interval',
                        help='wait time between requests to maxgif. BE POLITE, DON\'T BE AN ASSHOLE AND SET IT LOWER THAN 0.1')
    main(parser.parse_args())
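
# Example invocations, assuming the script is saved as maxgif.py (the filename is
# hypothetical; the flags are the ones defined above):
#   python maxgif.py -s 001 -i 0.5        # crawl from maxgif.com/001 and just print what it finds
#   python maxgif.py -x -d gifs -i 0.5    # also save gifs under ./gifs/<hostname>/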