Buckflickr: a script to rip Flickr galleries given an image URL
#!/usr/bin/env python
# named in honor of Jack Buck: "Go crazy, folks! Go crazy!"
# yes, this is ugly. No, it doesn't use the API. I don't care. I want my pictures quickly without fuss.
#
# see http://goo.gl/yJVl5 for PyCon inspiration video
#
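# example invocation (the script filename and the URL below are illustrative, not real):
#   python buckflickr.py -vv http://www.flickr.com/photos/someuser/1234567890/
#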
# TODO: handle flickr login
# TODO: add command line options for username, password
# TODO: make at least "tocrawl" persistent, perhaps in a database, maybe even by pickling
# TODO: add some kind of metadata so it's easy to re-crawl things later as galleries get updated
# TODO: set file creation time to flickr date, or maybe source from EXIF if available
# TODO: automatically delete older, smaller downloads of the same image
# TODO: videos?
# TODO: fix assumption that all media is .jpg
# ideas:
# - manipulate EXIF data in each picture to include URL and description (having trouble finding lib for this)
# - create a sqlite database (which will work with multiple open instances) or a plain text file log
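
# a sketch of the EXIF idea above, assuming the third-party piexif package
# (an assumption; piexif is not used or required elsewhere in this script);
# tag_image is a hypothetical helper and is never called below
def tag_image(filename, url, description):
    """Stamp the source URL and description into the image's EXIF ImageDescription tag."""
    import piexif  # imported lazily so the rest of the script runs without piexif installed
    exif = piexif.load(filename)
    exif["0th"][piexif.ImageIFD.ImageDescription] = "%s %s" % (url, description)
    piexif.insert(piexif.dump(exif), filename)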
import mechanize, logging, os, sys, re, optparse

parser = optparse.OptionParser(description="Download an entire gallery given a flickr image URL.", epilog="")
parser.add_option("-v", "--verbose", action="count", default=0, help="increase verbosity")
(args, userurls) = parser.parse_args()

logging.basicConfig(level=(40 - vars(args)['verbose'] * 10))
logging.debug("optparse values: " + str(vars(args)))

b = mechanize.Browser()
b.set_handle_robots(False)
b.addheaders = [('User-agent', 'Mozilla/4.0 (compatible; MSIE 5.0; Windows 98;)')]

# add images so that they appear in b.links()
b._factory._links_factory.urltags['img'] = 'src'
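# note: urltags lives on mechanize's private link factory, so this may need adjusting across mechanize versions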
tocrawl = set()  # initially populated with user args; holds flickr URLs to traverse
crawled = set()  # holds flickr URLs already handled
skipped = 0
def visit(url, options=None):
    """wraps mechanize.Browser.open in order to catch bad URLs more intelligibly"""
    logging.info("attempting to visit url %s" % (url))
    try:
        return b.open(url.partition("?")[0], options)
    except:
        logging.warning("could not visit URL %s, options %s" % (url, options))
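# note: visit() returns None when the open fails, so callers have to tolerate a missing response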
# begin with user-provided URLs
for url in userurls:
    tocrawl.add(url)
while tocrawl:
    imgdata = None
    url = tocrawl.pop()
    if url in crawled:
        logging.info("already visited %s, continuing to next item" % (url))
        continue
    logging.info("%d items to crawl, %d already crawled, %d skipped" % (len(tocrawl), len(crawled), skipped))
    logging.debug("tocrawl %s\ncrawled %s" % (tocrawl, crawled))

    crawled.add(url)
    b.clear_history()
    visit(url)
    logging.info("page %s, url %s" % (b.title(), url))

    for link in b.links():
        # logging.debug("found link: %s" % (link))
        attrs = dict(link.attrs)
        if "ywa-track" in attrs.get('class', '') and "context-thumb-link" in attrs.get('class', ''):
            logging.debug("queuing URL: %s" % (link.url))
            if link.url.startswith("http"):
                tocrawl.add(link.url)
            else:
                tocrawl.add("http://www.flickr.com/" + link.url.lstrip("/"))
        if "All Sizes" in attrs.get('data-ywa-name', ''):
            logging.info("found All Sizes URL: %s" % link.url)
            allsizesurl = link.url

    try:
        allsizespage = visit(allsizesurl)
    except NameError:
        logging.error("Couldn't find \"All Sizes\" page: please try entering the URL of a single image page on Flickr. This may also be a problem with being logged in to Flickr.")
        skipped += 1
        continue
    def imgsort(link):
        """given a mechanize.Link object, return an ordering for it based on image size"""
        # these are the image size keys that flickr uses in its URLs, going from o (original) to sq (tiny thumbnail)
        sizes = ["o", "k", "h", "b", "c", "z", "m", "n", "s", "t", "q", "sq"]
        # logging.debug("attempting to find key for URL %s" % (link.url))
        try:
            return sizes.index(re.findall("/([a-z][a-z]?)/", link.url)[0])
        except IndexError:
            # this should only happen if the regex doesn't match, meaning we should have an image
            try:
                return sizes.index(re.findall(".*?_([a-z][a-z]?)(?:_d)?\.jpg", link.url)[0])
            except IndexError:
                logging.error("could not find key for url %s" % (link.url))
                return len(sizes) + 1
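    # for example, a URL ending in "_b.jpg" sorts as sizes.index("b") == 3, so
    # sorted(..., key=imgsort) below puts the largest available size first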
    # These might be handy for future expansion but are not currently needed
    # if "disabled downloading" in allsizespage.read():
    #     logging.info("found protected image")
    #
    # if list(b.links(url_regex="_o(_d)?\.jpg")):
    #     logging.info("found original size image!")

    imgpageurls = b.links(url_regex="\/sizes\/[a-z][a-z]?|\.jpg")
    imgpageurls = sorted(imgpageurls, key=imgsort)
    for i in imgpageurls:
        logging.debug("candidate image page URL,title: %s, %s" % (i.url, i.text))

    imgurl = imgpageurls[0].url.partition("?")[0]

    # if URL is not an image, it means we need another layer of indirection
    if not imgurl.endswith(".jpg"):
        logging.debug("now visiting page which hosts largest image: %s" % (imgurl))
        img = visit(imgurl)
        imgurl = b.links(url_regex="\.jpg").next().url.partition("?")[0]

    fn = os.path.basename(imgurl)
    if os.path.exists(fn) and os.stat(fn).st_size:
        logging.critical("detected nonempty file %s, not clobbering." % (fn))
        skipped += 1
        continue

    img = visit(imgurl)
    imgdata = img.read()
    with open(fn, "wb") as f:
        f.write(imgdata)

print "crawled %d items, %d skipped" % (len(crawled), skipped)