@fsiler
Created May 24, 2012 13:23
Buckflickr: a script to rip Flickr galleries given an image URL
#!/usr/bin/env python
# named in honor of Jack Buck: "Go crazy, folks! Go crazy!"
# yes, this is ugly. No, it doesn't use the API. I don't care. I want my pictures quickly without fuss.
#
# see http://goo.gl/yJVl5 for PyCon inspiration video
#
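# usage (illustrative; assumes the script is saved as buckflickr.py, but any filename works):
#   python buckflickr.py [-v [-v ...]] http://www.flickr.com/photos/<user>/<photo-id>/
# downloaded .jpg files land in the current working directory
#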
# TODO: handle flickr login
# TODO: add command line options for username, password
# TODO: make at least "tocrawled" persistent, perhaps database, maybe even by pickling
# TODO: add some kind of metadata so it's easy to re-crawl things later as galleries get updated
# TODO: set file creation time to flickr date, or maybe source from EXIF if available
# TODO: automatically delete older, smaller downloads of same image
# TODO: videos?
# TODO: fix assumption that all media is .jpg
# ideas:
# - manipulate EXIF data in each picture to include URL and description (having trouble finding lib for this)
# - create an sqlite database (which would work with multiple instances open) or a plain text log file
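# written for Python 2; needs the third-party mechanize package (e.g. "pip install mechanize")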
import mechanize, logging, os, sys, re, optparse
parser = optparse.OptionParser(description="Download an entire gallery given a flickr image URL.", epilog="")
parser.add_option("-v", "--verbose", action="count", default=0, help="increase verbosity")
(args, userurls) = parser.parse_args()
logging.basicConfig( level=(40 - vars(args)['verbose'] * 10) )
logging.debug("optparse values: " + str(vars(args)))
b = mechanize.Browser()
b.set_handle_robots(False)
b.addheaders = [('User-agent', 'Mozilla/4.0 (compatible; MSIE 5.0; Windows 98;)')]
# add images so that they appear in b.links()
b._factory._links_factory.urltags['img'] = 'src'
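# (note: _factory._links_factory is a private mechanize attribute, so this trick may break with other mechanize versions)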
tocrawl = set() # initially populated with user args; holds flickr URLs to traverse
crawled = set() # holds flickr URLs already handled
skipped = 0 # count of images skipped (already downloaded, or no "All Sizes" page found)
def visit(url, options=None):
"""wraps mechanize.Browser.open in order to catch bad URLs more intelligibly"""
logging.info("attempting to visit url %s" % (url))
try:
return b.open(url.partition("?")[0], options)
except:
logging.warning("could not visit URL %s, options %s" % (url, options))
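# NB: visit() returns None when the fetch fails, so a later .read() on the result
# raises AttributeError -- acceptable for a quick-and-dirty script like this one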
# begin with user-provided URLs
for url in userurls:
    tocrawl.add(url)
while tocrawl:
    imgdata = None
    url = tocrawl.pop()
    if url in crawled:
        logging.info("already visited %s, continuing to next item" % (url))
        continue
    logging.info("%d items to crawl, %d already crawled, %d skipped" % (len(tocrawl), len(crawled), skipped))
    logging.debug("tocrawl %s\ncrawled %s" % (tocrawl, crawled))
    crawled.add(url)
    b.clear_history()
    visit(url)
    logging.info("page %s, url %s " % (b.title(), url))
    for link in b.links():
        # logging.debug("found link: %s" % (link))
        attrs = dict(link.attrs)
        if "ywa-track" in attrs.get('class', '') and "context-thumb-link" in attrs.get('class', ''):
            logging.debug("queuing URL: %s" % (link.url))
            if link.url.startswith("http"):
                tocrawl.add(link.url)
            else:
                tocrawl.add("http://www.flickr.com/" + link.url.lstrip("/"))
        if "All Sizes" in attrs.get('data-ywa-name', ''):
            logging.info("found All Sizes URL: %s" % link.url)
            allsizesurl = link.url
    try:
        allsizespage = visit(allsizesurl)
    except NameError:
        logging.error("Couldn't find an \"All Sizes\" page: try entering the URL of a single image page on Flickr. This can also happen if the image requires a Flickr login.")
        skipped += 1
        continue
    def imgsort(link):
        """given a mechanize.Link object, return an ordering for it based on image size"""
        # these are the image size keys that flickr uses in its URLs, going from o (original) to sq (tiny thumbnail)
        sizes = ["o", "k", "h", "b", "c", "z", "m", "n", "s", "t", "q", "sq"]
        # logging.debug("attempting to find key for URL %s" % (link.url))
        try:
            return sizes.index(re.findall("/([a-z][a-z]?)/", link.url)[0])
        except IndexError:
            # this should only happen if the regex doesn't match, meaning we should have an image
            try:
                return sizes.index(re.findall(".*?_([a-z][a-z]?)(?:_d)?\.jpg", link.url)[0])
            except IndexError:
                logging.error("could not find key for url %s" % (link.url))
                return len(sizes) + 1
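    # illustrative ordering (URLs made up for the example, not fetched):
    #   ".../photo/12345/sizes/o/"  -> 0  (original, preferred)
    #   ".../photo/12345/sizes/z/"  -> 5
    #   "..._n.jpg"                 -> 7
    #   anything unrecognized       -> 13 (sorted last)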
    # These might be handy for future expansion but are not currently needed
    # if "disabled downloading" in allsizespage.read():
    #     logging.info("found protected image")
    #
    # if list(b.links(url_regex="_o(_d)?\.jpg")):
    #     logging.info("found original size image!")
    imgpageurls = b.links(url_regex="\/sizes\/[a-z][a-z]?|\.jpg")
    imgpageurls = sorted(imgpageurls, key=imgsort)
    for i in imgpageurls:
        logging.debug("candidate image page URL,title: %s, %s" % (i.url, i.text))
    imgurl = imgpageurls[0].url.partition("?")[0]
    # if URL is not an image, it means we need another layer of indirection
    if not imgurl.endswith(".jpg"):
        logging.debug("now visiting page which hosts largest image: %s" % (imgurl))
        img = visit(imgurl)
        imgurl = b.links(url_regex="\.jpg").next().url.partition("?")[0]
    fn = os.path.basename(imgurl)
    if os.path.exists(fn) and os.stat(fn).st_size:
        logging.critical("detected nonempty file %s, not clobbering." % (fn))
        skipped += 1
        continue
    img = visit(imgurl)
    imgdata = img.read()
    with open(fn, "wb") as f:  # write in binary mode so the JPEG data is not mangled on Windows
        f.write(imgdata)
print "crawled %d items, %d skipped" % (len(crawled), skipped)