Buckflickr: a script to rip Flickr galleries given an image URL
#!/usr/bin/env python
# named in honor of Jack Buck: "Go crazy, folks! Go crazy!"
# yes, this is ugly. No, it doesn't use the API. I don't care. I want my pictures quickly without fuss.
#
# see http://goo.gl/yJVl5 for PyCon inspiration video
#
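# example invocation (the script filename and the URL below are illustrative, not real):
#   python buckflickr.py -vv http://www.flickr.com/photos/someuser/1234567890/
#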
# TODO: handle flickr login
# TODO: add command line options for username, password
# TODO: make at least "tocrawl" persistent, perhaps in a database, maybe even by pickling
# TODO: add some kind of metadata so it's easy to re-crawl things later as galleries get updated
# TODO: set file creation time to flickr date, or maybe source from EXIF if available
# TODO: automatically delete older, smaller downloads of the same image
# TODO: videos?
# TODO: fix assumption that all media is .jpg
# ideas:
# - manipulate EXIF data in each picture to include URL and description (having trouble finding lib for this)
# - create a sqlite database (which will work with multiple open instances) or a plain text file log
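
# a sketch of the EXIF idea above, assuming the third-party piexif package
# (an assumption; piexif is not used or required elsewhere in this script);
# tag_image is a hypothetical helper and is never called below
def tag_image(filename, url, description):
    """Stamp the source URL and description into the image's EXIF ImageDescription tag."""
    import piexif  # imported lazily so the rest of the script runs without piexif installed
    exif = piexif.load(filename)
    exif["0th"][piexif.ImageIFD.ImageDescription] = "%s %s" % (url, description)
    piexif.insert(piexif.dump(exif), filename)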
import mechanize, logging, os, sys, re, optparse

parser = optparse.OptionParser(description="Download an entire gallery given a flickr image URL.", epilog="")
parser.add_option("-v", "--verbose", action="count", default=0, help="increase verbosity")
(args, userurls) = parser.parse_args()

logging.basicConfig(level=(40 - vars(args)['verbose'] * 10))
logging.debug("optparse values: " + str(vars(args)))

b = mechanize.Browser()
b.set_handle_robots(False)
b.addheaders = [('User-agent', 'Mozilla/4.0 (compatible; MSIE 5.0; Windows 98;)')]

# add images so that they appear in b.links()
b._factory._links_factory.urltags['img'] = 'src'
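# note: urltags lives on mechanize's private link factory, so this may need adjusting across mechanize versions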
tocrawl = set()  # initially populated with user args; holds flickr URLs to traverse
crawled = set()  # holds flickr URLs already handled
skipped = 0
def visit(url, options=None):
    """wraps mechanize.Browser.open in order to catch bad URLs more intelligibly"""
    logging.info("attempting to visit url %s" % (url))
    try:
        return b.open(url.partition("?")[0], options)
    except:
        logging.warning("could not visit URL %s, options %s" % (url, options))
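# note: visit() returns None when the open fails, so callers have to tolerate a missing response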
# begin with user-provided URLs
for url in userurls:
    tocrawl.add(url)
while tocrawl:
    imgdata = None
    url = tocrawl.pop()
    if url in crawled:
        logging.info("already visited %s, continuing to next item" % (url))
        continue
    logging.info("%d items to crawl, %d already crawled, %d skipped" % (len(tocrawl), len(crawled), skipped))
    logging.debug("tocrawl %s\ncrawled %s" % (tocrawl, crawled))

    crawled.add(url)
    b.clear_history()
    visit(url)
    logging.info("page %s, url %s" % (b.title(), url))

    for link in b.links():
        # logging.debug("found link: %s" % (link))
        attrs = dict(link.attrs)
        if "ywa-track" in attrs.get('class', '') and "context-thumb-link" in attrs.get('class', ''):
            logging.debug("queuing URL: %s" % (link.url))
            if link.url.startswith("http"):
                tocrawl.add(link.url)
            else:
                tocrawl.add("http://www.flickr.com/" + link.url.lstrip("/"))
        if "All Sizes" in attrs.get('data-ywa-name', ''):
            logging.info("found All Sizes URL: %s" % link.url)
            allsizesurl = link.url

    try:
        allsizespage = visit(allsizesurl)
    except NameError:
        logging.error("Couldn't find \"All Sizes\" page: please try entering the URL of a single image page on Flickr. This may also be a problem with being logged in to Flickr.")
        skipped += 1
        continue
    def imgsort(link):
        """given a mechanize.Link object, return an ordering for it based on image size"""
        # these are the image size keys that flickr uses in its URLs, going from o (original) to sq (tiny thumbnail)
        sizes = ["o", "k", "h", "b", "c", "z", "m", "n", "s", "t", "q", "sq"]
        # logging.debug("attempting to find key for URL %s" % (link.url))
        try:
            return sizes.index(re.findall("/([a-z][a-z]?)/", link.url)[0])
        except IndexError:
            # this should only happen if the regex doesn't match, meaning we should have an image
            try:
                return sizes.index(re.findall(".*?_([a-z][a-z]?)(?:_d)?\.jpg", link.url)[0])
            except IndexError:
                logging.error("could not find key for url %s" % (link.url))
                return len(sizes) + 1
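    # for example, a URL ending in "_b.jpg" sorts as sizes.index("b") == 3, so
    # sorted(..., key=imgsort) below puts the largest available size first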
    # These might be handy for future expansion but are not currently needed
    # if "disabled downloading" in allsizespage.read():
    #     logging.info("found protected image")
    #
    # if list(b.links(url_regex="_o(_d)?\.jpg")):
    #     logging.info("found original size image!")

    imgpageurls = b.links(url_regex="\/sizes\/[a-z][a-z]?|\.jpg")
    imgpageurls = sorted(imgpageurls, key=imgsort)
    for i in imgpageurls:
        logging.debug("candidate image page URL,title: %s, %s" % (i.url, i.text))

    imgurl = imgpageurls[0].url.partition("?")[0]

    # if URL is not an image, it means we need another layer of indirection
    if not imgurl.endswith(".jpg"):
        logging.debug("now visiting page which hosts largest image: %s" % (imgurl))
        img = visit(imgurl)
        imgurl = b.links(url_regex="\.jpg").next().url.partition("?")[0]

    fn = os.path.basename(imgurl)
    if os.path.exists(fn) and os.stat(fn).st_size:
        logging.critical("detected nonempty file %s, not clobbering." % (fn))
        skipped += 1
        continue

    img = visit(imgurl)
    imgdata = img.read()
    with open(fn, "wb") as f:
        f.write(imgdata)

print "crawled %d items, %d skipped" % (len(crawled), skipped)