Skip to content

Instantly share code, notes, and snippets.

@gabrielfalcao
Created August 23, 2010 18:38
Show Gist options
  • Save gabrielfalcao/546042 to your computer and use it in GitHub Desktop.
Save gabrielfalcao/546042 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# <image downloader - bulk-download all images>
# Copyright (C) <2010> Gabriel Falcão <[email protected]>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import re
import sys
import couleur
import warnings
warnings.simplefilter("ignore")
import unicodedata
from urlparse import urlsplit
from bolacha import Bolacha
from lxml import html
# Work on a copy of the command line so that the later args.pop(0) never
# mutates sys.argv itself.
args = sys.argv[:]
# Exactly one positional argument (the page URL) is required besides the
# script name; anything else prints the usage line and exits with status 1.
if len(args) != 2:
    print("usage %s URL" % __file__)
    sys.exit(1)
def normalize(string):
    """Return a lowercase, ASCII-only, underscore-separated slug of ``string``.

    Accented characters are decomposed (NFKD) and reduced to their ASCII
    base letter *before* the word split, so "café" becomes "cafe" instead
    of losing the letter entirely.  Every run of non-word characters
    (including ".", "/", "-" and whitespace) collapses into a single "_",
    which means a file extension dot is replaced as well
    ("a.png" -> "a_png").

    :param string: text or byte string to slugify; byte strings are decoded
        as UTF-8, ignoring undecodable bytes.
    :returns: a native ``str`` containing only ``[a-z0-9_]``; empty when the
        input has no word characters.
    """
    if isinstance(string, bytes):
        # On Python 2 plain ``str`` is ``bytes`` (e.g. sys.argv entries).
        string = string.decode('utf-8', 'ignore')
    # Decompose first: the regex below only keeps ASCII word characters, so
    # normalizing afterwards (as the code used to) could never see an accent.
    decomposed = unicodedata.normalize('NFKD', string)
    ascii_text = decomposed.encode('ascii', 'ignore').decode('ascii')
    slug = "_".join(re.findall(r"\w+", ascii_text)).lower()
    # The slug is pure ASCII, so str() safely yields the native string type
    # (byte string on Python 2, as before).
    return str(slug)
# Script body: fetch the page given on the command line, collect the absolute
# image URLs found in it, and save every image into a directory named after
# the page URL.

# Imported here because it is only needed for the EEXIST check below.
import errno

# colors
# Enable couleur's proxy on stdout; the "#{...}" markers in the messages
# below are presumably rendered as terminal colors by it (confirm against
# the couleur documentation).
couleur.proxy(sys.stdout).enable()

# argument
# Drop the script name so the URL is the only remaining entry.
args.pop(0)
url = args[0]

# http fetcher
http = Bolacha()
headers, body = http.get(url)
dom = html.fromstring(body)

# Keep only <img> tags whose src starts with "http" (case-insensitive);
# relative sources are skipped because they are never resolved against
# the page URL.
image_urls = [
    img.attrib['src']
    for img in dom.cssselect('img')
    if img.attrib.get('src', '').lower().startswith('http')
]

dirname = normalize(url.replace('http://', ''))
sys.stdout.write('#{bold}#{red}creating the directory "%s"' % dirname)
try:
    os.makedirs(dirname)
except OSError as e:
    # An already-existing directory is fine; any other failure (permissions,
    # read-only filesystem, ...) must not be silently swallowed.
    if e.errno != errno.EEXIST:
        raise
os.chdir(dirname)
print("#{green} DONE!#{reset}")
print("=" * 10)

for index, link in enumerate(image_urls):
    sys.stdout.write('#{bold}#{white}downloading image "%s"' % link)
    link_parts = urlsplit(link)
    # normalize() also turns the extension dot into "_" (a.png -> a_png).
    filename = normalize(link_parts.path.split("/")[-1])
    if not filename:
        # The URL path ends with "/" or has no usable characters; open("")
        # would fail, so fall back to a positional name.
        filename = "image_%d" % index
    headers, body = http.get(link)
    sys.stdout.write('#{yellow} and now i am saving at "%s" ...' % filename)
    # Image data is binary: "wb" avoids newline translation, and ``with``
    # closes the handle even if the write fails.
    with open(filename, 'wb') as fd:
        fd.write(body)
    print("#{green} DONE!#{reset}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment