@paulrohrbeck
Created August 1, 2014 17:55
Bulk-download images linked from a given page, scoped by a CSS class or ID selector.
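Usage sketch (the file name, URL, and selector below are placeholders, assuming the gist is saved locally as download_images.py):

python download_images.py -u http://example.com/gallery/index.html -c .main-content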
#!/usr/bin/python
"""
Download Images

Bulk-download images linked from a given page, scoped by a CSS class or ID.
"""
import optparse
import os
import urllib
from urlparse import urljoin

import requests
from bs4 import BeautifulSoup

def get_options():
    """
    Define and retrieve options from the command line.
    """
    parser = optparse.OptionParser()
    parser.add_option('-u',
                      help='URL with table of contents to parse (required)',
                      dest='url')
    parser.add_option('-c',
                      help='CSS class or ID to parse, e.g. .main-content (required)',
                      dest='css_class')
    (opts, args) = parser.parse_args()

    # Make sure all mandatory options are set:
    mandatory_options = ['url', 'css_class']
    for m in mandatory_options:
        if not opts.__dict__[m]:
            print "A mandatory option is missing!\n"
            parser.print_help()
            exit(-1)
    return opts, args

def get_soup(url):
    """
    Retrieve HTML soup from the given URL.
    """
    r = requests.get(url)
    # Name the parser explicitly to avoid bs4's "no parser specified" warning:
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

def get_links(soup, selector, url):
    """
    Parse the 'table of contents' page and collect the pages it links to.
    """
    url_list = []

    # Choose the selector:
    if selector[0] == 'id':
        divs = soup.find_all(id=selector[1])
    elif selector[0] == 'class':
        divs = soup.find_all(class_=selector[1], limit=1)
    else:
        divs = soup.find_all('body', limit=1)

    # Then retrieve all links, ignoring empty links, anchors, and mailto links:
    for div in divs:
        for link in div.find_all('a'):
            href = link.get('href')
            if href and not href.startswith('#') and not href.startswith('mailto:'):
                url_list.append(sanitize_url(url, href))
    print 'Found %s links (selector: %s).' % (len(url_list), selector)

    # Remove duplicates while preserving order:
    seen = set()
    url_list = [u for u in url_list if not (u in seen or seen.add(u))]
    print 'After removing duplicates, the list was reduced to %s links.' % len(url_list)
    return url_list

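# A hypothetical call (the names below are placeholders, not from the gist):
# for a page whose links live inside <div class="main-content">, calling
# get_links(soup, ['class', 'main-content'], 'http://example.com/toc/index.html')
# returns the absolute, de-duplicated URLs of every <a href> inside that div.
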
def sanitize_url(url, current_url):
    """
    Account for internal links: if a link is relative (it has no scheme or
    netloc), resolve it against the current (given) URL.
    """
    return urljoin(url, current_url)

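# For example, urljoin('http://example.com/toc/index.html', 'abc.html')
# yields 'http://example.com/toc/abc.html', while an already-absolute href
# is returned unchanged.
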
def class_or_id(selector):
    """
    Differentiate between classes and IDs the way jQuery does (#id, .class).
    """
    if selector[0] == '.':
        soup_selector = 'class'
    elif selector[0] == '#':
        soup_selector = 'id'
    else:
        soup_selector = ''
    return [soup_selector, selector[1:]]

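# For example, class_or_id('.main-content') returns ['class', 'main-content']
# and class_or_id('#main') returns ['id', 'main']; anything else makes
# get_links() fall back to searching the whole <body>.
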
def download_file(url):
    """
    Download one file into the current working directory.
    """
    # Use everything after the last slash as the local filename:
    filename = url[url.rfind('/')+1:]
    urllib.urlretrieve(url, filename)

def download_files(url_list, category):
    """
    Loop through the URLs and download each file into a directory named
    after the given category.
    """
    if not os.path.isdir(category):
        os.mkdir(category)
    os.chdir(category)
    for index, url in enumerate(url_list):
        download_file(url)
        filename = url[url.rfind('/')+1:]
        print '- [%s] Downloaded %s' % (index + 1, filename)

def main():
    """
    Main function that starts everything else.
    """
    # Get options:
    (opts, args) = get_options()
    url = opts.url
    selector = class_or_id(opts.css_class)

    # Start the parser:
    soup = get_soup(url)
    url_list = get_links(soup, selector, url)

    # DEBUG (only send part of the links):
    #url_list = url_list[21:25]

    # Name the download directory after the last segment of the URL:
    download_files(url_list, url[url.rfind('/')+1:])


if __name__ == '__main__':
    main()
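Note that the gist is Python 2 throughout (optparse, urlparse, urllib.urlretrieve, print statements). As a minimal sketch of the porting surface only, the two Python-2-only imports map onto the Python 3 standard library like this:

# Python 3 equivalents (sketch; not part of the original gist):
from urllib.parse import urljoin        # replaces: from urlparse import urljoin
from urllib.request import urlretrieve  # replaces: urllib.urlretrieve(...)

def download_file(url):
    """Download one file into the current working directory."""
    filename = url[url.rfind('/') + 1:]
    urlretrieve(url, filename)

The print statements would also become print() calls, and optparse, while still available in Python 3, is superseded by argparse.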