Bulk download images from a given page, limited to a CSS selector (class or ID).
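A minimal usage sketch (the script name download_images.py, the URL, and the selector below are placeholders, not part of the gist). Because the flags only mark which options were given, the URL and selector values are picked up as positional arguments, in that order:

python download_images.py -u http://example.com/gallery -c .main-content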
#!/usr/bin/python
"""
Download Images
"""
import optparse
from bs4 import BeautifulSoup
import requests
from urlparse import urlsplit
import urllib
import os

def get_options():
    """
    Define and retrieve options from the command line
    """
    parser = optparse.OptionParser()
    parser.add_option('-u', help='URL of the page to parse (required)', dest='url', action='store_true')
    parser.add_option('-c',
                      help='CSS class or ID to parse, e.g. main-content (required)',
                      dest='css_class',
                      action='store_true')
    (opts, args) = parser.parse_args()

    # Make sure all mandatory options are set:
    mandatory_options = ['url', 'css_class']
    for m in mandatory_options:
        if not opts.__dict__[m]:
            print "A mandatory option is missing!\n"
            parser.print_help()
            exit(-1)
    return opts, args

def get_soup(url):
    """
    Retrieve HTML soup from given URL
    """
    r = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data, 'html.parser')
    return soup

def get_links(soup, selector, url):
    """
    Parse the given page and retrieve the list of links to download
    """
    url_list = []

    # choose selector:
    if selector[0] == 'id':
        divs = soup.find_all(id=selector[1])
    elif selector[0] == 'class':
        divs = soup.find_all(class_=selector[1], limit=1)
    else:
        divs = soup.find_all('body', limit=1)

    # then retrieve all links:
    for div in divs:
        for link in div.find_all('a'):
            href = str(link.get('href'))
            # ignore empty links, anchors, and mailto:
            if href != '' and href[0] != '#' and 'None' not in href and 'mailto:' not in href:
                href = sanitize_url(url, link.get('href'))
                url_list.append(href)
    print 'Found %s links (Selector: %s).' % (len(url_list), selector)

    # remove duplicates while preserving order:
    seen = set()
    url_list = [u for u in url_list if not (u in seen or seen.add(u))]
    print 'Removing duplicates, the list was reduced to %s links.' % len(url_list)
    return url_list

def sanitize_url(url, current_url):
    """
    Account for internal links: if there's no netloc, prepend the base (given) URL, e.g.
    SplitResult(scheme='http', netloc='', path=u'abc.html', query='', fragment=u'')
    """
    current_url_parts = urlsplit(current_url)
    if current_url_parts.netloc:
        # absolute link: keep its own host (and scheme, falling back to http)
        scheme = current_url_parts.scheme or 'http'
        sanitized_url = scheme + '://' + current_url_parts.netloc + current_url_parts.path
    else:
        # relative link: prepend scheme and host of the base URL
        url_parts = urlsplit(url)
        scheme = url_parts.scheme or 'http'
        sanitized_url = scheme + '://' + url_parts.netloc + current_url_parts.path
    return sanitized_url

def class_or_id(selector):
    """
    Differentiate between classes and ids in the way jQuery does (#id, .class)
    """
    if selector[0] == '.':
        soup_selector = 'class'
    elif selector[0] == '#':
        soup_selector = 'id'
    else:
        soup_selector = ''
    return [soup_selector, selector[1:]]

def download_file(url):
    """
    Download one file
    """
    filename = url[url.rfind('/')+1:]
    urllib.urlretrieve(url, filename)

def download_files(url_list, category):
    """
    Loop through the URLs and download each file into a directory named after the category
    """
    if not os.path.isdir(category):
        os.mkdir(category)
    os.chdir(category)
    for i, url in enumerate(url_list):
        download_file(url)
        # derive the filename from the url:
        filename = url[url.rfind('/')+1:]
        print '- [%s] Downloaded %s' % (i+1, filename)

def main():
    """
    Main function that starts everything else
    """
    # get options:
    (opts, args) = get_options()
    url = str(args[0]) if opts.url else ""
    css_class = str(args[1]) if opts.css_class else ""

    # start parser:
    soup = get_soup(url)
    css_class = class_or_id(css_class)
    url_list = get_links(soup, css_class, url)

    # DEBUG (only send part of the links):
    #url_list = url_list[21:25]
    #print 'len url_list', len(url_list)
    #print 'url_list', url_list

    download_files(url_list, url[url.rfind('/')+1:])


if __name__ == '__main__':
    main()
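As written, the script targets Python 2 (print statements, urlparse, urllib.urlretrieve). Purely as a porting note, not part of the original gist, the corresponding Python 3 imports would be:

# Python 3 equivalents (a sketch; the script above is Python 2):
from urllib.parse import urlsplit        # replaces: from urlparse import urlsplit
from urllib.request import urlretrieve   # replaces: urllib.urlretrieve
# print statements become print() function calls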