Created
March 19, 2011 10:11
-
-
Save craSH/877372 to your computer and use it in GitHub Desktop.
A few functions that download files concurrently. Uses Python's multiprocessing module (worker processes, not threads).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""
A few functions that serve to download files in a threaded manner.
Essentially a map() which is threaded, with a URL fetching function

Copyleft 2010 Ian Gallagher <[email protected]>
"""
import os, sys, urllib
import socket

# Set socket timeout to 2 seconds - adjust as needed. Applies to urllib.
socket.setdefaulttimeout(2)

# http://docs.python.org/library/multiprocessing.html
from multiprocessing import Pool

# Number of concurrent workers (NOTE: multiprocessing.Pool spawns worker
# *processes*, not threads, despite the name used here). Find the sweet spot
# based on your connection and the ability of the server.
thread_count = 20

# Verbosity can be 0 (no display), 1 (+ for successful download, . for fail),
# or 2, which prints detailed info for the URL fetched and file saved
verbosity = 2
def pmap(func, iterable, threads=thread_count):
    """
    Behave like Python's map(), but perform actions concurrently.

    Despite the parameter name, this uses a multiprocessing.Pool, so func
    runs in separate worker *processes* (func must therefore be picklable).

    func: A function which takes a single item to act on. May return a value.
    iterable: An iterable object whose items will be passed to func()
    threads: The number of concurrent workers applied to the items

    Returns a list of func's results, in input order.
    """
    pool = Pool(threads)
    try:
        return pool.map(func, iterable)
    finally:
        # The original never shut the pool down, leaking worker processes
        # on every call; make sure they exit even if map() raises.
        pool.close()
        pool.join()
def fetch_file(args):
    """
    Fetch a file in to a given location on disk.

    Accepts a single tuple (so it can be passed one item at a time by
    pmap()/Pool.map) with the following items:
      storage_location: Absolute directory on disk where file will be written to
      file_url: URL of file to download

    Returns the local path of the saved file on success, False on failure.
    """
    # Unpack inside the body: tuple parameter unpacking in the signature is
    # Python 2-only syntax (removed by PEP 3113). The call convention for
    # existing callers -- fetch_file((dir, url)) -- is unchanged.
    storage_location, file_url = args
    # Derive the local filename from the last path segment of the URL.
    local_path = os.path.join(storage_location, file_url.rsplit('/', 1)[-1])
    saved_file = None
    try:
        saved_file, remote_headers = urllib.urlretrieve(file_url, filename=local_path)
    except Exception as ex:
        if 1 == verbosity:
            sys.stdout.write('.')
            sys.stdout.flush()
        elif 2 == verbosity:
            # Include the exception itself; the original bound `ex` but never
            # used it, leaving failures undiagnosable.
            sys.stderr.write("ERROR: Failed to save '%s' to '%s': %s\n" % (file_url, local_path, ex))
            sys.stderr.flush()
        # verbosity 0 (or anything else): stay silent.
    if saved_file:
        if 1 == verbosity:
            sys.stdout.write('+')
            sys.stdout.flush()
        elif 2 == verbosity:
            sys.stdout.write("Successfully downloaded '%s' to '%s'\n" % (file_url, local_path))
            sys.stdout.flush()
        return saved_file
    else:
        return False
def pool_urlretrieve(save_path, url_iterable):
    """
    Download every URL in url_iterable into the directory save_path.

    save_path: directory files are saved into; created if it does not exist
    url_iterable: iterable of URL strings to fetch

    Raises an Exception if save_path exists but is a regular file.
    Returns the list of fetch_file() results (saved path or False per URL).
    """
    if os.path.isfile(save_path):
        raise Exception("A file exists in the specified save_path, aborting.")
    elif not os.path.isdir(save_path):
        # BUG FIX: the original called the non-existent os.makdirs(), which
        # raised AttributeError whenever the directory had to be created.
        os.makedirs(save_path)
    # Pair each URL with the destination directory, as fetch_file() expects.
    dirs_urls = ((save_path, x) for x in url_iterable)
    results = pmap(fetch_file, dirs_urls)
    if 1 == verbosity:
        # Terminate the '+'/'.' progress line.
        sys.stdout.write('\n')
        sys.stdout.flush()
    return results
| if '__main__' == __name__: | |
| """ | |
| If called as main, run a little test. Also serves as an example for the reader | |
| """ | |
| storage_location = "/tmp" | |
| urls = [ | |
| "http://www.google.com/robots.txt", | |
| "https://neg9.org/communicate", | |
| "http://aaaaaaneg9.org/fail", | |
| "http://datalinkcontrol.net/ip" | |
| ] | |
| print "About to pull down %d URLs with %d threads at a time" % (len(urls), thread_count) | |
| results = pool_urlretrieve(storage_location, urls) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment