A few functions that serve to download files in a threaded manner. Uses Python's multiprocessing module.
#!/usr/bin/env python
"""
A few functions that serve to download files in a threaded manner.
Essentially a threaded map(), built around a URL-fetching function.
(Despite the "thread" naming, multiprocessing.Pool actually uses worker
processes, not threads.)

Copyleft 2010 Ian Gallagher <[email protected]>
"""
import os
import socket
import sys
import urllib

# Set the default socket timeout to 2 seconds - adjust as needed. Applies to urllib.
socket.setdefaulttimeout(2)
# http://docs.python.org/library/multiprocessing.html
from multiprocessing import Pool

# Number of workers to run concurrently; find the sweet spot based on your
# connection and the capacity of the server.
thread_count = 20

# Verbosity can be 0 (no output), 1 ('+' for each successful download, '.' for
# each failure), or 2 (detailed info for each URL fetched and file saved).
verbosity = 2
def pmap(func, iterable, threads=thread_count):
    """
    Behave like Python's map(), but perform the calls concurrently in a pool
    of workers.

    func: A function which takes a single item to act on. May return a value.
    iterable: An iterable whose items will be passed to func()
    threads: The number of concurrent instances of func() to run on items
    """
    pool = Pool(threads)
    try:
        return pool.map(func, iterable)
    finally:
        # Clean up the worker pool once the map is complete
        pool.close()
        pool.join()
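# Illustrative example (not part of the original gist): pmap() behaves like
# map(), but fans the calls out across the pool. With multiprocessing the
# function must be defined at module level so it can be pickled, e.g.:
#
#   def square(x):
#       return x * x
#
#   pmap(square, range(10))  # -> [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]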
def fetch_file(args):
    """
    Fetch a file into a given location on disk.

    Accepts a single (storage_location, file_url) tuple, so that Pool.map()
    can pass it as one argument:
        storage_location: Absolute directory on disk the file will be written to
        file_url: URL of the file to download
    """
    storage_location, file_url = args
    local_path = os.path.join(storage_location, file_url.rsplit('/', 1)[-1])
    saved_file = None
    try:
        saved_file, remote_headers = urllib.urlretrieve(file_url, filename=local_path)
    except Exception as ex:
        if 1 == verbosity:
            sys.stdout.write('.')
            sys.stdout.flush()
        elif 2 == verbosity:
            sys.stderr.write("ERROR: Failed to save '%s' to '%s': %s\n" % (file_url, local_path, ex))
            sys.stderr.flush()
        # verbosity 0 (or anything else): stay silent

    if saved_file:
        if 1 == verbosity:
            sys.stdout.write('+')
            sys.stdout.flush()
        elif 2 == verbosity:
            sys.stdout.write("Successfully downloaded '%s' to '%s'\n" % (file_url, local_path))
            sys.stdout.flush()
        return saved_file
    else:
        return False
def pool_urlretrieve(save_path, url_iterable):
    """Download each URL in url_iterable into the directory save_path."""
    if os.path.isfile(save_path):
        raise Exception("A file exists at the specified save_path, aborting.")
    elif not os.path.isdir(save_path):
        os.makedirs(save_path)

    dirs_urls = ((save_path, x) for x in url_iterable)
    results = pmap(fetch_file, dirs_urls)
    if 1 == verbosity:
        sys.stdout.write('\n')
        sys.stdout.flush()
    return results
if '__main__' == __name__:
    # If called as main, run a little test. Also serves as an example for the reader.
    storage_location = "/tmp"
    urls = [
        "http://www.google.com/robots.txt",
        "https://neg9.org/communicate",
        "http://aaaaaaneg9.org/fail",
        "http://datalinkcontrol.net/ip",
    ]
    print "About to pull down %d URLs with %d threads at a time" % (len(urls), thread_count)
    results = pool_urlretrieve(storage_location, urls)
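The script above is Python 2 (print statement, urllib.urlretrieve,
tuple-unpacking parameters). Below is a minimal sketch of the same approach
for Python 3, assuming urllib.request.urlretrieve and concurrent.futures are
acceptable substitutes; with real threads there is no pickling constraint, so
a lambda can be passed straight to Executor.map():

#!/usr/bin/env python3
"""Hypothetical Python 3 port of the functions above; names mirror the original."""
import os
import socket
import sys
from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlretrieve

socket.setdefaulttimeout(2)  # as in the original; applies to urlretrieve
THREAD_COUNT = 20            # tune to your connection and the server

def fetch_file(storage_location, file_url):
    """Download file_url into storage_location; return the local path or None."""
    local_path = os.path.join(storage_location, file_url.rsplit('/', 1)[-1])
    try:
        saved_file, _headers = urlretrieve(file_url, filename=local_path)
        return saved_file
    except Exception as ex:
        sys.stderr.write("ERROR: '%s' -> '%s': %s\n" % (file_url, local_path, ex))
        return None

def pool_urlretrieve(save_path, urls):
    os.makedirs(save_path, exist_ok=True)
    with ThreadPoolExecutor(max_workers=THREAD_COUNT) as pool:
        # Executor.map preserves input order, like the builtin map()
        return list(pool.map(lambda url: fetch_file(save_path, url), urls))

if __name__ == "__main__":
    print(pool_urlretrieve("/tmp", ["http://www.google.com/robots.txt"]))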