Created
March 19, 2011 10:11
-
-
Save craSH/877372 to your computer and use it in GitHub Desktop.
A few functions that download files concurrently. Uses Python's multiprocessing module (worker processes, not threads).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""
A few functions that serve to download files in a threaded manner.
Essentially a map() which is threaded, with a URL fetching function

Copyleft 2010 Ian Gallagher <[email protected]>
"""
import os, sys, urllib
import socket

# Set socket timeout to 2 seconds - adjust as needed. Applies to urllib.
socket.setdefaulttimeout(2)

# http://docs.python.org/library/multiprocessing.html
from multiprocessing import Pool

# Number of concurrent workers (NOTE: multiprocessing.Pool spawns worker
# *processes*, not threads, despite the name used here). Find the sweet spot
# based on your connection and the ability of the server.
thread_count = 20

# Verbosity can be 0 (no display), 1 (+ for successful download, . for fail),
# or 2, which prints detailed info for the URL fetched and file saved
verbosity = 2
def pmap(func, iterable, threads=thread_count):
    """
    Behave like Python's map(), but perform actions concurrently.

    Despite the parameter name, this uses a multiprocessing.Pool, so func
    runs in separate worker *processes* (func must therefore be picklable).

    func: A function which takes a single item to act on. May return a value.
    iterable: An iterable object whose items will be passed to func()
    threads: The number of concurrent workers applied to the items

    Returns a list of func's results, in input order.
    """
    pool = Pool(threads)
    try:
        return pool.map(func, iterable)
    finally:
        # The original never shut the pool down, leaking worker processes
        # on every call; make sure they exit even if map() raises.
        pool.close()
        pool.join()
def fetch_file(args):
    """
    Fetch a file in to a given location on disk.

    Accepts a single tuple (so it can be passed one item at a time by
    pmap()/Pool.map) with the following items:
      storage_location: Absolute directory on disk where file will be written to
      file_url: URL of file to download

    Returns the local path of the saved file on success, False on failure.
    """
    # Unpack inside the body: tuple parameter unpacking in the signature is
    # Python 2-only syntax (removed by PEP 3113). The call convention for
    # existing callers -- fetch_file((dir, url)) -- is unchanged.
    storage_location, file_url = args
    # Derive the local filename from the last path segment of the URL.
    local_path = os.path.join(storage_location, file_url.rsplit('/', 1)[-1])
    saved_file = None
    try:
        saved_file, remote_headers = urllib.urlretrieve(file_url, filename=local_path)
    except Exception as ex:
        if 1 == verbosity:
            sys.stdout.write('.')
            sys.stdout.flush()
        elif 2 == verbosity:
            # Include the exception itself; the original bound `ex` but never
            # used it, leaving failures undiagnosable.
            sys.stderr.write("ERROR: Failed to save '%s' to '%s': %s\n" % (file_url, local_path, ex))
            sys.stderr.flush()
        # verbosity 0 (or anything else): stay silent.
    if saved_file:
        if 1 == verbosity:
            sys.stdout.write('+')
            sys.stdout.flush()
        elif 2 == verbosity:
            sys.stdout.write("Successfully downloaded '%s' to '%s'\n" % (file_url, local_path))
            sys.stdout.flush()
        return saved_file
    else:
        return False
def pool_urlretrieve(save_path, url_iterable):
    """
    Download every URL in url_iterable into the directory save_path.

    save_path: directory files are saved into; created if it does not exist
    url_iterable: iterable of URL strings to fetch

    Raises an Exception if save_path exists but is a regular file.
    Returns the list of fetch_file() results (saved path or False per URL).
    """
    if os.path.isfile(save_path):
        raise Exception("A file exists in the specified save_path, aborting.")
    elif not os.path.isdir(save_path):
        # BUG FIX: the original called the non-existent os.makdirs(), which
        # raised AttributeError whenever the directory had to be created.
        os.makedirs(save_path)
    # Pair each URL with the destination directory, as fetch_file() expects.
    dirs_urls = ((save_path, x) for x in url_iterable)
    results = pmap(fetch_file, dirs_urls)
    if 1 == verbosity:
        # Terminate the '+'/'.' progress line.
        sys.stdout.write('\n')
        sys.stdout.flush()
    return results
| if '__main__' == __name__: | |
| """ | |
| If called as main, run a little test. Also serves as an example for the reader | |
| """ | |
| storage_location = "/tmp" | |
| urls = [ | |
| "http://www.google.com/robots.txt", | |
| "https://neg9.org/communicate", | |
| "http://aaaaaaneg9.org/fail", | |
| "http://datalinkcontrol.net/ip" | |
| ] | |
| print "About to pull down %d URLs with %d threads at a time" % (len(urls), thread_count) | |
| results = pool_urlretrieve(storage_location, urls) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment