Skip to content

Instantly share code, notes, and snippets.

@ctrochalakis
Created January 24, 2010 10:36
Show Gist options
  • Save ctrochalakis/285136 to your computer and use it in GitHub Desktop.
Save ctrochalakis/285136 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
#
# Copyright (c) 2010 Giorgos Keramidas.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
import os
import sys
# Upper bound on the number of wget processes running at the same time.
maxjobs = 8

# Base wget command line; the per-run options and the URL are appended to
# it when the job list is built.
wget = [
    "wget",
    "-q",
    "-nd",
    "-np",
    "-c",
    "-r",
]

# Queue of commands (argv lists) that have not been started yet.
jobs = list()

# Children currently running, keyed by PID.  spawn() stores each entry as
# {'pid': pid, 'cmd': argv}.
children = dict()
def spawn(cmd, *args):
    """Start `cmd` with `args` in the background and record it in children{}.

    The program is started with os.spawnlp() in os.P_NOWAIT mode, so this
    function returns without waiting for the child to finish.  On success
    the child is stored in the module-level children{} dict, keyed by its
    PID, as {'pid': pid, 'cmd': argv}.

    Returns the PID of the new child, or None when the child could not be
    started.  In the failure case the error is printed and children{} is
    left untouched.
    """
    argv = [cmd] + list(args)
    cmdline = "\x20".join(argv)
    try:
        pid = os.spawnlp(os.P_NOWAIT, cmd, *argv)
    except Exception as inst:
        # Best effort: report the failure and let the caller decide what to
        # do.  Returning here avoids formatting a None PID with '%d' below.
        print("'%s': %s" % (cmdline, str(inst)))
        return None
    children[pid] = {'pid': pid, 'cmd': argv}
    print("spawned pid %d of nproc=%d njobs=%d for '%s'" %
          (pid, len(children), len(jobs), cmdline))
    return pid
if __name__ == "__main__":
    # Usage: script [wget-option ...] url-file
    #        script [wget-option ...] -- url-file ...
    # A url-file of '-' means "read the URLs from standard input".
    params = sys.argv[1:]
    if not params:
        sys.stderr.write(
            "usage: %s [wget-option ...] [--] url-file ...\n" % sys.argv[0])
        sys.exit(1)

    # '--' splits wget arguments from url link files; if '--' is not
    # specified the last argument is used as the url file.
    if '--' in params:
        index = params.index('--')
        wget_args = params[:index]
        url_files = params[index + 1:]
    else:
        wget_args = params[:-1]
        url_files = params[-1:]

    # Build a list of wget jobs, one for each URL in our input file(s).
    for name in url_files:
        try:
            url_file = sys.stdin if name == '-' else open(name)
        except (IOError, OSError) as err:
            # An unreadable file should not prevent the others from being
            # processed; report it and move on.
            sys.stderr.write("%s: %s\n" % (name, err))
            continue
        try:
            for line in url_file:
                url = line.strip(' \r\n')
                # Skip blank lines instead of queueing wget with an empty URL.
                if url:
                    jobs.append(wget + wget_args + [url])
        except (IOError, OSError) as err:
            sys.stderr.write("%s: %s\n" % (name, err))
        finally:
            if url_file is not sys.stdin:
                url_file.close()
    print("%d wget jobs queued" % len(jobs))

    # Spawn at most maxjobs in parallel.  A job is always removed from the
    # queue, even when spawn() fails (spawn() reports the error itself);
    # otherwise a job that cannot be started would be retried forever.
    while jobs and len(children) < maxjobs:
        spawn(*jobs.pop(0))
    print("%d jobs spawned" % len(children))

    # Watch for dying children and keep spawning new jobs while we have
    # them, in an effort to keep <= maxjobs children active.  Every pass
    # refills all free slots, so the queue is empty once no child is left.
    while children:
        try:
            (pid, status) = os.wait()
        except OSError as err:
            # No child left to wait for although children{} is not empty.
            sys.stderr.write("wait: %s\n" % err)
            break
        child = children.pop(pid, None)
        if child is None:
            # Not a process started through spawn(); nothing to account for.
            continue
        print("pid %d exited. status=%d, nproc=%d, njobs=%d, cmd=%s" %
              (pid, status, len(children), len(jobs),
               "\x20".join(child['cmd'])))
        while jobs and len(children) < maxjobs:
            spawn(*jobs.pop(0))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment