Created September 15, 2013 04:02
Get URLs from a file list, recursively and in parallel
'''Creates a directory based on the id in each line
and recursively downloads the site's files to a specified depth.
Files are saved with the directory structure of the site, as laid
out by the wget implementation.

Example site_list.txt file:

id_345 http://www.stackoverflow.com
id_367 http://stats.stackexchange.com
id_378 http://www.google.com
'''
import multiprocessing, subprocess, re

def getSiteRecursive(id, url, depth=2):
    # Mirror the site into a directory named after the id, following links to the given depth.
    cmd = "wget -P " + id + " -r -l " + str(depth) + " " + url
    subprocess.call(cmd, shell=True)

input_file = "site_list.txt"
jobs = []
# Cap the number of concurrent downloads based on the CPU count.
max_jobs = multiprocessing.cpu_count() * 2 + 1

with open(input_file) as f:
    for line in f:
        id_url = re.compile(r"\s+").split(line)
        if len(id_url) >= 2:
            try:
                print "Grabbing " + id_url[1] + " into " + id_url[0] + " recursively..."
                # If the pool is full, wait for the oldest job to finish before starting another.
                if len(jobs) >= max_jobs:
                    jobs[0].join()
                    del jobs[0]
                p = multiprocessing.Process(target=getSiteRecursive, args=(id_url[0], id_url[1], 2,))
                jobs.append(p)
                p.start()
            except Exception, e:
                print "Error for " + id_url[1] + ": " + str(e)
                pass

# Wait for any remaining downloads to finish.
for j in jobs:
    j.join()
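For comparison, here is a minimal Python 3 sketch of the same bounded-parallelism idea, using concurrent.futures instead of managing multiprocessing.Process objects by hand. The input file name, id/URL format, and wget flags are carried over from the script above; the worker count of 8 and the name get_site_recursive are illustrative assumptions, not part of the original gist.

import subprocess
from concurrent.futures import ThreadPoolExecutor

def get_site_recursive(site_id, url, depth=2):
    # Same wget invocation as above, passed as an argument list instead of a shell string.
    subprocess.call(["wget", "-P", site_id, "-r", "-l", str(depth), url])

# Assumed worker count: each submitted job waits in the executor's queue
# until one of the 8 worker threads is free, so at most 8 wget processes run at once.
with open("site_list.txt") as f, ThreadPoolExecutor(max_workers=8) as pool:
    for line in f:
        parts = line.split()
        if len(parts) >= 2:
            pool.submit(get_site_recursive, parts[0], parts[1])

Threads are sufficient here because the real work happens in the wget child processes; the Python threads just block on subprocess.call while the downloads run.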
Part of an answer to http://stackoverflow.com/questions/18806924/wget-reading-from-a-list-with-id-numbers-and-urls/18807093#18807093