Created September 15, 2013 04:02
Get URLs from a file list, recursively and in parallel
'''Creates a directory based on the id in each line
and recursively downloads the site's files to a specified depth.
Files are saved with the directory structure of the site, as laid
out by the wget implementation.

Example site_list.txt file:

id_345 http://www.stackoverflow.com
id_367 http://stats.stackexchange.com
id_378 http://www.google.com
'''
import multiprocessing, subprocess, re

def getSiteRecursive(id, url, depth=2):
    # Mirror the site into a directory named after the id, following links to the given depth.
    cmd = "wget -P " + id + " -r -l " + str(depth) + " " + url
    subprocess.call(cmd, shell=True)

input_file = "site_list.txt"
jobs = []
# Cap the number of concurrent downloads based on the CPU count.
max_jobs = multiprocessing.cpu_count() * 2 + 1

with open(input_file) as f:
    for line in f:
        id_url = re.compile(r"\s+").split(line)
        if len(id_url) >= 2:
            try:
                print "Grabbing " + id_url[1] + " into " + id_url[0] + " recursively..."
                # If the pool is full, wait for the oldest job to finish before starting another.
                if len(jobs) >= max_jobs:
                    jobs[0].join()
                    del jobs[0]
                p = multiprocessing.Process(target=getSiteRecursive, args=(id_url[0], id_url[1], 2,))
                jobs.append(p)
                p.start()
            except Exception, e:
                print "Error for " + id_url[1] + ": " + str(e)
                pass

# Wait for any remaining downloads to finish.
for j in jobs:
    j.join()
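For comparison, here is a minimal Python 3 sketch of the same bounded-parallelism idea, using concurrent.futures instead of managing multiprocessing.Process objects by hand. The input file name, id/URL format, and wget flags are carried over from the script above; the worker count of 8 and the name get_site_recursive are illustrative assumptions, not part of the original gist.

import subprocess
from concurrent.futures import ThreadPoolExecutor

def get_site_recursive(site_id, url, depth=2):
    # Same wget invocation as above, passed as an argument list instead of a shell string.
    subprocess.call(["wget", "-P", site_id, "-r", "-l", str(depth), url])

# Assumed worker count: each submitted job waits in the executor's queue
# until one of the 8 worker threads is free, so at most 8 wget processes run at once.
with open("site_list.txt") as f, ThreadPoolExecutor(max_workers=8) as pool:
    for line in f:
        parts = line.split()
        if len(parts) >= 2:
            pool.submit(get_site_recursive, parts[0], parts[1])

Threads are sufficient here because the real work happens in the wget child processes; the Python threads just block on subprocess.call while the downloads run.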
Part of an answer to http://stackoverflow.com/questions/18806924/wget-reading-from-a-list-with-id-numbers-and-urls/18807093#18807093