Created December 26, 2013 14:24
#! /usr/bin/env python
# -*- coding: iso-8859-1 -*-
# vi:ts=4:et
# $Id$
#
# Usage: python retriever.py <file with URLs to fetch> [<# of concurrent connections>]
#

import sys, threading, Queue
import pycurl

# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
    pass

# Get args
num_conn = 10
try:
    if sys.argv[1] == "-":
        urls = sys.stdin.readlines()
    else:
        urls = open(sys.argv[1]).readlines()
    if len(sys.argv) >= 3:
        num_conn = int(sys.argv[2])
except:
    print "Usage: %s <file with URLs to fetch> [<# of concurrent connections>]" % sys.argv[0]
    raise SystemExit

# Make a queue with (url, filename) tuples
queue = Queue.Queue()
for url in urls:
    url = url.strip()
    if not url or url[0] == "#":
        continue
    filename = "doc_%03d.dat" % (len(queue.queue) + 1)
    queue.put((url, filename))

# Check args
assert queue.queue, "no URLs given"
num_urls = len(queue.queue)
num_conn = min(num_conn, num_urls)
assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
print "PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"

class WorkerThread(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while 1:
            try:
                url, filename = self.queue.get_nowait()
            except Queue.Empty:
                # Queue is drained - this worker has nothing left to do.
                raise SystemExit
            fp = open(filename, "wb")
            curl = pycurl.Curl()
            curl.setopt(pycurl.URL, url)
            curl.setopt(pycurl.FOLLOWLOCATION, 1)
            curl.setopt(pycurl.MAXREDIRS, 5)
            curl.setopt(pycurl.CONNECTTIMEOUT, 30)
            curl.setopt(pycurl.TIMEOUT, 300)
            curl.setopt(pycurl.NOSIGNAL, 1)
            curl.setopt(pycurl.WRITEDATA, fp)
            try:
                curl.perform()
            except:
                import traceback
                traceback.print_exc(file=sys.stderr)
                sys.stderr.flush()
            curl.close()
            fp.close()
            # Print one dot per completed URL as a simple progress indicator.
            sys.stdout.write(".")
            sys.stdout.flush()

# Start a bunch of threads
threads = []
for dummy in range(num_conn):
    t = WorkerThread(queue)
    t.start()
    threads.append(t)

# Wait for all threads to finish
for thread in threads:
    thread.join()
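For reference, a typical invocation might look like the following, assuming a hypothetical plain-text file urls.txt with one URL per line (blank lines and lines starting with "#" are skipped). The optional second argument caps the number of worker threads, and each download is written to the current directory as doc_001.dat, doc_002.dat, and so on:

    python retriever.py urls.txt 5
    cat urls.txt | python retriever.py - 20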