Skip to content

Instantly share code, notes, and snippets.

@ei-grad
Created October 4, 2017 09:58
Show Gist options
  • Save ei-grad/f2da08481e6271d4a02b0d18f18cfb00 to your computer and use it in GitHub Desktop.
Save ei-grad/f2da08481e6271d4a02b0d18f18cfb00 to your computer and use it in GitHub Desktop.
Asynchronously fetch URLs in parallel (via Tornado) and apply a specified function to each response
from tornado.ioloop import IOLoop
from tornado.gen import coroutine
from tornado.httpclient import AsyncHTTPClient, HTTPRequest
from tornado.queues import Queue
# tqdm is an optional dependency: when it cannot be imported, the name is
# bound to None so later code can check for it before using it.
try:
    from tqdm import tqdm
except ImportError:
    tqdm = None
def map(requests, mapper, parallel=32, httpclient=None):
    """Fetch every request concurrently and pass each response to ``mapper``.

    The call blocks: it drives the current IOLoop with ``run_sync`` until
    every queued request has been handled (or a worker has failed).

    NOTE: the name shadows the builtin ``map``; it is kept because it is
    this module's public entry point.

    Args:
        requests: iterable of request objects accepted by
            ``httpclient.fetch``. It is wrapped in ``tqdm`` when tqdm could
            be imported.
        mapper: callable invoked as ``mapper(req, resp)`` for each fetched
            response. Responses are fetched with ``raise_error=False``.
        parallel: number of worker coroutines; also the ``maxsize`` of the
            queue that feeds them.
        httpclient: object whose ``fetch`` is used. Defaults to a new
            ``AsyncHTTPClient()``.

    Raises:
        Exception: the first error recorded by a worker -- the error
            rethrown from a response with code 599, an error raised by
            ``fetch`` itself, or anything raised by ``mapper``.
    """
    if tqdm is not None:
        requests = tqdm(requests)
    if httpclient is None:
        httpclient = AsyncHTTPClient()
    q = Queue(maxsize=parallel)
    # Errors raised inside workers. Only the first one is re-raised, but any
    # entry at all tells the producer and the other workers to stop.
    errors = []

    @coroutine
    def worker():
        while True:
            req = yield q.get()
            try:
                # After a failure the remaining items are only drained, so
                # that q.join() can complete without further fetches.
                if not errors:
                    resp = yield httpclient.fetch(req, raise_error=False)
                    if resp.code == 599:
                        resp.rethrow()
                    mapper(req, resp)
            except Exception as exc:
                # Deliberately broad: the error is not swallowed, it is
                # re-raised from runner() once the queue has drained.
                errors.append(exc)
            finally:
                # Must run on every path; a missed task_done() would leave
                # q.join() waiting forever. It is a plain call, not a
                # yieldable.
                q.task_done()

    @coroutine
    def runner():
        for _ in range(parallel):
            IOLoop.current().spawn_callback(worker)
        for req in requests:
            if errors:
                break
            yield q.put(req)
        yield q.join()
        if errors:
            raise errors[0]

    # run_sync stops the loop itself, including when runner() raises. No
    # extra stop() is issued afterwards: stopping a loop that is not running
    # makes its next start() return immediately.
    IOLoop.current().run_sync(runner)
if __name__ == "__main__":
    # Command-line mode: read one URL per line from stdin, issue a HEAD
    # request for each, and print the status code and URL of every response
    # whose code is not 200.
    import sys

    def mapper(req, resp):
        """Print the status code and URL of any response that is not 200."""
        if resp.code != 200:
            print('%d %s' % (resp.code, req.url))

    AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient")
    try:
        map([
            HTTPRequest(method='HEAD', url=url)
            for url in (line.strip() for line in sys.stdin)
            # Skip blank lines (e.g. a trailing newline) instead of building
            # a request with an empty URL.
            if url
        ], mapper)
    except KeyboardInterrupt:
        # Ctrl-C ends the run quietly, without a traceback.
        pass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment