@Hellowlol
Last active October 16, 2016 18:15
Better http for py2
from functools import partial
from multiprocessing.dummy import Pool as ThreadPool

import tqdm
import urllib3

# Prefer the fastest available JSON decoder.
try:
    from ujson import loads
except ImportError:
    try:
        from simplejson import loads
    except ImportError:
        from json import loads
def http(urls, headers=None, timeout=10, method='GET', cb=None, workers=10, chunk=None, rtype='content'):
    """Request a batch of urls concurrently, yielding results as they finish."""
    if isinstance(urls, str):
        # Accept a whitespace- or comma-separated string of urls.
        urls = urls.replace(',', ' ').split()

    if not urls:
        return

    if headers is None:
        headers = {}

    # Chunksize heuristic from cpython's multiprocessing.Pool, e.g.
    # 95 urls with 10 workers: divmod(95, 40) -> (2, 15), so chunk = 3.
    if chunk is None:
        chunk, extra = divmod(len(urls), workers * 4)
        if extra:
            chunk += 1

    method = method.upper()

    def _http_requests_urllib3(url, session, headers, timeout=10, method='GET', rtype='json', cb=None):
        x = session.request(method, url, headers=headers, timeout=timeout).data
        if rtype == 'text':
            x = x.decode('utf-8', 'ignore')
        elif rtype == 'json':
            x = loads(x.decode('utf-8', 'ignore'))
        if cb:
            return cb(x)
        return x

    session = urllib3.PoolManager()  # default is 10 pools
    part = partial(_http_requests_urllib3, session=session, headers=headers,
                   timeout=timeout, method=method, rtype=rtype, cb=cb)

    if len(urls) == 1:
        # No pool needed for a single url.
        yield part(urls[0])
        return

    pool = ThreadPool(workers)
    try:
        for work in tqdm.tqdm(pool.imap_unordered(part, urls, chunk)):
            yield work
    except Exception as e:
        print(e)
    finally:
        pool.close()
        pool.join()

def cb(func):
    print('Im a callback')
    return func


for a_url in http(['url1', 'url2'], cb=cb, headers={}, rtype='json'):
    do_something(a_url)
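
Because urls can also be a single whitespace- or comma-separated string, the same call works without building a list first (the urls below are placeholders):

for text in http('http://example.com/a, http://example.com/b', rtype='text'):
    print(text)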
JonnyWong16 commented Oct 16, 2016

Basically, 10 workers requesting chunks from a list of URLs, then yielding results as they finish.
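
A minimal standalone sketch of that yield-as-they-finish behavior, with sleeps standing in for HTTP requests (fake_fetch and the delay values are made up for illustration):

import time
from multiprocessing.dummy import Pool as ThreadPool

def fake_fetch(delay):
    # Stand-in for an HTTP request; sleeps instead of fetching.
    time.sleep(delay)
    return delay

pool = ThreadPool(3)
try:
    # Results arrive in completion order (1, 2, 3), not input order (3, 1, 2).
    for result in pool.imap_unordered(fake_fetch, [3, 1, 2]):
        print(result)
finally:
    pool.close()
    pool.join()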
