Skip to content

Instantly share code, notes, and snippets.

@audriusrudalevicius
Created November 28, 2018 08:50
Show Gist options
  • Save audriusrudalevicius/2ef4f36375746fc5270b078d16404323 to your computer and use it in GitHub Desktop.
Save audriusrudalevicius/2ef4f36375746fc5270b078d16404323 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import concurrent
import csv
import hashlib
import os
import sys
import time
from concurrent.futures import ThreadPoolExecutor, _base
from contextlib import closing
from urllib.parse import urlparse
import requests
from tqdm import tqdm
class SingleThreadPoolExecutor(ThreadPoolExecutor):
    """Thread pool that can fall back to running tasks inline.

    When ``max_workers`` is an integer below 1, no worker threads are created
    and :meth:`submit` executes the callable immediately in the calling
    thread, returning an already-completed future.  For any other value the
    class behaves like a regular ``ThreadPoolExecutor``.
    """

    def __init__(self, max_workers=None, thread_name_prefix=''):
        """Create the executor.

        Args:
            max_workers: Number of worker threads.  A value below 1 selects
                inline (same-thread) mode; ``None`` defers to the
                ``ThreadPoolExecutor`` default.
            thread_name_prefix: Mandatory, non-empty name prefix for the
                worker threads.

        Raises:
            Exception: If ``thread_name_prefix`` is empty.
        """
        # ``None`` cannot be compared with an int, so test for it first and
        # treat it as "use a real pool with the default size".
        self._same_thread_mode = max_workers is not None and max_workers < 1
        if len(thread_name_prefix) < 1:
            raise Exception('Thread pool with out name')
        if not self._same_thread_mode:
            super().__init__(max_workers, thread_name_prefix)

    def submit(self, fn, *args, **kwargs):
        """Schedule ``fn(*args, **kwargs)`` and return a future for its result.

        In inline mode the call happens right here, and whatever it returns
        or raises is stored on the returned (already finished) future.
        """
        if not self._same_thread_mode:
            return super().submit(fn, *args, **kwargs)
        future = _base.Future()
        try:
            result = fn(*args, **kwargs)
        except BaseException as exc:
            # Mirror worker-thread behaviour: the error surfaces when the
            # caller asks the future for its result.
            future.set_exception(exc)
        else:
            future.set_result(result)
        return future

    def shutdown(self, *args, **kwargs):
        """Shut the pool down; a no-op in inline mode.

        The parent initialiser is skipped in inline mode, so the parent
        ``shutdown`` (also invoked when leaving a ``with`` block) would fail
        on the pool state that was never created.
        """
        if not self._same_thread_mode:
            super().shutdown(*args, **kwargs)
class Downloader:
    """Download batches of URLs into one directory using a worker pool."""

    # (connect, read) timeouts in seconds passed to requests, so that a
    # server which stops responding cannot block a worker indefinitely.
    _REQUEST_TIMEOUT = (10, 60)

    def __init__(self, download_to, parallel_count):
        """Set up the pool.

        Args:
            download_to: Directory that downloaded files are written into.
            parallel_count: Worker count handed to the executor.
        """
        self._pool = SingleThreadPoolExecutor(parallel_count, 'DownloaderPool')
        self._destination_path = download_to

    @staticmethod
    def _discard(path):
        """Remove ``path`` if possible, ignoring filesystem errors."""
        try:
            os.remove(path)
        except OSError:
            pass

    @staticmethod
    def _transfer(url, destination, chunk_size):
        """Make one attempt at fetching ``url`` into ``destination``.

        Args:
            url: Address to fetch.
            destination: File path to write the body to.
            chunk_size: Size in bytes of the chunks read from the response.

        Returns:
            ``destination`` on success (including when a file of the
            advertised size is already present), or ``None`` when the attempt
            failed for any reason.
        """
        try:
            with requests.get(url, stream=True,
                              timeout=Downloader._REQUEST_TIMEOUT) as response:
                if not response.ok:
                    return None
                advertised = response.headers.get('Content-Length')
                if advertised is None:
                    # Without an advertised size the result cannot be
                    # validated, so the attempt counts as failed.
                    return None
                expected_size = int(advertised)
                if (os.path.exists(destination)
                        and os.path.getsize(destination) == expected_size):
                    # Already downloaded earlier; skip the transfer.
                    return destination
                try:
                    with open(destination, 'wb') as file:
                        for chunk in response.iter_content(chunk_size):
                            if chunk:
                                file.write(chunk)
                except BaseException:
                    # Do not leave a truncated file behind when the transfer
                    # is interrupted; the error itself is re-raised.
                    Downloader._discard(destination)
                    raise
                if expected_size < 1 or os.path.getsize(destination) < expected_size:
                    Downloader._discard(destination)
                    return None
                return destination
        except Exception:
            # Best effort: network, filesystem and parsing errors all count
            # as a failed attempt.  KeyboardInterrupt and SystemExit are
            # deliberately not caught here.
            return None

    @staticmethod
    def _download(item):
        """Fetch one ``(url, destination)`` pair, retrying on failure.

        Returns:
            The destination path once an attempt succeeds, or ``None`` after
            all attempts have failed.
        """
        max_retry = 5
        chunk_size = 1 << 15
        url, destination = item
        for attempt in range(max_retry):
            if attempt > 0:
                print('Retrying to download {} to {} ({}/{})'.format(url, destination, attempt, max_retry))
            result = Downloader._transfer(url, destination, chunk_size)
            if result is not None:
                return result
            if attempt + 1 < max_retry:
                # Linear back-off; no point in waiting after the last try.
                time.sleep(attempt + 1)
        return None

    def download_batch(self, urls):
        """Download every URL in ``urls``.

        The URLs are consumed eagerly and paired with their target paths.

        Returns:
            The iterator produced by ``tqdm_parallel_map`` over the
            per-URL results of :meth:`_download`.
        """
        source = [(url, mk_dest_path(url, self._destination_path)) for url in urls]
        return tqdm_parallel_map(self._pool, self._download, source)
def tqdm_parallel_map(executor, fn, *iterables, **kwargs):
    """Run ``fn`` over every item through ``executor`` with a progress bar.

    One task is submitted per item of each iterable in ``iterables``.  The
    results are yielded as the tasks finish, which is not necessarily the
    order of submission.  Extra keyword arguments are forwarded to ``tqdm``.
    """
    pending = [
        executor.submit(fn, item)
        for source in iterables
        for item in source
    ]
    completed = concurrent.futures.as_completed(pending)
    progress = tqdm(completed, total=len(pending), **kwargs)
    for finished in progress:
        yield finished.result()
def mk_dest_path(url, destination_path):
    """Return the file path under ``destination_path`` to store ``url`` in.

    The file name is the last segment of the URL's path component.  When
    that segment is empty (for example the URL ends with ``/`` or has no
    path at all), the hexadecimal MD5 digest of the whole URL is used.
    """
    file_name = os.path.basename(urlparse(url).path)
    if not file_name:
        # Only hash when a fallback name is actually needed.
        file_name = hashlib.md5(url.encode("utf-8")).hexdigest()
    return os.path.join(destination_path, file_name)
# Script entry: argv[1] is a CSV file whose first column holds the URLs,
# argv[2] is the directory the files are downloaded into.
if len(sys.argv) < 3:
    raise Exception('2 arguments required')
download_to = os.path.realpath(sys.argv[2])
downloads_file = os.path.realpath(sys.argv[1])
# A missing target directory would make every transfer fail when it opens
# its output file, so create it up front.
os.makedirs(download_to, exist_ok=True)
downloader = Downloader(download_to, 20)
# newline='' is the mode the csv module documents for files given to reader().
with open(downloads_file, mode='r', newline='') as infile:
    reader = csv.reader(infile)
    # Blank lines parse to empty rows; skip them and rows without a URL.
    urls = (columns[0] for columns in reader if columns and columns[0].strip())
    # A result of None marks a URL that could not be downloaded.
    failed = sum(1 for result in downloader.download_batch(urls) if result is None)
if failed:
    print('{} download(s) failed'.format(failed), file=sys.stderr)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment