#!/usr/bin/env python
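"""
Download the files given as command-line arguments.

Each URL is grouped by scheme and dispatched to an S3 (boto3) or HTTP(S)
(requests) downloader, with every download running in its own process.

Example invocation (illustrative; assumes the script is saved as download.py):
    python download.py s3://my-bucket/path/to/object.bin https://example.com/archive.zip
"""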

import re
import sys
from multiprocessing import Process

import boto3
import botocore.exceptions
import requests

# Collect the URL arguments, dropping the first element (the script name).
urls = sys.argv[1:]

# Group the URLs by scheme so each one can be dispatched to the matching downloader.
filtered_urls = {}


def is_downloadable(url):
    """
    Check whether the URL points to a downloadable resource rather than an HTML page.
    """
    h = requests.head(url)
    content_type = h.headers.get('content-type', '')
    if 'text' in content_type.lower():
        return False
    if 'html' in content_type.lower():
        return False
    return True


def s3dl(path):
    try:
        # Split the S3 URI into bucket, key (file path) and local file name.
        uri = re.match(r's3://(.+?)/(.+)', path)
        if uri is None:
            print("Invalid S3 URI: " + path)
            return False
        bucket = uri.group(1)
        file_path = uri.group(2)
        if '/' in file_path:
            file_name = file_path.rsplit('/', 1)[1]
        else:
            file_name = file_path
        s3 = boto3.resource('s3')
        s3.Bucket(bucket).download_file(file_path, file_name)
        return True
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print("The object does not exist.")
        else:
            raise


def httpdl(path):
    try:
        # Only fetch the file if the URL looks like a downloadable resource.
        if is_downloadable(path):
            # Derive the local file name from the last path segment of the URL.
            filename = path.rsplit('/', 1)[-1]
            r = requests.get(path, stream=True)
            with open(filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
            return True
        else:
            print('File is not downloadable.')
            return False
    except Exception as e:
        print("Error downloading file: {}".format(e))
        return False


if __name__ == "__main__":
    # Iterate through the URLs and group them by scheme (s3, http, https).
    for url in urls:
        if '://' not in url:
            print("Skipping URL without a scheme: " + url)
            continue
        explode = url.split('://', 1)
        if explode[0] not in filtered_urls:
            filtered_urls[explode[0]] = []
        filtered_urls[explode[0]].append(explode[1])

    # Collect the processes that will run the downloads in parallel.
    processes = []

    # Iterate through the grouped URLs and create a download process for each.
    for download_type in filtered_urls:
        for path in filtered_urls[download_type]:
            download = download_type + '://' + path
            if download_type == "s3":
                process = Process(target=s3dl, args=(download,))
                processes.append(process)
            elif download_type in ("http", "https"):
                process = Process(target=httpdl, args=(download,))
                processes.append(process)

    # Start the processes.
    for process in processes:
        process.start()

    # Wait for every process to finish and report its exit status.
    for process in processes:
        process.join()
        print(process)

    print("All downloads have been completed")