#!/usr/bin/env python
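"""
Download the files given as command-line arguments.

Each URL is grouped by scheme and dispatched to an S3 (boto3) or HTTP(S)
(requests) downloader, with every download running in its own process.

Example invocation (illustrative; assumes the script is saved as download.py):
    python download.py s3://my-bucket/path/to/object.bin https://example.com/archive.zip
"""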

import re
import sys
from multiprocessing import Process

import boto3
import botocore.exceptions
import requests

# Collect the URL arguments, dropping the first element (the script name).
urls = sys.argv[1:]

# Group the URLs by scheme so each one can be dispatched to the matching downloader.
filtered_urls = {}


def is_downloadable(url):
    """
    Check whether the URL points to a downloadable resource rather than an HTML page.
    """
    h = requests.head(url)
    content_type = h.headers.get('content-type', '')
    if 'text' in content_type.lower():
        return False
    if 'html' in content_type.lower():
        return False
    return True


def s3dl(path):
    try:
        # Split the S3 URI into bucket, key (file path) and local file name.
        uri = re.match(r's3://(.+?)/(.+)', path)
        if uri is None:
            print("Invalid S3 URI: " + path)
            return False
        bucket = uri.group(1)
        file_path = uri.group(2)
        if '/' in file_path:
            file_name = file_path.rsplit('/', 1)[1]
        else:
            file_name = file_path
        s3 = boto3.resource('s3')
        s3.Bucket(bucket).download_file(file_path, file_name)
        return True
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print("The object does not exist.")
        else:
            raise


def httpdl(path):
    try:
        # Only fetch the file if the URL looks like a downloadable resource.
        if is_downloadable(path):
            # Derive the local file name from the last path segment of the URL.
            filename = path.rsplit('/', 1)[-1]
            r = requests.get(path, stream=True)
            with open(filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
            return True
        else:
            print('File is not downloadable.')
            return False
    except Exception as e:
        print("Error downloading file: {}".format(e))
        return False


if __name__ == "__main__":
    # Iterate through the URLs and group them by scheme (s3, http, https).
    for url in urls:
        if '://' not in url:
            print("Skipping URL without a scheme: " + url)
            continue
        explode = url.split('://', 1)
        if explode[0] not in filtered_urls:
            filtered_urls[explode[0]] = []
        filtered_urls[explode[0]].append(explode[1])

    # Collect the processes that will run the downloads in parallel.
    processes = []

    # Iterate through the grouped URLs and create a download process for each.
    for download_type in filtered_urls:
        for path in filtered_urls[download_type]:
            download = download_type + '://' + path
            if download_type == "s3":
                process = Process(target=s3dl, args=(download,))
                processes.append(process)
            elif download_type in ("http", "https"):
                process = Process(target=httpdl, args=(download,))
                processes.append(process)

    # Start the processes.
    for process in processes:
        process.start()

    # Wait for every process to finish and report its exit status.
    for process in processes:
        process.join()
        print(process)

    print("All downloads have been completed")