Python: multithreaded image download from a website
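For example, assuming the script is saved as main.py, that the downloaded images should land in a local images/ folder, and that the gallery URL below is a placeholder:

py main.py images https://example.com/gallery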
#!/usr/bin/env python
# coding: utf-8

# Scrape every .jpg link from a page and download the images in parallel.
#
# usage:
#   py main.py relative_path_destination_folder url

import os
import sys
import time
from multiprocessing import cpu_count
from multiprocessing.pool import ThreadPool

import requests
from bs4 import BeautifulSoup

folder = sys.argv[1]
page_url = sys.argv[2]

response = requests.get(page_url)

if not os.path.isdir(folder):
    os.mkdir(folder)

pics = []

def image_criteria(href):
    # add additional filtering criteria here
    return True

soup = BeautifulSoup(response.text, "html.parser")
for a_tag in soup.find_all("a"):
    href = a_tag.get("href")
    # keep only anchors that point at .jpg files and pass the filter
    # (relative hrefs are not resolved; image links are assumed to be absolute URLs)
    if href and href.endswith(".jpg") and image_criteria(href):
        pics.append(href)

# multithreaded download
def download(urls):
    destinations = [folder + "/" + str(i) + ".jpg" for i in range(1, len(urls) + 1)]
    download_parallel(zip(urls, destinations))

def download_url(args):
    # download a single image and report how long it took
    t0 = time.time()
    url, fn = args
    try:
        r = requests.get(url)
        with open(fn, "wb") as f:
            f.write(r.content)
        return url, time.time() - t0
    except Exception as e:
        print("Exception in download_url():", e)
        return url, None

def download_parallel(args):
    # downloads are I/O-bound, so a thread pool (one worker per core, minus one) is enough
    cpus = cpu_count()
    results = ThreadPool(max(cpus - 1, 1)).imap_unordered(download_url, args)
    for url, elapsed in results:
        print("url:", url, "time (s):", elapsed)

if __name__ == "__main__":
    download(pics)
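The parallel step can also be expressed with the standard library's concurrent.futures module. The following is a minimal sketch of the same idea, not the script above: it assumes the same pics list and folder variable are already populated, and the worker count of 8 is an arbitrary choice.

# Sketch only: assumes `pics` and `folder` exist as in the script above.
from concurrent.futures import ThreadPoolExecutor

import requests

def fetch(url, fn):
    # download a single image to the path `fn`
    r = requests.get(url)
    with open(fn, "wb") as f:
        f.write(r.content)
    return url

with ThreadPoolExecutor(max_workers=8) as pool:
    futures = [
        pool.submit(fetch, url, folder + "/" + str(i) + ".jpg")
        for i, url in enumerate(pics, start=1)
    ]
    for fut in futures:
        print("downloaded:", fut.result())

Threads are sufficient here because the work is I/O-bound (waiting on HTTP responses), so the GIL is not a bottleneck.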