Python: multithreaded image download from a website
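For example, assuming the script is saved as main.py, that the downloaded images should land in a local images/ folder, and that the gallery URL below is a placeholder:

py main.py images https://example.com/gallery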
#!/usr/bin/env python
# coding: utf-8

# Scrape every .jpg link from a page and download the images in parallel.
#
# usage:
#   py main.py relative_path_destination_folder url

import os
import sys
import time
from multiprocessing import cpu_count
from multiprocessing.pool import ThreadPool

import requests
from bs4 import BeautifulSoup

folder = sys.argv[1]
page_url = sys.argv[2]

response = requests.get(page_url)

if not os.path.isdir(folder):
    os.mkdir(folder)

pics = []

def image_criteria(href):
    # add additional filtering criteria here
    return True

soup = BeautifulSoup(response.text, "html.parser")
for a_tag in soup.find_all("a"):
    href = a_tag.get("href")
    # keep only anchors that point at .jpg files and pass the filter
    # (relative hrefs are not resolved; image links are assumed to be absolute URLs)
    if href and href.endswith(".jpg") and image_criteria(href):
        pics.append(href)

# multithreaded download
def download(urls):
    destinations = [folder + "/" + str(i) + ".jpg" for i in range(1, len(urls) + 1)]
    download_parallel(zip(urls, destinations))

def download_url(args):
    # download a single image and report how long it took
    t0 = time.time()
    url, fn = args
    try:
        r = requests.get(url)
        with open(fn, "wb") as f:
            f.write(r.content)
        return url, time.time() - t0
    except Exception as e:
        print("Exception in download_url():", e)
        return url, None

def download_parallel(args):
    # downloads are I/O-bound, so a thread pool (one worker per core, minus one) is enough
    cpus = cpu_count()
    results = ThreadPool(max(cpus - 1, 1)).imap_unordered(download_url, args)
    for url, elapsed in results:
        print("url:", url, "time (s):", elapsed)

if __name__ == "__main__":
    download(pics)
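The parallel step can also be expressed with the standard library's concurrent.futures module. The following is a minimal sketch of the same idea, not the script above: it assumes the same pics list and folder variable are already populated, and the worker count of 8 is an arbitrary choice.

# Sketch only: assumes `pics` and `folder` exist as in the script above.
from concurrent.futures import ThreadPoolExecutor

import requests

def fetch(url, fn):
    # download a single image to the path `fn`
    r = requests.get(url)
    with open(fn, "wb") as f:
        f.write(r.content)
    return url

with ThreadPoolExecutor(max_workers=8) as pool:
    futures = [
        pool.submit(fetch, url, folder + "/" + str(i) + ".jpg")
        for i, url in enumerate(pics, start=1)
    ]
    for fut in futures:
        print("downloaded:", fut.result())

Threads are sufficient here because the work is I/O-bound (waiting on HTTP responses), so the GIL is not a bottleneck.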