@thistehneisen
Last active February 3, 2024 11:12
Threaded Python script that incrementally downloads sequentially numbered files from an address
import os
import time
from urllib.parse import unquote
import requests
from requests.exceptions import RequestException
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry  # direct import; requests.packages is a deprecated alias
from concurrent.futures import ThreadPoolExecutor, as_completed

# Base URL and download directory
base_url = "https://www.seq.lv/{}/download"
download_dir = "downloads"

if not os.path.exists(download_dir):
    os.makedirs(download_dir)

def download_file(file_number):
    url = base_url.format(file_number)
    # Standard User-Agent string to mimic a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    # Retry strategy with exponential backoff
    retry_strategy = Retry(
        total=5,  # number of retries per request
        backoff_factor=1,  # backoff factor applied between attempts after the second try
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"],  # 'method_whitelist' was renamed in urllib3 1.26
        raise_on_status=False
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    # A fresh session per call keeps the function self-contained; a shared
    # session would reuse connections across downloads.
    session = requests.Session()
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    try:
        start_time = time.time()
        # Increase timeout (connect, read) to handle slow responses
        response = session.get(url, headers=headers, timeout=(10, 30))  # 10 s to connect, 30 s to read
        if response.status_code == 200:
            content_disposition = response.headers.get('content-disposition')
            if content_disposition:
                # Naive parse: assumes a simple filename="..." parameter
                filename = unquote(content_disposition.split('filename=')[1].strip('"'))
            else:
                filename = f"file_{file_number}"
            file_path = os.path.join(download_dir, filename)
            with open(file_path, 'wb') as file:
                file.write(response.content)
            end_time = time.time()
            download_time = end_time - start_time
            file_size = os.path.getsize(file_path)
            print(f"Downloaded {filename} ({file_size} bytes) in {download_time:.2f} seconds.")
            return True
        else:
            print(f"Error downloading file {file_number}: HTTP {response.status_code}")
            return False
    except RequestException as e:
        print(f"Error downloading file {file_number}: {e}")
        return False
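
# The split('filename=') parse above is fragile for quoted, multi-parameter, or
# RFC 2231/5987 encoded headers. A more robust sketch using only the stdlib
# email parser (a hypothetical helper, not wired into download_file above):
def parse_disposition_filename(header_value):
    """Return the filename parameter of a Content-Disposition header, or None."""
    from email.message import Message
    msg = Message()
    msg['content-disposition'] = header_value
    return msg.get_filename()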

num_threads = 2
max_consecutive_failures = 10
consecutive_failures = 0
start_number = 1
batch_size = 10000

with ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Submit the whole range up front; as_completed() yields futures as they
    # finish, which with two workers is roughly in submission order. Note that
    # as_completed() only yields the futures it was originally given, so extra
    # work must not be submitted inside the loop below.
    future_to_filenum = {
        executor.submit(download_file, file_num): file_num
        for file_num in range(start_number, start_number + batch_size)
    }
    for future in as_completed(future_to_filenum):
        if future.result():
            consecutive_failures = 0
        else:
            consecutive_failures += 1
            if consecutive_failures >= max_consecutive_failures:
                print(f"No more files to download. Stopping after {consecutive_failures} consecutive failures.")
                # Cancel queued futures so the executor can shut down promptly.
                for pending_future in future_to_filenum:
                    pending_future.cancel()
                break
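
# Note: download_file() buffers each response fully in memory via
# response.content. For large files, requests' streaming mode avoids that.
# A minimal sketch of a streamed variant (hypothetical; reuses base_url and
# download_dir from above, retry/session setup omitted for brevity):
def download_file_streamed(file_number):
    url = base_url.format(file_number)
    file_path = os.path.join(download_dir, f"file_{file_number}")
    with requests.get(url, stream=True, timeout=(10, 30)) as response:
        if response.status_code != 200:
            return False
        with open(file_path, 'wb') as f:
            # Write the body in 64 KiB chunks instead of holding it all in RAM.
            for chunk in response.iter_content(chunk_size=65536):
                f.write(chunk)
    return True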