@thistehneisen
Last active February 3, 2024 11:12
Threaded Python script that incrementally downloads sequentially numbered files from an address
import os
import time
from urllib.parse import unquote
import requests
from requests.exceptions import RequestException
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry  # direct import; requests.packages is a deprecated alias
from concurrent.futures import ThreadPoolExecutor, as_completed

# Base URL and download directory
base_url = "https://www.seq.lv/{}/download"
download_dir = "downloads"

if not os.path.exists(download_dir):
    os.makedirs(download_dir)

def download_file(file_number):
    url = base_url.format(file_number)
    # Standard User-Agent string to mimic a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    # Retry strategy with exponential backoff
    retry_strategy = Retry(
        total=5,  # number of retries per request
        backoff_factor=1,  # backoff factor applied between attempts after the second try
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"],  # 'method_whitelist' was renamed in urllib3 1.26
        raise_on_status=False
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    # A fresh session per call keeps the function self-contained; a shared
    # session would reuse connections across downloads.
    session = requests.Session()
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    try:
        start_time = time.time()
        # Increase timeout (connect, read) to handle slow responses
        response = session.get(url, headers=headers, timeout=(10, 30))  # 10 s to connect, 30 s to read
        if response.status_code == 200:
            content_disposition = response.headers.get('content-disposition')
            if content_disposition:
                # Naive parse: assumes a simple filename="..." parameter
                filename = unquote(content_disposition.split('filename=')[1].strip('"'))
            else:
                filename = f"file_{file_number}"
            file_path = os.path.join(download_dir, filename)
            with open(file_path, 'wb') as file:
                file.write(response.content)
            end_time = time.time()
            download_time = end_time - start_time
            file_size = os.path.getsize(file_path)
            print(f"Downloaded {filename} ({file_size} bytes) in {download_time:.2f} seconds.")
            return True
        else:
            print(f"Error downloading file {file_number}: HTTP {response.status_code}")
            return False
    except RequestException as e:
        print(f"Error downloading file {file_number}: {e}")
        return False
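
# The split('filename=') parse above is fragile for quoted, multi-parameter, or
# RFC 2231/5987 encoded headers. A more robust sketch using only the stdlib
# email parser (a hypothetical helper, not wired into download_file above):
def parse_disposition_filename(header_value):
    """Return the filename parameter of a Content-Disposition header, or None."""
    from email.message import Message
    msg = Message()
    msg['content-disposition'] = header_value
    return msg.get_filename()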

num_threads = 2
max_consecutive_failures = 10
consecutive_failures = 0
start_number = 1
batch_size = 10000

with ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Submit the whole range up front; as_completed() yields futures as they
    # finish, which with two workers is roughly in submission order. Note that
    # as_completed() only yields the futures it was originally given, so extra
    # work must not be submitted inside the loop below.
    future_to_filenum = {
        executor.submit(download_file, file_num): file_num
        for file_num in range(start_number, start_number + batch_size)
    }
    for future in as_completed(future_to_filenum):
        if future.result():
            consecutive_failures = 0
        else:
            consecutive_failures += 1
            if consecutive_failures >= max_consecutive_failures:
                print(f"No more files to download. Stopping after {consecutive_failures} consecutive failures.")
                # Cancel queued futures so the executor can shut down promptly.
                for pending_future in future_to_filenum:
                    pending_future.cancel()
                break
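
# Note: download_file() buffers each response fully in memory via
# response.content. For large files, requests' streaming mode avoids that.
# A minimal sketch of a streamed variant (hypothetical; reuses base_url and
# download_dir from above, retry/session setup omitted for brevity):
def download_file_streamed(file_number):
    url = base_url.format(file_number)
    file_path = os.path.join(download_dir, f"file_{file_number}")
    with requests.get(url, stream=True, timeout=(10, 30)) as response:
        if response.status_code != 200:
            return False
        with open(file_path, 'wb') as f:
            # Write the body in 64 KiB chunks instead of holding it all in RAM.
            for chunk in response.iter_content(chunk_size=65536):
                f.write(chunk)
    return True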