Download all PDF files from abandonware-magazines.org for magazine issues that have no album
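The script needs the third-party packages requests and beautifulsoup4 (for example via pip install requests beautifulsoup4); everything else it imports comes from the Python standard library.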
import requests
from bs4 import BeautifulSoup
import os
import time
import zipfile
from urllib.parse import urljoin, urlparse
import urllib.request
import sys

BASE_URL = "https://www.abandonware-magazines.org/affiche_mag.php?mag={}&page={}"
DOWNLOAD_PAGE_BASE = "https://www.abandonware-magazines.org/"
DOWNLOAD_FOLDER = "downloads"

os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)


def get_html(url):
    response = requests.get(url)
    response.raise_for_status()
    return response.text


def find_download_links(page_html):
    soup = BeautifulSoup(page_html, "html.parser")
    results = []
    # Find all <a> tags with "download.php" in href
    for a_tag in soup.find_all("a", href=lambda href: href and "download.php" in href):
        tr = a_tag.find_parent("tr")
        if not tr:
            continue
        next_tr = tr.find_next_sibling("tr")
        if not next_tr:
            continue
        # Look for an image with "images/album_off.png" in src (i.e. the issue has no album)
        img = next_tr.find("img", src=lambda src: src and "images/album_off.png" in src)
        if img:
            results.append(urljoin(DOWNLOAD_PAGE_BASE, a_tag['href']))
    return results


def get_final_download_url(download_php_url):
    html = get_html(download_php_url)
    soup = BeautifulSoup(html, "html.parser")
    # Find the <a> pointing to download.abandonware.org
    for a_tag in soup.find_all("a", href=lambda href: href and "download.abandonware.org" in href):
        return a_tag['href'].strip()
    return None


def download_file(file_url, folder):
    print(f"Downloading '{file_url}'")
    local_filename = os.path.join(folder, os.path.basename(urlparse(file_url).path))
    urllib.request.urlretrieve(file_url, local_filename)
    # Extract the downloaded ZIP archive, then delete the archive itself
    with zipfile.ZipFile(local_filename, "r") as zip_ref:
        zip_ref.extractall(folder)
    print(f"Downloaded: {local_filename}")
    os.remove(local_filename)


def main():
    if len(sys.argv) != 4:
        print("Usage: python abandonware_scrapper.py <magazine_id> <pagefrom> <pageto>")
        sys.exit(1)
    magazine_id = int(sys.argv[1])
    pagefrom = int(sys.argv[2])
    pageto = int(sys.argv[3])
    # +1 so that <pageto> itself is processed, matching the usage message
    for page in range(pagefrom, pageto + 1):
print(f"Processing page {page}") | |
url = BASE_URL.format(magazine_id, page) | |
html = get_html(url) | |
download_php_links = find_download_links(html) | |
for link in download_php_links: | |
final_url = get_final_download_url(link) | |
if final_url: | |
try: | |
download_file(final_url, DOWNLOAD_FOLDER) | |
time.sleep(1) # Be nice to the server | |
except Exception as e: | |
print(f"Failed to download {final_url}: {e}") | |
else: | |
print("No abandonware.org download link found on page.") | |
if __name__ == "__main__": | |
main() |
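For example, assuming the magazine you want has id 1 (the id is the mag= value visible in the site's affiche_mag.php URLs; 1 here is only an illustration), the following fetches listing pages 1 through 10 and extracts every matching archive into the downloads/ folder next to the script:

python abandonware_scrapper.py 1 1 10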