@tigrouind
Last active August 4, 2025 08:23
Download all PDF files from abandonware-magazines.org for magazine issues that have no album
import requests
from bs4 import BeautifulSoup
import os
import time
import zipfile
from urllib.parse import urljoin, urlparse
import urllib.request
import sys

BASE_URL = "https://www.abandonware-magazines.org/affiche_mag.php?mag={}&page={}"
DOWNLOAD_PAGE_BASE = "https://www.abandonware-magazines.org/"
DOWNLOAD_FOLDER = "downloads"

os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)


def get_html(url):
    response = requests.get(url)
    response.raise_for_status()
    return response.text


def find_download_links(page_html):
    soup = BeautifulSoup(page_html, "html.parser")
    results = []
    # Find all <a> tags with "download.php" in their href
    for a_tag in soup.find_all("a", href=lambda href: href and "download.php" in href):
        tr = a_tag.find_parent("tr")
        if not tr:
            continue
        next_tr = tr.find_next_sibling("tr")
        if not next_tr:
            continue
        # Look for the "images/album_off.png" icon in the next row: it marks an issue with no album
        img = next_tr.find("img", src=lambda src: src and "images/album_off.png" in src)
        if img:
            results.append(urljoin(DOWNLOAD_PAGE_BASE, a_tag['href']))
    return results


def get_final_download_url(download_php_url):
    html = get_html(download_php_url)
    soup = BeautifulSoup(html, "html.parser")
    # Find <a> to download.abandonware.org
    for a_tag in soup.find_all("a", href=lambda href: href and "download.abandonware.org" in href):
        return a_tag['href'].strip()
    return None


def download_file(file_url, folder):
    print(f"Downloading '{file_url}'")
    local_filename = os.path.join(folder, os.path.basename(urlparse(file_url).path))
    urllib.request.urlretrieve(file_url, local_filename)
    # The server delivers a ZIP archive: extract its contents, then delete the archive
    with zipfile.ZipFile(local_filename, "r") as zip_ref:
        zip_ref.extractall(folder)
    print(f"Downloaded: {local_filename}")
    os.remove(local_filename)


def main():
    if len(sys.argv) != 4:
        print("Usage: python abandonware_scrapper.py <magazine_id> <pagefrom> <pageto>")
        sys.exit(1)
    magazine_id = int(sys.argv[1])
    pagefrom = int(sys.argv[2])
    pageto = int(sys.argv[3])
    for page in range(pagefrom, pageto):  # note: pageto is exclusive
        print(f"Processing page {page}")
        url = BASE_URL.format(magazine_id, page)
        html = get_html(url)
        download_php_links = find_download_links(html)
        for link in download_php_links:
            final_url = get_final_download_url(link)
            if final_url:
                try:
                    download_file(final_url, DOWNLOAD_FOLDER)
                    time.sleep(1)  # Be nice to the server
                except Exception as e:
                    print(f"Failed to download {final_url}: {e}")
            else:
                print("No abandonware.org download link found on page.")


if __name__ == "__main__":
    main()
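
A quick usage sketch, assuming the script is saved as abandonware_scrapper.py (the name its own usage string suggests) and that the third-party dependencies requests and beautifulsoup4 are installed; the magazine id 42 and the page range below are placeholder values:

    pip install requests beautifulsoup4
    python abandonware_scrapper.py 42 1 5

This would walk pages 1 through 4 of magazine 42 (pageto is exclusive), follow each download.php link whose row shows the album_off icon, fetch the ZIP behind it, and extract the contents into the downloads folder.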