Download all PDF files from abandonware-magazines.org for magazine issues that have no album
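The script needs the third-party packages requests and beautifulsoup4 (for example via pip install requests beautifulsoup4); everything else it imports comes from the Python standard library.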
import requests
from bs4 import BeautifulSoup
import os
import time
import zipfile
from urllib.parse import urljoin, urlparse
import urllib.request
import sys

BASE_URL = "https://www.abandonware-magazines.org/affiche_mag.php?mag={}&page={}"
DOWNLOAD_PAGE_BASE = "https://www.abandonware-magazines.org/"
DOWNLOAD_FOLDER = "downloads"

os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)


def get_html(url):
    response = requests.get(url)
    response.raise_for_status()
    return response.text


def find_download_links(page_html):
    soup = BeautifulSoup(page_html, "html.parser")
    results = []
    # Find all <a> tags with "download.php" in href
    for a_tag in soup.find_all("a", href=lambda href: href and "download.php" in href):
        tr = a_tag.find_parent("tr")
        if not tr:
            continue
        next_tr = tr.find_next_sibling("tr")
        if not next_tr:
            continue
        # Look for an image with "images/album_off.png" in src (i.e. the issue has no album)
        img = next_tr.find("img", src=lambda src: src and "images/album_off.png" in src)
        if img:
            results.append(urljoin(DOWNLOAD_PAGE_BASE, a_tag['href']))
    return results


def get_final_download_url(download_php_url):
    html = get_html(download_php_url)
    soup = BeautifulSoup(html, "html.parser")
    # Find the <a> pointing to download.abandonware.org
    for a_tag in soup.find_all("a", href=lambda href: href and "download.abandonware.org" in href):
        return a_tag['href'].strip()
    return None


def download_file(file_url, folder):
    print(f"Downloading '{file_url}'")
    local_filename = os.path.join(folder, os.path.basename(urlparse(file_url).path))
    urllib.request.urlretrieve(file_url, local_filename)
    # Extract the downloaded ZIP archive, then delete the archive itself
    with zipfile.ZipFile(local_filename, "r") as zip_ref:
        zip_ref.extractall(folder)
    print(f"Downloaded: {local_filename}")
    os.remove(local_filename)


def main():
    if len(sys.argv) != 4:
        print("Usage: python abandonware_scrapper.py <magazine_id> <pagefrom> <pageto>")
        sys.exit(1)
    magazine_id = int(sys.argv[1])
    pagefrom = int(sys.argv[2])
    pageto = int(sys.argv[3])
    # +1 so that <pageto> itself is processed, matching the usage message
    for page in range(pagefrom, pageto + 1):
print(f"Processing page {page}") | |
url = BASE_URL.format(magazine_id, page) | |
html = get_html(url) | |
download_php_links = find_download_links(html) | |
for link in download_php_links: | |
final_url = get_final_download_url(link) | |
if final_url: | |
try: | |
download_file(final_url, DOWNLOAD_FOLDER) | |
time.sleep(1) # Be nice to the server | |
except Exception as e: | |
print(f"Failed to download {final_url}: {e}") | |
else: | |
print("No abandonware.org download link found on page.") | |
if __name__ == "__main__": | |
main() |
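For example, assuming the magazine you want has id 1 (the id is the mag= value visible in the site's affiche_mag.php URLs; 1 here is only an illustration), the following fetches listing pages 1 through 10 and extracts every matching archive into the downloads/ folder next to the script:

python abandonware_scrapper.py 1 1 10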