Skip to content

Instantly share code, notes, and snippets.

@me-suzy
Created May 28, 2026 06:49
Show Gist options
  • Select an option

  • Save me-suzy/8aa832714f2e9e15bb3ed9e54c4635a6 to your computer and use it in GitHub Desktop.

Select an option

Save me-suzy/8aa832714f2e9e15bb3ed9e54c4635a6 to your computer and use it in GitHub Desktop.
biiifghfg.py
#!/usr/bin/env python3
"""
Generic PDF downloader for biblioteca-digitala.ro
Works with any publication page.
Usage:
python download_biblioteca_digitala.py "https://biblioteca-digitala.ro/?pub=7758-revista-romana-de-sociologie"
python download_biblioteca_digitala.py "https://biblioteca-digitala.ro/?pub=6464-studii-si-cercetari-de-chimie"
"""
import requests
from bs4 import BeautifulSoup
import os
import re
import shutil
import sys
import time
import unicodedata
from urllib.parse import urljoin, urlparse, parse_qs, unquote
# Browser-like request headers passed to every session.get() call in this script.
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'ro-RO,ro;q=0.9,en;q=0.8',
}
# Root download folder; one sub-folder per publication is created beneath it.
BASE_OUTPUT_DIR = r"G:\DOWNLOAD BIBLIOTECA"
# Generic (diacritic-free, lowercase) words that are ignored when the URL slug
# is compared with the page title, because they appear in many publication names.
TITLE_MATCH_STOPWORDS = {
'anul', 'cercetari', 'cercetare', 'colectia', 'de', 'din', 'jurnal',
'periodic', 'publicatie', 'revista', 'roman', 'romana', 'romane',
'romanesc', 'romanesti', 'si', 'studii'
}
def get_soup(url, session):
    """Download *url* through *session* and parse the body as HTML.

    Returns the parsed BeautifulSoup tree, or None (after printing the
    error) when the request or the parsing fails for any reason.
    """
    try:
        page = session.get(url, timeout=30, headers=HEADERS)
        page.raise_for_status()
        parsed = BeautifulSoup(page.content, 'html.parser')
    except Exception as e:
        print(f" [ERROR] Failed to fetch {url}: {e}")
        return None
    return parsed
def sanitize_filename(name):
    """Return *name* made safe for use as a file name.

    Characters that Windows forbids in file names are replaced with ``_``,
    whitespace runs are collapsed to a single ``_`` and leading/trailing
    dots and underscores are removed.  The result is at most 200 characters
    long; when a longer name has a short extension (such as ``.pdf``) the
    stem is shortened so that the extension survives the truncation.
    """
    max_length = 200
    name = re.sub(r'[<>:"/\\|?*]', '_', name)
    name = re.sub(r'\s+', '_', name)
    name = name.strip('._')
    if len(name) <= max_length:
        return name
    stem, ext = os.path.splitext(name)
    # Only treat a short suffix as a real extension; a "dot" in the middle of
    # a long title would otherwise eat most of the length budget.
    if not ext or len(ext) > 10:
        return name[:max_length]
    return stem[:max_length - len(ext)] + ext
def strip_diacritics(text):
    """Return *text* reduced to plain ASCII.

    The text is decomposed (NFKD) so accents become separate combining
    marks, then every non-ASCII code point is dropped.  A falsy input
    (None or "") yields an empty string.
    """
    decomposed = unicodedata.normalize('NFKD', text if text else '')
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')
def sanitize_folder_name(name):
    """Turn a publication title into a readable folder name.

    Diacritics are removed, forbidden path characters, underscores and
    dashes become spaces, whitespace is collapsed, and the result is
    title-cased and limited to 200 characters.  An empty result falls back
    to "Unknown Publication".
    """
    cleaned = strip_diacritics(name)
    substitutions = (
        (r'[<>:"/\\|?*]', ' '),
        (r'[_-]+', ' '),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.strip(' ._')
    return cleaned.title()[:200] if cleaned else "Unknown Publication"
def slugify_text(text):
    """Return a lowercase ASCII slug of *text* with words joined by single dashes."""
    lowered = strip_diacritics(text).lower()
    dashed = re.sub(r'[^a-z0-9]+', '-', lowered)
    return dashed.strip('-')
def get_publication_slug(pub_url):
    """Return the title part of the ``pub`` query parameter.

    For ``?pub=10323-some-title`` the result is ``some-title``.  When the
    value has no numeric prefix it is returned unchanged; when the
    parameter is absent the result is an empty string.
    """
    raw_query = urlparse(pub_url).query
    raw_value = parse_qs(raw_query).get('pub', [''])[0]
    numbered = re.match(r'\d+-(.+)', raw_value)
    return numbered.group(1) if numbered else raw_value
def get_significant_tokens(text):
    """List the slug words of *text* that are at least 4 characters long and not stopwords.

    Order and duplicates are preserved as they occur in the slug.
    """
    significant = []
    for word in slugify_text(text).split('-'):
        if len(word) < 4 or word in TITLE_MATCH_STOPWORDS:
            continue
        significant.append(word)
    return significant
def publication_title_matches_url(pub_title, pub_url):
    """Check that the fetched page title plausibly belongs to the requested URL.

    Returns True when the comparison cannot be made (no slug, no title, the
    placeholder title, or no significant words on either side).  Otherwise
    returns True only if one of the last two significant words of the URL
    slug also appears among the significant words of the title.
    """
    slug = get_publication_slug(pub_url)
    if not (slug and pub_title) or pub_title == "Unknown_Publication":
        return True
    slug_words = get_significant_tokens(slug)
    title_words = set(get_significant_tokens(pub_title))
    if not (slug_words and title_words):
        return True
    # Slicing with [-2:] already yields the whole list when it is shorter.
    for word in slug_words[-2:]:
        if word in title_words:
            return True
    return False
def get_collection_dir(base_output_dir, pub_title, pub_url=None):
    """Build the download folder path for one publication.

    The folder name comes from the page title; when the title is missing or
    is the placeholder, the URL slug is used instead, and finally
    "Unknown Publication".
    """
    title_usable = bool(pub_title) and pub_title != "Unknown_Publication"
    if title_usable:
        folder_source = pub_title
    else:
        folder_source = get_publication_slug(pub_url or '') or "Unknown Publication"
    folder_name = sanitize_folder_name(folder_source)
    return os.path.join(base_output_dir, folder_name)
def infer_collection_prefix(filename):
    """Guess the collection part of a PDF file name.

    The file stem is slugified; everything before the first issue marker
    (``no``/``nr``/``numar``/``numarul``/``tom``/``vol``/``volum`` followed by
    a digit) or before a year in the range 1500-2099 is returned.  Without
    such a marker the whole slug is returned, and None for an empty slug.
    """
    base_name = os.path.basename(filename or '')
    stem = os.path.splitext(base_name)[0]
    key = slugify_text(stem)
    if not key:
        return None
    issue_marker = re.compile(
        r'^(.+?)(?:-(?:no|nr|numar|numarul|tom|vol|volum)-?\d|-(?:1[5-9]\d{2}|20\d{2})(?:$|-))'
    )
    found = issue_marker.match(key)
    if found is None:
        return key
    return found.group(1).strip('-')
def build_existing_pdf_prefixes(pub_title, pub_url, pdf_links):
    """Collect the slug prefixes that identify files of this publication.

    Candidates are: the slug of the full title, the slug of each
    "/"-separated part of the title, the slug of the URL's publication
    slug, and the collection prefix inferred from every PDF link.  Only
    prefixes of at least 8 characters are kept; the result is sorted.
    """
    candidates = {slugify_text(pub_title)}
    candidates.update(
        slugify_text(part) for part in re.split(r'\s*/\s*', pub_title or '')
    )
    candidates.add(slugify_text(get_publication_slug(pub_url)))
    for link_url, fallback_name in pdf_links:
        link_filename = extract_filename_from_url(link_url) or fallback_name
        candidates.add(infer_collection_prefix(link_filename))
    # Empty slugs and None (no prefix inferred) are discarded here.
    return sorted(item for item in candidates if item and len(item) >= 8)
def make_unique_path(path):
    """Return *path* if it is free, else the first free ``<root>_<n><ext>`` variant.

    The counter ``n`` starts at 1 and increases until a path that does not
    exist on disk is found.
    """
    root, ext = os.path.splitext(path)
    candidate = path
    counter = 0
    while os.path.exists(candidate):
        counter += 1
        candidate = f"{root}_{counter}{ext}"
    return candidate
def get_existing_collection_dir_for_prefix(base_output_dir, prefix):
    """Pick the sub-folder of *base_output_dir* that best fits *prefix*.

    A folder is compatible when its slug equals the prefix, starts or ends
    with the prefix as a whole dash-separated part, or is itself a leading
    part of the prefix.  An exact slug match wins; otherwise the shortest
    folder name wins.  When nothing is compatible, the path of a new folder
    named after the prefix is returned (the folder is not created here).
    """
    def is_compatible(folder_key):
        return (
            folder_key == prefix
            or folder_key.startswith(prefix + '-')
            or folder_key.endswith('-' + prefix)
            or prefix.startswith(folder_key + '-')
        )

    def rank(path):
        folder_name = os.path.basename(path)
        return (slugify_text(folder_name) != prefix, len(folder_name))

    candidates = [
        entry.path
        for entry in os.scandir(base_output_dir)
        if entry.is_dir() and is_compatible(slugify_text(entry.name))
    ]
    if not candidates:
        return os.path.join(base_output_dir, sanitize_folder_name(prefix.replace('-', ' ')))
    # min() returns the first best-ranked entry, like a stable sort followed by [0].
    return min(candidates, key=rank)
def organize_existing_pdf_groups(base_output_dir):
    """Sort PDFs lying directly in *base_output_dir* into collection folders.

    Files are grouped by their inferred collection prefix (prefixes shorter
    than 8 characters are ignored), each group is moved into the matching
    or newly created collection folder, and the number of moved files is
    returned.  Returns 0 when the base folder does not exist.
    """
    if not os.path.isdir(base_output_dir):
        return 0
    grouped = {}
    for entry in os.scandir(base_output_dir):
        is_pdf = entry.is_file() and entry.name.lower().endswith('.pdf')
        if not is_pdf:
            continue
        prefix = infer_collection_prefix(entry.name)
        if not prefix or len(prefix) < 8:
            continue
        grouped.setdefault(prefix, []).append(entry.path)
    moved_count = 0
    for prefix in sorted(grouped):
        destination_dir = get_existing_collection_dir_for_prefix(base_output_dir, prefix)
        os.makedirs(destination_dir, exist_ok=True)
        for file_path in grouped[prefix]:
            filename = os.path.basename(file_path)
            destination = make_unique_path(os.path.join(destination_dir, filename))
            shutil.move(file_path, destination)
            print(f" [MOVE] Existing orphan PDF moved to {os.path.basename(destination_dir)}: {filename}")
            moved_count += 1
    return moved_count
def move_existing_collection_folders_to_collection(base_output_dir, collection_dir, pub_title, pub_url, pdf_links):
    """Merge PDFs from compatible sibling folders into *collection_dir*.

    A sub-folder of *base_output_dir* is compatible when its slug equals one
    of the publication prefixes, starts or ends with one as a whole
    dash-separated part, or is itself a leading part of one.  Every ``.pdf``
    directly inside such a folder is moved (renamed if the target exists) and
    the folder is removed when it ends up empty.

    Returns the number of files moved; 0 when the base folder is missing or
    no prefixes can be derived.
    """
    if not os.path.isdir(base_output_dir):
        return 0
    prefixes = build_existing_pdf_prefixes(pub_title, pub_url, pdf_links)
    if not prefixes:
        return 0
    collection_dir = os.path.abspath(collection_dir)

    # Snapshot the listing first: folders are created and removed below, and
    # the result of scanning a directory while it changes is unspecified.
    with os.scandir(base_output_dir) as scan:
        folders = [entry for entry in scan if entry.is_dir()]

    moved_count = 0
    for entry in folders:
        source_dir = os.path.abspath(entry.path)
        if os.path.normcase(source_dir) == os.path.normcase(collection_dir):
            continue
        folder_key = slugify_text(entry.name)
        if not any(
            folder_key == prefix
            or folder_key.startswith(prefix + '-')
            or folder_key.endswith('-' + prefix)
            or prefix.startswith(folder_key + '-')
            for prefix in prefixes
        ):
            continue
        # Same reason as above: list the PDFs before moving any of them out.
        with os.scandir(source_dir) as scan:
            pdf_entries = [
                file_entry for file_entry in scan
                if file_entry.is_file() and file_entry.name.lower().endswith('.pdf')
            ]
        os.makedirs(collection_dir, exist_ok=True)
        for file_entry in pdf_entries:
            destination = make_unique_path(os.path.join(collection_dir, file_entry.name))
            shutil.move(file_entry.path, destination)
            print(f" [MOVE] Existing PDF moved from {entry.name}: {file_entry.name}")
            moved_count += 1
        try:
            os.rmdir(source_dir)
        except OSError:
            # Best effort: the folder still holds other content, keep it.
            pass
    return moved_count
def move_existing_pdfs_to_collection(base_output_dir, collection_dir, pub_title, pub_url, pdf_links):
    """Move this publication's PDFs from the base folder into *collection_dir*.

    A ``.pdf`` lying directly in *base_output_dir* belongs to the publication
    when the slug of its stem equals one of the publication prefixes or
    starts with one followed by a dash.  Existing targets are never
    overwritten; a numbered name is used instead.

    Returns the number of files moved; 0 when the base folder is missing or
    no prefixes can be derived.
    """
    if not os.path.isdir(base_output_dir):
        return 0
    prefixes = build_existing_pdf_prefixes(pub_title, pub_url, pdf_links)
    if not prefixes:
        return 0
    os.makedirs(collection_dir, exist_ok=True)

    # Snapshot the listing before moving anything: the result of scanning a
    # directory while files are removed from it is unspecified.
    with os.scandir(base_output_dir) as scan:
        pdf_entries = [
            entry for entry in scan
            if entry.is_file() and entry.name.lower().endswith('.pdf')
        ]

    moved_count = 0
    for entry in pdf_entries:
        file_key = slugify_text(os.path.splitext(entry.name)[0])
        if not any(file_key == prefix or file_key.startswith(prefix + '-') for prefix in prefixes):
            continue
        destination = make_unique_path(os.path.join(collection_dir, entry.name))
        shutil.move(entry.path, destination)
        print(f" [MOVE] Existing PDF moved: {entry.name}")
        moved_count += 1
    return moved_count
def extract_filename_from_url(url):
    """Derive a file name from a download URL.

    The ``filename`` query parameter is preferred; otherwise the last path
    component is used unless it is the ``dl.asp`` wrapper itself.  Returns
    None when neither source yields a name.
    """
    parts = urlparse(url)
    named = parse_qs(parts.query).get('filename')
    if named:
        return unquote(named[0])
    # No query parameter: fall back to the final path segment.
    tail = os.path.basename(parts.path) if parts.path else ''
    if tail and tail.lower() != 'dl.asp':
        return unquote(tail)
    return None
def extract_filename_from_response(response):
    """Extract the file name from a response's Content-Disposition header.

    The extended ``filename*=charset'lang'value`` form (RFC 5987) is tried
    first, as RFC 6266 recommends when both forms are present, and is
    percent-decoded with the declared charset.  The plain ``filename=`` form
    is the fallback.  Names that are just the ``dl.asp`` wrapper are
    rejected.

    Returns the file name, or None when the header is missing or unusable.
    """
    cd = response.headers.get('Content-Disposition', '')
    if not cd:
        return None

    extended = re.search(r"filename\*\s*=\s*([\w.-]+)'[^']*'([^;\n]+)", cd, re.IGNORECASE)
    if extended:
        charset = extended.group(1)
        encoded = extended.group(2).strip().strip('"')
        try:
            filename = unquote(encoded, encoding=charset, errors='replace')
        except LookupError:
            # Unknown charset label: fall back to UTF-8 decoding.
            filename = unquote(encoded)
        if filename and not filename.lower().startswith('dl.asp'):
            return filename

    # "filename\s*=" cannot match "filename*=", so the extended parameter is
    # never mistaken for a plain one (which used to yield the charset name).
    plain = re.search(r'filename\s*=\s*([\'"]?)([^\'";\n]+)\1', cd, re.IGNORECASE)
    if plain:
        filename = unquote(plain.group(2)).strip()
        if filename and not filename.lower().startswith('dl.asp'):
            return filename
    return None
def get_publication_title(soup):
    """Return the text of the page's ``<h2 class="text-color-light">`` heading.

    Falls back to the placeholder "Unknown_Publication" when the heading is
    not found.
    """
    heading = soup.find('h2', class_='text-color-light')
    return heading.get_text(strip=True) if heading else "Unknown_Publication"
def get_pdf_links_from_page(soup, base_url):
    """Collect the PDF download links found on a page.

    Anchors are searched inside the ``datatable-default`` table, or in the
    whole page when that table is missing.  An anchor qualifies when its
    ``href`` mentions ``dl.asp`` or ``.pdf``.  If its ``onclick`` contains a
    ``track_pdf('...')`` call, the URL given there is used (it points at the
    PDF itself rather than the ``dl.asp`` wrapper); otherwise the ``href`` is
    used.  Both kinds of URL are resolved against *base_url*, and a URL that
    occurs several times on the page is reported only once.

    Returns a list of ``(absolute_url, suggested_filename)`` tuples in page
    order; the suggested file name may be empty or None.
    """
    table = soup.find('table', {'id': 'datatable-default'})
    if not table:
        print("[WARNING] Could not find data table, searching entire page...")
        search_area = soup
    else:
        search_area = table

    pdf_links = []
    seen_urls = set()
    for link in search_area.find_all('a', href=True):
        href = link['href']
        href_lower = href.lower()
        if 'dl.asp' not in href_lower and '.pdf' not in href_lower:
            continue
        tracked = re.search(r"track_pdf\(['\"]([^'\"]+)['\"]", link.get('onclick', ''))
        if tracked:
            # urljoin leaves absolute URLs untouched and fixes relative ones.
            pdf_url = urljoin(base_url, tracked.group(1))
            filename = os.path.basename(urlparse(pdf_url).path)
        else:
            pdf_url = urljoin(base_url, href)
            filename = extract_filename_from_url(pdf_url)
        if pdf_url in seen_urls:
            continue
        seen_urls.add(pdf_url)
        pdf_links.append((pdf_url, filename))
    return pdf_links
def get_volume_links(soup, base_url):
    """List the volume pages linked from the ``datatable-default`` table.

    Returns ``(absolute_url, link_text)`` tuples for every anchor whose
    ``href`` contains ``volum=``; an empty list when the table is missing.
    """
    table = soup.find('table', {'id': 'datatable-default'})
    if not table:
        return []
    return [
        (urljoin(base_url, anchor['href']), anchor.get_text(strip=True))
        for anchor in table.find_all('a', href=True)
        if 'volum=' in anchor['href']
    ]
def download_pdf(url, output_dir, suggested_filename, session):
    """Download one PDF into *output_dir*.

    The file name is taken, in order of preference, from the URL, the
    Content-Disposition header, *suggested_filename*, or a timestamp-based
    default; ``.pdf`` is appended when missing and the name is sanitized.
    An already existing file is left untouched.

    The body is streamed into a temporary ``.part`` file that is renamed
    only after the transfer finished, so an interrupted download never
    leaves a truncated ``.pdf`` that later runs would skip as complete.

    Returns ``(True, filename)`` when the file was saved or already existed,
    and ``(False, None)`` after printing the error on any failure.
    """
    temp_path = None
    try:
        # The context manager releases the connection on every path,
        # including the early "already exists" return.
        with session.get(url, headers=HEADERS, timeout=60, stream=True) as response:
            response.raise_for_status()
            # Determine filename: URL param > Content-Disposition > suggested
            filename = extract_filename_from_url(url)
            if not filename or filename.lower() == 'dl.asp':
                filename = extract_filename_from_response(response)
            if not filename:
                filename = suggested_filename
            if not filename:
                filename = f"download_{int(time.time())}.pdf"
            if not filename.lower().endswith('.pdf'):
                filename += '.pdf'
            filename = sanitize_filename(filename)
            output_path = os.path.join(output_dir, filename)
            if os.path.exists(output_path):
                print(f" [SKIP] Already exists: {filename}")
                return True, filename
            temp_path = output_path + '.part'
            with open(temp_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        os.replace(temp_path, output_path)
        return True, filename
    except Exception as e:
        print(f" [ERROR] Download failed: {e}")
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                # Nothing was written yet, or the partial file is already gone.
                pass
        return False, None
def _organize_and_download(pdf_links, base_output_dir, output_dir, pub_title, pub_url, session, show_names=False):
    """Tidy earlier downloads, then fetch every link; return how many succeeded.

    Existing PDFs of this publication are first gathered into *output_dir*
    and other stray PDFs in the base folder are grouped.  A link counts as
    succeeded when download_pdf reports success (saved or already present).
    With *show_names* the progress line includes the suggested file name.
    """
    moved_count = move_existing_pdfs_to_collection(
        base_output_dir, output_dir, pub_title, pub_url, pdf_links
    )
    print(f" Existing PDFs moved from base folder: {moved_count}")
    folder_moved_count = move_existing_collection_folders_to_collection(
        base_output_dir, output_dir, pub_title, pub_url, pdf_links
    )
    print(f" Existing PDFs moved from matching folders: {folder_moved_count}")
    orphan_moved_count = organize_existing_pdf_groups(base_output_dir)
    print(f" Other orphan PDFs grouped from base folder: {orphan_moved_count}")

    completed = 0
    for i, (pdf_url, suggested_name) in enumerate(pdf_links, 1):
        if show_names:
            print(f"\n [{i}/{len(pdf_links)}] Downloading: {suggested_name[:50]}...")
        else:
            print(f"\n [{i}/{len(pdf_links)}] Downloading...")
        success, filename = download_pdf(pdf_url, output_dir, suggested_name, session)
        if success and filename:
            print(f" Saved as: {filename}")
            completed += 1
        # Short pause between requests to avoid hammering the server.
        time.sleep(0.5)
    return completed


def main():
    """Command-line entry point.

    Reads the publication URL from the first command-line argument, or asks
    for it interactively.  The publication page is fetched and checked
    against the URL slug, a per-publication folder is created under
    BASE_OUTPUT_DIR, and all PDFs linked from the page (or, failing that,
    from its volume pages) are downloaded.  A summary is printed at the end.
    Exits with status 1 when no URL was supplied.
    """
    if len(sys.argv) >= 2:
        pub_url = sys.argv[1]
    else:
        print("=" * 70)
        print("Biblioteca Digitală - PDF Downloader")
        print("=" * 70)
        print("\nExemple de URL-uri:")
        print(" https://biblioteca-digitala.ro/?pub=7758-revista-romana-de-sociologie")
        print(" https://biblioteca-digitala.ro/?pub=6464-studii-si-cercetari-de-chimie")
        print()
        pub_url = input("Introdu URL-ul publicației: ").strip()
    if not pub_url:
        print("[ERROR] Nu ai introdus niciun URL!")
        sys.exit(1)

    print("=" * 70)
    print("Biblioteca Digitală - PDF Downloader")
    print("=" * 70)
    print(f"Publication URL: {pub_url}")
    session = requests.Session()

    # Step 1: Get publication page
    print(f"\n[1] Fetching publication page...")
    soup = get_soup(pub_url, session)
    if not soup:
        print("[FATAL] Could not load publication page")
        return
    pub_title = get_publication_title(soup)
    print(f" Publication: {pub_title}")
    if not publication_title_matches_url(pub_title, pub_url):
        expected_slug = get_publication_slug(pub_url)
        print("[FATAL] Pagina primita nu pare sa corespunda URL-ului introdus.")
        print(f" URL slug: {expected_slug}")
        print(f" Titlu pagina: {pub_title}")
        print(" Oprire ca sa nu descarce in folderul gresit.")
        return

    # Create per-publication output directory
    base_output_dir = BASE_OUTPUT_DIR
    output_dir = get_collection_dir(base_output_dir, pub_title, pub_url)
    try:
        os.makedirs(output_dir, exist_ok=True)
    except OSError as e:
        # E.g. the configured drive is not present on this machine.
        print(f"[FATAL] Could not create output directory {output_dir}: {e}")
        return
    print(f" Base output directory: {base_output_dir}")
    print(f" Collection directory: {output_dir}")

    # Step 2: Check if PDFs are directly on page or need to go to volumes
    pdf_links = get_pdf_links_from_page(soup, pub_url)
    if pdf_links:
        # PDFs are directly on the publication page
        print(f"\n[2] Found {len(pdf_links)} PDF links directly on page")
        total_pdfs = _organize_and_download(
            pdf_links, base_output_dir, output_dir, pub_title, pub_url, session
        )
    else:
        # Need to check volume pages for PDFs
        print(f"\n[2] No direct PDF links found, checking volume pages...")
        volume_links = get_volume_links(soup, pub_url)
        print(f" Found {len(volume_links)} volumes")
        pdf_links = []
        for i, (vol_url, vol_name) in enumerate(volume_links, 1):
            print(f"\n[{i}/{len(volume_links)}] Scanning: {vol_name}")
            vol_soup = get_soup(vol_url, session)
            if not vol_soup:
                continue
            vol_pdf_links = get_pdf_links_from_page(vol_soup, vol_url)
            for pdf_url, suggested_name in vol_pdf_links:
                if not suggested_name:
                    suggested_name = f"{sanitize_filename(vol_name)}.pdf"
                pdf_links.append((pdf_url, suggested_name))
            time.sleep(0.5)
        print(f"\n[3] Found {len(pdf_links)} PDF links in volume pages")
        total_pdfs = _organize_and_download(
            pdf_links, base_output_dir, output_dir, pub_title, pub_url, session,
            show_names=True,
        )

    # Summary
    if total_pdfs == 0:
        print("\n[INFO] Nu am gasit linkuri PDF descarcabile pentru aceasta publicatie.")
        print(" Daca site-ul afiseaza stelute negre in coloana de download,")
        print(" publicația are doar metadate/articole, fara fisiere PDF disponibile.")
        try:
            if os.path.isdir(output_dir) and not os.listdir(output_dir):
                os.rmdir(output_dir)
                print(f"\n[INFO] No PDFs found; empty collection directory removed: {output_dir}")
        except OSError:
            # Best effort: leaving an empty folder behind is harmless.
            pass
    print("\n" + "=" * 70)
    print("DOWNLOAD COMPLETE")
    print("=" * 70)
    print(f"Total PDFs downloaded: {total_pdfs}")
    print(f"Output directory: {os.path.abspath(output_dir)}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment