Created
May 28, 2026 06:49
-
-
Save me-suzy/8aa832714f2e9e15bb3ed9e54c4635a6 to your computer and use it in GitHub Desktop.
biiifghfg.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Generic PDF downloader for biblioteca-digitala.ro | |
| Works with any publication page. | |
| Usage: | |
| python download_biblioteca_digitala.py "https://biblioteca-digitala.ro/?pub=7758-revista-romana-de-sociologie" | |
| python download_biblioteca_digitala.py "https://biblioteca-digitala.ro/?pub=6464-studii-si-cercetari-de-chimie" | |
| """ | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import os | |
| import re | |
| import shutil | |
| import sys | |
| import time | |
| import unicodedata | |
| from urllib.parse import urljoin, urlparse, parse_qs, unquote | |
# Browser-like request headers passed to every session.get() call in this script.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'ro-RO,ro;q=0.9,en;q=0.8',
}
# Root folder; main() creates one sub-folder per publication underneath it.
BASE_OUTPUT_DIR = r"G:\DOWNLOAD BIBLIOTECA"
# Generic words that get_significant_tokens() ignores when comparing a page
# title with the URL slug (they appear in many publication names).
TITLE_MATCH_STOPWORDS = {
    'anul', 'cercetari', 'cercetare', 'colectia', 'de', 'din', 'jurnal',
    'periodic', 'publicatie', 'revista', 'roman', 'romana', 'romane',
    'romanesc', 'romanesti', 'si', 'studii'
}
def get_soup(url, session):
    """Download *url* through *session* and parse the body as HTML.

    Returns the parsed BeautifulSoup tree. Any exception raised while
    fetching or parsing (including a non-success HTTP status) is reported
    on stdout and converted into a ``None`` return value.
    """
    try:
        page = session.get(url, timeout=30, headers=HEADERS)
        page.raise_for_status()
        html_tree = BeautifulSoup(page.content, 'html.parser')
    except Exception as exc:
        print(f" [ERROR] Failed to fetch {url}: {exc}")
        return None
    return html_tree
def sanitize_filename(name):
    """Return *name* cleaned up for use as a file name.

    Characters that are invalid on Windows (``<>:"/\\|?*``) and every run of
    whitespace are replaced by ``_``; leading/trailing dots and underscores
    are removed. The result is at most 200 characters long.

    When the name has to be shortened, the cut is made in the stem so that a
    short extension such as ``.pdf`` survives, and the shortened name never
    ends in ``.`` or ``_``.
    """
    max_length = 200
    name = re.sub(r'[<>:"/\\|?*]', '_', name)
    name = re.sub(r'\s+', '_', name)
    name = name.strip('._')
    if len(name) <= max_length:
        return name

    root, ext = os.path.splitext(name)
    # Only treat the suffix as a real extension when it is short; otherwise a
    # dot somewhere in a long title would be mistaken for one.
    if 0 < len(ext) <= 16:
        return root[:max_length - len(ext)].rstrip('._') + ext
    return name[:max_length].rstrip('._')
def strip_diacritics(text):
    """Return *text* reduced to plain ASCII.

    The text is NFKD-normalised, then every character that cannot be encoded
    as ASCII is dropped. A falsy *text* (``None`` or ``''``) yields ``''``.
    """
    decomposed = unicodedata.normalize('NFKD', text if text else '')
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')
def sanitize_folder_name(name):
    """Turn a publication title into a readable folder name.

    Diacritics are stripped; Windows-invalid characters, runs of ``_``/``-``
    and runs of whitespace each collapse to a single space; surrounding
    spaces, dots and underscores are trimmed. The result is title-cased and
    cut to 200 characters. An empty result becomes ``"Unknown Publication"``.
    """
    cleaned = strip_diacritics(name)
    # Applied in this order: invalid characters, separators, then whitespace.
    for pattern in (r'[<>:"/\\|?*]', r'[_-]+', r'\s+'):
        cleaned = re.sub(pattern, ' ', cleaned)
    cleaned = cleaned.strip(' ._')
    return cleaned.title()[:200] if cleaned else "Unknown Publication"
def slugify_text(text):
    """Return a lowercase ASCII slug of *text*.

    Every run of characters other than ``a-z`` and ``0-9`` becomes a single
    ``-``; leading and trailing dashes are removed.
    """
    lowered = strip_diacritics(text).lower()
    return re.sub(r'[^a-z0-9]+', '-', lowered).strip('-')
def get_publication_slug(pub_url):
    """Return the textual slug from a ``?pub=<digits>-<slug>`` URL.

    If the ``pub`` value does not start with ``<digits>-`` followed by at
    least one character, the raw value is returned; a missing ``pub``
    parameter gives ``''``.
    """
    pub_values = parse_qs(urlparse(pub_url).query).get('pub', [''])
    raw_value = pub_values[0]
    numbered = re.match(r'\d+-(.+)', raw_value)
    return numbered.group(1) if numbered else raw_value
def get_significant_tokens(text):
    """Return the slug tokens of *text* that are useful for title/URL matching.

    A token is kept when it has at least four characters and is not listed in
    ``TITLE_MATCH_STOPWORDS``. Order and duplicates are preserved.
    """
    tokens = []
    for word in slugify_text(text).split('-'):
        if len(word) < 4 or word in TITLE_MATCH_STOPWORDS:
            continue
        tokens.append(word)
    return tokens
def publication_title_matches_url(pub_title, pub_url):
    """Check that the fetched page title plausibly belongs to *pub_url*.

    Returns ``True`` when no comparison is possible (no slug, no title, the
    placeholder title, or no significant tokens on either side). Otherwise
    returns whether at least one of the last two significant slug tokens
    also occurs among the significant title tokens.
    """
    slug = get_publication_slug(pub_url)
    if not (slug and pub_title) or pub_title == "Unknown_Publication":
        return True

    slug_tokens = get_significant_tokens(slug)
    title_token_set = set(get_significant_tokens(pub_title))
    if not (slug_tokens and title_token_set):
        return True

    # Slicing with [-2:] yields the whole list when it has a single token.
    for token in slug_tokens[-2:]:
        if token in title_token_set:
            return True
    return False
def get_collection_dir(base_output_dir, pub_title, pub_url=None):
    """Build the download folder path for one publication.

    The folder name comes from *pub_title*. When the title is empty or the
    ``"Unknown_Publication"`` placeholder, the URL slug is used instead, and
    ``"Unknown Publication"`` if that is empty too.
    """
    if pub_title and pub_title != "Unknown_Publication":
        folder_source = pub_title
    else:
        folder_source = get_publication_slug(pub_url or '') or "Unknown Publication"
    return os.path.join(base_output_dir, sanitize_folder_name(folder_source))
def infer_collection_prefix(filename):
    """Derive a collection prefix from a PDF file name.

    The file stem is slugified. If the slug contains an issue/volume marker
    (``no``, ``nr``, ``numar``, ``numarul``, ``tom``, ``vol``, ``volum``
    followed by a digit) or a year between 1500 and 2099, the part before the
    first such marker is returned; otherwise the whole slug is returned.
    Returns ``None`` when the slug is empty.
    """
    base_name = os.path.basename(filename or '')
    key = slugify_text(os.path.splitext(base_name)[0])
    if not key:
        return None

    issue_marker = re.match(
        r'^(.+?)(?:-(?:no|nr|numar|numarul|tom|vol|volum)-?\d|-(?:1[5-9]\d{2}|20\d{2})(?:$|-))',
        key
    )
    if issue_marker is None:
        return key
    return issue_marker.group(1).strip('-')
def build_existing_pdf_prefixes(pub_title, pub_url, pdf_links):
    """Collect the slug prefixes that identify PDFs of this publication.

    Candidates are the slug of the full title, the slug of each
    ``/``-separated part of the title, the slug of the URL's publication
    slug, and the prefix inferred from every PDF link's file name (taken
    from the URL, else from the suggested name).

    Returns the sorted, de-duplicated candidates with at least 8 characters.
    """
    candidates = {slugify_text(pub_title)}
    candidates.update(
        slugify_text(part) for part in re.split(r'\s*/\s*', pub_title or '')
    )
    candidates.add(slugify_text(get_publication_slug(pub_url)))
    for pdf_url, suggested_name in pdf_links:
        link_name = extract_filename_from_url(pdf_url) or suggested_name
        candidates.add(infer_collection_prefix(link_name))
    # Empty strings and None (no usable prefix) are dropped here.
    return sorted(item for item in candidates if item and len(item) >= 8)
def make_unique_path(path):
    """Return *path*, or a numbered variant of it that does not exist yet.

    If *path* is free it is returned unchanged. Otherwise ``_1``, ``_2``, ...
    is inserted before the extension until an unused name is found.
    """
    if not os.path.exists(path):
        return path

    stem, suffix = os.path.splitext(path)
    index = 1
    candidate = f"{stem}_{index}{suffix}"
    while os.path.exists(candidate):
        index += 1
        candidate = f"{stem}_{index}{suffix}"
    return candidate
def get_existing_collection_dir_for_prefix(base_output_dir, prefix):
    """Pick the collection folder under *base_output_dir* for *prefix*.

    A sub-folder is compatible when its slug equals the prefix, starts with
    ``prefix-``, ends with ``-prefix``, or when the prefix starts with
    ``<folder slug>-``. Among compatible folders an exact slug match wins,
    then the shortest folder name. With no compatible folder, the path of a
    new folder named after the prefix is returned (it is not created here).
    """
    def is_compatible(folder_key):
        return (
            folder_key == prefix
            or folder_key.startswith(prefix + '-')
            or folder_key.endswith('-' + prefix)
            or prefix.startswith(folder_key + '-')
        )

    def rank(path):
        folder_name = os.path.basename(path)
        return (slugify_text(folder_name) != prefix, len(folder_name))

    candidates = [
        entry.path
        for entry in os.scandir(base_output_dir)
        if entry.is_dir() and is_compatible(slugify_text(entry.name))
    ]
    if not candidates:
        return os.path.join(base_output_dir, sanitize_folder_name(prefix.replace('-', ' ')))
    # min() keeps the first of equally ranked candidates, like a stable sort.
    return min(candidates, key=rank)
def organize_existing_pdf_groups(base_output_dir):
    """Sort PDFs lying directly in *base_output_dir* into collection folders.

    Files are grouped by their inferred collection prefix (prefixes shorter
    than 8 characters are ignored). Each group is moved into the folder
    chosen by ``get_existing_collection_dir_for_prefix``, which is created if
    needed. Existing files are never overwritten.

    Returns the number of files moved; ``0`` when the base folder is missing.
    """
    if not os.path.isdir(base_output_dir):
        return 0

    grouped = {}
    for entry in os.scandir(base_output_dir):
        if entry.is_file() and entry.name.lower().endswith('.pdf'):
            prefix = infer_collection_prefix(entry.name)
            if prefix and len(prefix) >= 8:
                grouped.setdefault(prefix, []).append(entry.path)

    total_moved = 0
    for prefix in sorted(grouped):
        target_dir = get_existing_collection_dir_for_prefix(base_output_dir, prefix)
        os.makedirs(target_dir, exist_ok=True)
        for source_path in grouped[prefix]:
            filename = os.path.basename(source_path)
            target_path = make_unique_path(os.path.join(target_dir, filename))
            shutil.move(source_path, target_path)
            print(f" [MOVE] Existing orphan PDF moved to {os.path.basename(target_dir)}: {filename}")
            total_moved += 1
    return total_moved
def move_existing_collection_folders_to_collection(base_output_dir, collection_dir, pub_title, pub_url, pdf_links):
    """Merge PDFs from compatible sibling folders into *collection_dir*.

    A sub-folder of *base_output_dir* is compatible when its slug matches one
    of the prefixes from ``build_existing_pdf_prefixes`` (equal, starts with
    ``prefix-``, ends with ``-prefix``, or the prefix starts with
    ``<folder slug>-``). The collection folder itself is skipped. PDFs are
    moved without overwriting; a source folder left empty is removed.

    Directory listings are snapshotted before anything is moved, created or
    removed: the contents of a live ``os.scandir`` iterator are unspecified
    once the directory changes underneath it.

    Returns the number of files moved; ``0`` when the base folder is missing
    or no prefix is available.
    """
    if not os.path.isdir(base_output_dir):
        return 0
    prefixes = build_existing_pdf_prefixes(pub_title, pub_url, pdf_links)
    if not prefixes:
        return 0

    collection_dir = os.path.abspath(collection_dir)
    collection_key = os.path.normcase(collection_dir)

    with os.scandir(base_output_dir) as scan:
        folders = [entry for entry in scan if entry.is_dir()]

    moved_count = 0
    for entry in folders:
        source_dir = os.path.abspath(entry.path)
        if os.path.normcase(source_dir) == collection_key:
            continue
        folder_key = slugify_text(entry.name)
        if not any(
            folder_key == prefix
            or folder_key.startswith(prefix + '-')
            or folder_key.endswith('-' + prefix)
            or prefix.startswith(folder_key + '-')
            for prefix in prefixes
        ):
            continue

        os.makedirs(collection_dir, exist_ok=True)
        with os.scandir(source_dir) as scan:
            pdf_entries = [
                item for item in scan
                if item.is_file() and item.name.lower().endswith('.pdf')
            ]
        for file_entry in pdf_entries:
            destination = make_unique_path(os.path.join(collection_dir, file_entry.name))
            shutil.move(file_entry.path, destination)
            print(f" [MOVE] Existing PDF moved from {entry.name}: {file_entry.name}")
            moved_count += 1

        try:
            os.rmdir(source_dir)
        except OSError:
            # Best effort: the folder still holds other content, so keep it.
            pass
    return moved_count
def move_existing_pdfs_to_collection(base_output_dir, collection_dir, pub_title, pub_url, pdf_links):
    """Move this publication's PDFs from the base folder into *collection_dir*.

    A PDF lying directly in *base_output_dir* matches when the slug of its
    stem equals one of the prefixes from ``build_existing_pdf_prefixes`` or
    starts with ``prefix-``. Existing files are never overwritten.

    The directory listing is snapshotted before any file is moved: the
    contents of a live ``os.scandir`` iterator are unspecified once entries
    are removed from the directory.

    Returns the number of files moved; ``0`` when the base folder is missing
    or no prefix is available.
    """
    if not os.path.isdir(base_output_dir):
        return 0
    prefixes = build_existing_pdf_prefixes(pub_title, pub_url, pdf_links)
    if not prefixes:
        return 0

    os.makedirs(collection_dir, exist_ok=True)
    with os.scandir(base_output_dir) as scan:
        pdf_entries = [
            entry for entry in scan
            if entry.is_file() and entry.name.lower().endswith('.pdf')
        ]

    moved_count = 0
    for entry in pdf_entries:
        file_key = slugify_text(os.path.splitext(entry.name)[0])
        if not any(file_key == prefix or file_key.startswith(prefix + '-') for prefix in prefixes):
            continue
        destination = make_unique_path(os.path.join(collection_dir, entry.name))
        shutil.move(entry.path, destination)
        print(f" [MOVE] Existing PDF moved: {entry.name}")
        moved_count += 1
    return moved_count
def extract_filename_from_url(url):
    """Return the file name encoded in *url*, or ``None``.

    The ``filename`` query parameter (``dl.asp?filename=...``) wins. Failing
    that, the last path component is used unless it is empty or ``dl.asp``.
    The returned name is percent-decoded.
    """
    parts = urlparse(url)

    from_query = parse_qs(parts.query).get('filename')
    if from_query is not None:
        return unquote(from_query[0])

    # No query parameter: fall back to the last component of the path.
    candidate = os.path.basename(parts.path) if parts.path else ''
    if candidate and candidate.lower() != 'dl.asp':
        return unquote(candidate)
    return None
def extract_filename_from_response(response):
    """Return the file name announced in the Content-Disposition header.

    *response* only needs a ``headers`` mapping with a ``get`` method.

    The extended ``filename*=charset'lang'value`` parameter (RFC 5987) is
    preferred over the plain ``filename=`` parameter, as RFC 6266 recommends.
    The plain pattern deliberately does not match ``filename*=``; otherwise
    the charset token (for example ``UTF-8``) would be returned as the name.

    Names starting with ``dl.asp`` (the download wrapper script) are rejected.
    Returns ``None`` when no usable name is present.
    """
    cd = response.headers.get('Content-Disposition', '')
    if not cd:
        return None

    def usable(candidate):
        # The server sometimes reports its own wrapper script as the name.
        return bool(candidate) and not candidate.lower().startswith('dl.asp')

    # Extended form first: filename*=UTF-8''percent-encoded-name
    extended = re.search(r"filename\*\s*=\s*([\w.-]*)'[^']*'([^;\n]+)", cd, re.IGNORECASE)
    if extended:
        charset = extended.group(1) or 'utf-8'
        raw_value = extended.group(2).strip().strip('"')
        try:
            filename = unquote(raw_value, encoding=charset, errors='replace')
        except LookupError:
            # Unknown charset label: fall back to the UTF-8 default.
            filename = unquote(raw_value)
        if usable(filename):
            return filename

    # Plain form: filename="name" or filename=name
    plain = re.search(r'\bfilename\s*=\s*([\'"]?)([^\'";\n]+)\1', cd, re.IGNORECASE)
    if plain:
        filename = unquote(plain.group(2).strip())
        if usable(filename):
            return filename
    return None
def get_publication_title(soup):
    """Return the text of the page's ``<h2 class="text-color-light">`` heading.

    Falls back to the ``"Unknown_Publication"`` placeholder when the page has
    no such heading.
    """
    heading = soup.find('h2', class_='text-color-light')
    return heading.get_text(strip=True) if heading else "Unknown_Publication"
def get_pdf_links_from_page(soup, base_url):
    """Collect the PDF download links of a page.

    Links are searched in the ``datatable-default`` table, or in the whole
    page (with a warning) when that table is missing. A link counts as a PDF
    link when its ``href`` contains ``dl.asp`` or ``.pdf``.

    For each link, the URL inside an ``onclick="track_pdf('...')"`` handler
    is preferred over the ``href``. Both kinds of URL are resolved against
    *base_url*, so relative URLs work as well.

    Returns a list of ``(absolute_url, suggested_filename)`` tuples in page
    order; the suggested name may be empty or ``None``.
    """
    table = soup.find('table', {'id': 'datatable-default'})
    if table:
        search_area = table
    else:
        print("[WARNING] Could not find data table, searching entire page...")
        search_area = soup

    pdf_links = []
    for link in search_area.find_all('a', href=True):
        href = link['href']
        href_lower = href.lower()
        if 'dl.asp' not in href_lower and '.pdf' not in href_lower:
            continue

        # Priority: the onclick tracker carries the URL of the PDF itself.
        onclick = link.get('onclick', '') or ''
        if 'track_pdf' in onclick:
            match = re.search(r"track_pdf\(['\"]([^'\"]+)['\"]", onclick)
            if match:
                # urljoin leaves absolute URLs untouched and fixes relative ones.
                direct_pdf_url = urljoin(base_url, match.group(1))
                # Percent-decode, as extract_filename_from_url does.
                filename = unquote(os.path.basename(urlparse(direct_pdf_url).path))
                pdf_links.append((direct_pdf_url, filename))
                continue

        # Fallback: the href itself (usually the dl.asp wrapper URL).
        full_url = urljoin(base_url, href)
        pdf_links.append((full_url, extract_filename_from_url(full_url)))
    return pdf_links
def get_volume_links(soup, base_url):
    """Return the volume pages linked from the ``datatable-default`` table.

    Each result is ``(absolute_url, link_text)`` for a link whose ``href``
    contains ``volum=``. Returns an empty list when the table is missing.
    """
    table = soup.find('table', {'id': 'datatable-default'})
    if not table:
        return []
    return [
        (urljoin(base_url, anchor['href']), anchor.get_text(strip=True))
        for anchor in table.find_all('a', href=True)
        if 'volum=' in anchor['href']
    ]
def download_pdf(url, output_dir, suggested_filename, session):
    """Download one PDF into *output_dir*.

    The file name is taken, in order, from the URL, the Content-Disposition
    header, *suggested_filename*, or a timestamped default; ``.pdf`` is
    appended when missing and the name is sanitised.

    The body is streamed into a ``<name>.part`` file that is renamed to the
    final name only once the transfer has finished. A failed or interrupted
    download therefore never leaves a truncated ``.pdf`` behind that a later
    run would skip as already downloaded. The streamed response is always
    closed so the connection goes back to the session's pool.

    Returns ``(True, filename)`` when the file was saved or already existed,
    ``(False, None)`` on any error (which is printed, not raised).
    """
    temp_path = None
    try:
        response = session.get(url, headers=HEADERS, timeout=60, stream=True)
        try:
            response.raise_for_status()

            # Determine filename: URL param > Content-Disposition > suggested
            filename = extract_filename_from_url(url)
            if not filename or filename.lower() == 'dl.asp':
                filename = extract_filename_from_response(response)
            if not filename:
                filename = suggested_filename
            if not filename:
                filename = f"download_{int(time.time())}.pdf"
            if not filename.lower().endswith('.pdf'):
                filename += '.pdf'
            filename = sanitize_filename(filename)
            output_path = os.path.join(output_dir, filename)

            if os.path.exists(output_path):
                print(f" [SKIP] Already exists: {filename}")
                return True, filename

            temp_path = output_path + '.part'
            with open(temp_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
            os.replace(temp_path, output_path)
            temp_path = None
            return True, filename
        finally:
            response.close()
    except Exception as e:
        # Best effort per file: report and let the caller continue with the next one.
        print(f" [ERROR] Download failed: {e}")
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass
        return False, None
def _collect_volume_pdf_links(volume_links, session):
    """Scan every volume page and return its PDF links as (url, name) tuples."""
    pdf_links = []
    for i, (vol_url, vol_name) in enumerate(volume_links, 1):
        print(f"\n[{i}/{len(volume_links)}] Scanning: {vol_name}")
        vol_soup = get_soup(vol_url, session)
        if not vol_soup:
            continue
        for pdf_url, suggested_name in get_pdf_links_from_page(vol_soup, vol_url):
            if not suggested_name:
                # No name in the link: fall back to the volume's title.
                suggested_name = f"{sanitize_filename(vol_name)}.pdf"
            pdf_links.append((pdf_url, suggested_name))
        time.sleep(0.5)  # pause between volume pages
    return pdf_links


def _reorganize_and_download(pdf_links, base_output_dir, output_dir, pub_title, pub_url, session, show_names=False):
    """Tidy previously downloaded PDFs, then download *pdf_links*; return the success count."""
    moved_count = move_existing_pdfs_to_collection(
        base_output_dir, output_dir, pub_title, pub_url, pdf_links
    )
    print(f" Existing PDFs moved from base folder: {moved_count}")
    folder_moved_count = move_existing_collection_folders_to_collection(
        base_output_dir, output_dir, pub_title, pub_url, pdf_links
    )
    print(f" Existing PDFs moved from matching folders: {folder_moved_count}")
    orphan_moved_count = organize_existing_pdf_groups(base_output_dir)
    print(f" Other orphan PDFs grouped from base folder: {orphan_moved_count}")

    downloaded = 0
    for i, (pdf_url, suggested_name) in enumerate(pdf_links, 1):
        if show_names:
            print(f"\n [{i}/{len(pdf_links)}] Downloading: {suggested_name[:50]}...")
        else:
            print(f"\n [{i}/{len(pdf_links)}] Downloading...")
        success, filename = download_pdf(pdf_url, output_dir, suggested_name, session)
        if success and filename:
            print(f" Saved as: {filename}")
            downloaded += 1
        time.sleep(0.5)  # pause between downloads
    return downloaded


def main():
    """Command-line entry point: download every PDF of one publication.

    The publication URL is read from ``sys.argv[1]`` or, when absent, asked
    for interactively. The page is fetched and its title checked against the
    URL slug. PDF links are taken from the publication page itself or, if it
    has none, from its volume pages. Previously downloaded PDFs are
    reorganised, then every link is downloaded into the publication's folder
    under ``BASE_OUTPUT_DIR``.

    Exits with status 1 when no URL is given; returns early (after printing a
    message) when the page cannot be loaded or does not match the URL.
    """
    if len(sys.argv) >= 2:
        pub_url = sys.argv[1]
    else:
        print("=" * 70)
        print("Biblioteca Digitală - PDF Downloader")
        print("=" * 70)
        print("\nExemple de URL-uri:")
        print(" https://biblioteca-digitala.ro/?pub=7758-revista-romana-de-sociologie")
        print(" https://biblioteca-digitala.ro/?pub=6464-studii-si-cercetari-de-chimie")
        print()
        pub_url = input("Introdu URL-ul publicației: ").strip()
    # Checked for both input paths so an empty command-line argument is rejected too.
    if not pub_url:
        print("[ERROR] Nu ai introdus niciun URL!")
        sys.exit(1)

    print("=" * 70)
    print("Biblioteca Digitală - PDF Downloader")
    print("=" * 70)
    print(f"Publication URL: {pub_url}")
    session = requests.Session()

    # Step 1: Get publication page
    print(f"\n[1] Fetching publication page...")
    soup = get_soup(pub_url, session)
    if not soup:
        print("[FATAL] Could not load publication page")
        return
    pub_title = get_publication_title(soup)
    print(f" Publication: {pub_title}")
    if not publication_title_matches_url(pub_title, pub_url):
        expected_slug = get_publication_slug(pub_url)
        print("[FATAL] Pagina primita nu pare sa corespunda URL-ului introdus.")
        print(f" URL slug: {expected_slug}")
        print(f" Titlu pagina: {pub_title}")
        print(" Oprire ca sa nu descarce in folderul gresit.")
        return

    # Create per-publication output directory
    base_output_dir = BASE_OUTPUT_DIR
    output_dir = get_collection_dir(base_output_dir, pub_title, pub_url)
    os.makedirs(output_dir, exist_ok=True)
    print(f" Base output directory: {base_output_dir}")
    print(f" Collection directory: {output_dir}")

    # Step 2: Check if PDFs are directly on page or need to go to volumes
    pdf_links = get_pdf_links_from_page(soup, pub_url)
    if pdf_links:
        # PDFs are directly on the publication page
        print(f"\n[2] Found {len(pdf_links)} PDF links directly on page")
        total_pdfs = _reorganize_and_download(
            pdf_links, base_output_dir, output_dir, pub_title, pub_url, session
        )
    else:
        # Need to check volume pages for PDFs
        print(f"\n[2] No direct PDF links found, checking volume pages...")
        volume_links = get_volume_links(soup, pub_url)
        print(f" Found {len(volume_links)} volumes")
        pdf_links = _collect_volume_pdf_links(volume_links, session)
        print(f"\n[3] Found {len(pdf_links)} PDF links in volume pages")
        total_pdfs = _reorganize_and_download(
            pdf_links, base_output_dir, output_dir, pub_title, pub_url, session,
            show_names=True
        )

    # Summary
    if total_pdfs == 0:
        print("\n[INFO] Nu am gasit linkuri PDF descarcabile pentru aceasta publicatie.")
        print(" Daca site-ul afiseaza stelute negre in coloana de download,")
        print(" publicația are doar metadate/articole, fara fisiere PDF disponibile.")
        try:
            # Do not leave an empty folder behind when nothing was downloaded.
            if os.path.isdir(output_dir) and not os.listdir(output_dir):
                os.rmdir(output_dir)
                print(f"\n[INFO] No PDFs found; empty collection directory removed: {output_dir}")
        except OSError:
            pass
    print("\n" + "=" * 70)
    print("DOWNLOAD COMPLETE")
    print("=" * 70)
    print(f"Total PDFs downloaded: {total_pdfs}")
    print(f"Output directory: {os.path.abspath(output_dir)}")
# Run the downloader only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment