me-suzy · May 28, 2026 06:49
diff --git a/biiifghfg.py b/biiifghfg.py
 #!/usr/bin/env python3
 """
 Generic PDF downloader for biblioteca-digitala.ro
 Works with any publication page.

 Usage:
    python download_biblioteca_digitala.py "https://biblioteca-digitala.ro/?pub=7758-revista-romana-de-sociologie"
    python download_biblioteca_digitala.py "https://biblioteca-digitala.ro/?pub=6464-studii-si-cercetari-de-chimie"
 """

 import requests
 from bs4 import BeautifulSoup
 import os
 import re
 import shutil
 import sys
 import time
 import unicodedata
 from urllib.parse import urljoin, urlparse, parse_qs, unquote

 # HTTP headers passed to every session.get() call in this file: a desktop
 # Chrome User-Agent plus Accept / Accept-Language values that list HTML
 # first and Romanian before English.
 HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'ro-RO,ro;q=0.9,en;q=0.8',
 }

 # Root folder; main() creates one sub-folder per publication below it.
 BASE_OUTPUT_DIR = r"G:\DOWNLOAD BIBLIOTECA"
 # Generic words that get_significant_tokens() ignores when the page title is
 # compared with the URL slug (they appear in many publication names).
 TITLE_MATCH_STOPWORDS = {
    'anul', 'cercetari', 'cercetare', 'colectia', 'de', 'din', 'jurnal',
    'periodic', 'publicatie', 'revista', 'roman', 'romana', 'romane',
    'romanesc', 'romanesti', 'si', 'studii'
 }

 def get_soup(url, session):
    """Download ``url`` through ``session`` and parse the body as HTML.

    Returns the parsed BeautifulSoup tree.  If the request, the status
    check or the parsing raises, the error is printed and ``None`` is
    returned instead.
    """
    try:
        page = session.get(url, headers=HEADERS, timeout=30)
        page.raise_for_status()
        parsed = BeautifulSoup(page.content, 'html.parser')
    except Exception as exc:
        print(f"  [ERROR] Failed to fetch {url}: {exc}")
        parsed = None
    return parsed

 def sanitize_filename(name):
    """Return ``name`` made safe for use as a single file name.

    Characters that are not allowed in Windows file names and runs of
    whitespace are replaced with ``_``; leading/trailing dots and
    underscores are stripped.  The result is at most 200 characters long.
    When it has to be shortened, the stem is cut and the extension is kept,
    so a very long ``....pdf`` name does not silently lose its suffix.
    """
    max_length = 200
    name = re.sub(r'[<>:"/\\|?*]', '_', name)
    name = re.sub(r'\s+', '_', name)
    name = name.strip('._')
    if len(name) <= max_length:
        return name

    stem, ext = os.path.splitext(name)
    # No usable stem/extension split: fall back to a plain cut.
    if not stem or len(ext) >= max_length:
        return name[:max_length]
    return stem[:max_length - len(ext)] + ext

 def strip_diacritics(text):
    """Return ``text`` reduced to plain ASCII.

    Falsy input (``None``, ``''``) is treated as the empty string.  The text
    is NFKD-normalised and every code point that cannot be encoded as ASCII
    is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text or '')
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')

 def sanitize_folder_name(name):
    """Turn a publication title into a readable Windows folder name.

    Diacritics are removed, characters illegal in Windows names as well as
    ``_`` and ``-`` become spaces, whitespace is collapsed, and the result
    is title-cased and limited to 200 characters.  Returns
    ``"Unknown Publication"`` when nothing usable is left.
    """
    name = strip_diacritics(name)
    name = re.sub(r'[<>:"/\\|?*]', ' ', name)
    name = re.sub(r'[_-]+', ' ', name)
    name = re.sub(r'\s+', ' ', name)
    name = name.strip(' ._')
    if not name:
        return "Unknown Publication"
    # Strip again after the cut: it may land right after a space or a dot,
    # and Windows silently drops those from the end of a directory name, so
    # the folder on disk would no longer match the path computed here.
    return name.title()[:200].rstrip(' ._')

 def slugify_text(text):
    """Return a lowercase ASCII slug of ``text``.

    Diacritics are stripped, every run of characters outside ``a-z0-9``
    becomes a single ``-`` and leading/trailing dashes are removed.
    """
    lowered = strip_diacritics(text).lower()
    return re.sub(r'[^a-z0-9]+', '-', lowered).strip('-')

 def get_publication_slug(pub_url):
    """Return the title part of the ``pub`` query parameter of ``pub_url``.

    For ``?pub=10323-some-title`` the result is ``some-title``.  A value
    without a leading ``<digits>-`` part is returned unchanged, and a URL
    without a ``pub`` parameter yields the empty string.
    """
    params = parse_qs(urlparse(pub_url).query)
    raw_value = params.get('pub', [''])[0]
    numbered = re.match(r'\d+-(.+)', raw_value)
    return numbered.group(1) if numbered else raw_value

 def get_significant_tokens(text):
    """Return the slug words of ``text`` that are useful for title matching.

    A word is kept when it has at least four characters and is not listed
    in ``TITLE_MATCH_STOPWORDS``.  Order and duplicates are preserved.
    """
    significant = []
    for word in slugify_text(text).split('-'):
        if len(word) < 4 or word in TITLE_MATCH_STOPWORDS:
            continue
        significant.append(word)
    return significant

 def publication_title_matches_url(pub_title, pub_url):
    """Tell whether the fetched page title plausibly belongs to ``pub_url``.

    Returns ``True`` when at least one of the last two significant words of
    the URL slug also occurs in the title.  The check is skipped (``True``)
    when there is nothing to compare: no slug, no title, the placeholder
    title ``"Unknown_Publication"``, or no significant words on either side.
    """
    slug = get_publication_slug(pub_url)
    if not (slug and pub_title) or pub_title == "Unknown_Publication":
        return True

    slug_words = get_significant_tokens(slug)
    title_words = set(get_significant_tokens(pub_title))
    if not (slug_words and title_words):
        return True

    # Only the tail of the slug is compared; [-2:] also covers a one-word slug.
    for word in slug_words[-2:]:
        if word in title_words:
            return True
    return False

 def get_collection_dir(base_output_dir, pub_title, pub_url=None):
    """Return the download folder for one publication.

    The folder name comes from ``pub_title``; when the title is missing or
    is the ``"Unknown_Publication"`` placeholder, the URL slug is used, and
    ``"Unknown Publication"`` if that is empty too.
    """
    if pub_title and pub_title != "Unknown_Publication":
        source = pub_title
    else:
        source = get_publication_slug(pub_url or '') or "Unknown Publication"
    return os.path.join(base_output_dir, sanitize_folder_name(source))

 def infer_collection_prefix(filename):
    """Guess the collection part of a PDF file name.

    The extension-less base name is slugified; everything before the first
    issue marker (``-no``/``-nr``/``-numar``/``-numarul``/``-tom``/``-vol``/
    ``-volum`` followed by a digit) or a year between 1500 and 2099 is the
    prefix.  Without such a marker the whole slug is returned, and ``None``
    when the slug is empty.
    """
    base_name = os.path.basename(filename or '')
    slug = slugify_text(os.path.splitext(base_name)[0])
    if not slug:
        return None

    issue_marker = re.match(
        r'^(.+?)(?:-(?:no|nr|numar|numarul|tom|vol|volum)-?\d|-(?:1[5-9]\d{2}|20\d{2})(?:$|-))',
        slug
    )
    if issue_marker is None:
        return slug
    return issue_marker.group(1).strip('-')

 def build_existing_pdf_prefixes(pub_title, pub_url, pdf_links):
    """Collect the slug prefixes that identify PDFs of this publication.

    Candidates are the slug of the whole title, of every ``/``-separated
    part of the title, of the URL slug, and the prefix inferred from each
    PDF link's file name.  Only candidates of at least 8 characters are
    returned, sorted alphabetically.
    """
    candidates = {slugify_text(pub_title)}
    candidates.update(
        slugify_text(part) for part in re.split(r'\s*/\s*', pub_title or '')
    )
    candidates.add(slugify_text(get_publication_slug(pub_url)))

    for pdf_url, suggested_name in pdf_links:
        link_name = extract_filename_from_url(pdf_url) or suggested_name
        candidates.add(infer_collection_prefix(link_name))

    # Empty slugs and None (no inferable prefix) are dropped here.
    return sorted(item for item in candidates if item and len(item) >= 8)

 def make_unique_path(path):
    """Return ``path``, or a numbered variant of it that does not exist yet.

    When ``path`` is taken, ``_1``, ``_2``, ... is inserted before the
    extension until a free name is found.
    """
    if not os.path.exists(path):
        return path

    stem, suffix = os.path.splitext(path)
    number = 1
    candidate = f"{stem}_{number}{suffix}"
    while os.path.exists(candidate):
        number += 1
        candidate = f"{stem}_{number}{suffix}"
    return candidate

 def get_existing_collection_dir_for_prefix(base_output_dir, prefix):
    """Pick the folder under ``base_output_dir`` that belongs to ``prefix``.

    A folder matches when its slug equals the prefix, starts with
    ``prefix-``, ends with ``-prefix``, or is itself a ``-``-terminated
    beginning of the prefix.  Among several matches an exact one wins, then
    the shortest folder name.  Without a match, the path of a new folder
    named after the prefix is returned (it is not created here).
    """
    def _belongs(folder_key):
        return (
            folder_key == prefix
            or folder_key.startswith(prefix + '-')
            or folder_key.endswith('-' + prefix)
            or prefix.startswith(folder_key + '-')
        )

    def _rank(path):
        folder_name = os.path.basename(path)
        return (slugify_text(folder_name) != prefix, len(folder_name))

    candidates = [
        item.path
        for item in os.scandir(base_output_dir)
        if item.is_dir() and _belongs(slugify_text(item.name))
    ]
    if not candidates:
        return os.path.join(base_output_dir, sanitize_folder_name(prefix.replace('-', ' ')))
    return min(candidates, key=_rank)

 def organize_existing_pdf_groups(base_output_dir):
    """Sort PDFs lying directly in ``base_output_dir`` into collection folders.

    Files are grouped by the prefix inferred from their name (prefixes
    shorter than 8 characters are ignored); each group is moved into the
    matching existing folder or a newly created one.  Returns the number of
    files moved, or 0 when ``base_output_dir`` is not a directory.
    """
    if not os.path.isdir(base_output_dir):
        return 0

    by_prefix = {}
    for item in os.scandir(base_output_dir):
        if item.is_file() and item.name.lower().endswith('.pdf'):
            inferred = infer_collection_prefix(item.name)
            if inferred and len(inferred) >= 8:
                by_prefix.setdefault(inferred, []).append(item.path)

    moved = 0
    for inferred in sorted(by_prefix):
        target_dir = get_existing_collection_dir_for_prefix(base_output_dir, inferred)
        os.makedirs(target_dir, exist_ok=True)

        for source_path in by_prefix[inferred]:
            pdf_name = os.path.basename(source_path)
            target_path = make_unique_path(os.path.join(target_dir, pdf_name))
            shutil.move(source_path, target_path)
            print(f"    [MOVE] Existing orphan PDF moved to {os.path.basename(target_dir)}: {pdf_name}")
            moved += 1

    return moved

 def move_existing_collection_folders_to_collection(base_output_dir, collection_dir, pub_title, pub_url, pdf_links):
    """Merge PDFs from other folders of the same publication into ``collection_dir``.

    Every sub-folder of ``base_output_dir`` (except ``collection_dir``
    itself) whose slug matches one of the prefixes from
    ``build_existing_pdf_prefixes`` has its ``.pdf`` files moved into
    ``collection_dir``; existing files are never overwritten.  A source
    folder that ends up empty is removed.  Returns the number of files
    moved, or 0 when there is no base folder or no usable prefix.
    """
    if not os.path.isdir(base_output_dir):
        return 0

    prefixes = build_existing_pdf_prefixes(pub_title, pub_url, pdf_links)
    if not prefixes:
        return 0

    collection_dir = os.path.abspath(collection_dir)
    collection_key = os.path.normcase(collection_dir)
    moved_count = 0

    # Take a snapshot before changing anything: folders may be created or
    # removed below, and os.scandir does not specify what it yields when the
    # directory changes during iteration.
    with os.scandir(base_output_dir) as scan:
        folders = [(entry.name, entry.path) for entry in scan if entry.is_dir()]

    for folder_name, folder_path in folders:
        source_dir = os.path.abspath(folder_path)
        if os.path.normcase(source_dir) == collection_key:
            continue

        folder_key = slugify_text(folder_name)
        if not any(
            folder_key == prefix
            or folder_key.startswith(prefix + '-')
            or folder_key.endswith('-' + prefix)
            or prefix.startswith(folder_key + '-')
            for prefix in prefixes
        ):
            continue

        os.makedirs(collection_dir, exist_ok=True)
        # Same reason as above: list the PDFs first, then move them.
        with os.scandir(source_dir) as scan:
            pdf_files = [
                (item.name, item.path)
                for item in scan
                if item.is_file() and item.name.lower().endswith('.pdf')
            ]

        for pdf_name, pdf_path in pdf_files:
            destination = make_unique_path(os.path.join(collection_dir, pdf_name))
            shutil.move(pdf_path, destination)
            print(f"    [MOVE] Existing PDF moved from {folder_name}: {pdf_name}")
            moved_count += 1

        # Best effort: rmdir only succeeds when nothing else is left inside.
        try:
            os.rmdir(source_dir)
        except OSError:
            pass

    return moved_count

 def move_existing_pdfs_to_collection(base_output_dir, collection_dir, pub_title, pub_url, pdf_links):
    """Move this publication's PDFs from the base folder into ``collection_dir``.

    A PDF lying directly in ``base_output_dir`` is moved when the slug of
    its name equals one of the prefixes from ``build_existing_pdf_prefixes``
    or starts with ``prefix-``.  Existing files are never overwritten.
    Returns the number of files moved, or 0 when there is no base folder or
    no usable prefix.
    """
    if not os.path.isdir(base_output_dir):
        return 0

    prefixes = build_existing_pdf_prefixes(pub_title, pub_url, pdf_links)
    if not prefixes:
        return 0

    os.makedirs(collection_dir, exist_ok=True)
    moved_count = 0

    # List the PDFs before moving any of them: os.scandir does not specify
    # what it yields once entries are removed from the scanned directory.
    with os.scandir(base_output_dir) as scan:
        pdf_files = [
            (item.name, item.path)
            for item in scan
            if item.is_file() and item.name.lower().endswith('.pdf')
        ]

    for pdf_name, pdf_path in pdf_files:
        file_key = slugify_text(os.path.splitext(pdf_name)[0])
        if not any(file_key == prefix or file_key.startswith(prefix + '-') for prefix in prefixes):
            continue

        destination = make_unique_path(os.path.join(collection_dir, pdf_name))
        shutil.move(pdf_path, destination)
        print(f"    [MOVE] Existing PDF moved: {pdf_name}")
        moved_count += 1

    return moved_count

 def extract_filename_from_url(url):
    """Return the file name a download URL points to, or ``None``.

    The ``filename`` query parameter (``dl.asp?filename=...``) wins;
    otherwise the last path segment is used unless it is empty or is the
    ``dl.asp`` script itself.  The result is percent-decoded.
    """
    parts = urlparse(url)

    # dl.asp-style link: the name travels in the query string.
    from_query = parse_qs(parts.query).get('filename')
    if from_query is not None:
        return unquote(from_query[0])

    # Direct link: take the last path segment.
    last_segment = os.path.basename(parts.path) if parts.path else ''
    if last_segment and last_segment.lower() != 'dl.asp':
        return unquote(last_segment)

    return None

 def extract_filename_from_response(response):
    """Return the file name announced in ``Content-Disposition``, or ``None``.

    The extended ``filename*=charset'lang'value`` form (RFC 5987) is
    preferred when present; otherwise the plain ``filename=`` parameter is
    used.  Names that are empty or start with ``dl.asp`` (the download
    script itself) are rejected.

    ``response`` only needs a ``headers`` mapping with a ``get`` method.
    """
    cd = response.headers.get('Content-Disposition', '')
    if not cd:
        return None

    def _usable(candidate):
        candidate = candidate.strip()
        if candidate and not candidate.lower().startswith('dl.asp'):
            return candidate
        return None

    # Extended form first.  It must be matched separately: a pattern that
    # tolerates "filename*=" as if it were "filename=" would stop at the
    # first apostrophe and return just the charset ("UTF-8").
    extended = re.search(r"filename\*\s*=\s*([^'\s;]*)'[^']*'([^;\n]+)", cd, re.IGNORECASE)
    if extended:
        charset = extended.group(1) or 'utf-8'
        raw_value = extended.group(2).strip().strip('"')
        try:
            decoded = unquote(raw_value, encoding=charset, errors='replace')
        except LookupError:
            # Unknown charset label: decode as UTF-8 instead.
            decoded = unquote(raw_value)
        filename = _usable(decoded)
        if filename:
            return filename

    # Plain form, optionally quoted; "\s*=" cannot match "filename*=".
    plain = re.search(r'filename\s*=\s*([\'"]?)([^\'";\n]+)\1', cd, re.IGNORECASE)
    if plain:
        filename = _usable(unquote(plain.group(2)))
        if filename:
            return filename

    return None

 def get_publication_title(soup):
    """Return the text of the page's ``<h2 class="text-color-light">``.

    ``"Unknown_Publication"`` is returned when the heading is not found.
    """
    heading = soup.find('h2', class_='text-color-light')
    return heading.get_text(strip=True) if heading else "Unknown_Publication"

 def get_pdf_links_from_page(soup, base_url):
    """Collect the PDF download links of a page.

    Links are searched in the ``datatable-default`` table, or in the whole
    page (with a warning) when that table is missing.  A link counts when
    its ``href`` contains ``dl.asp`` or ``.pdf``.  If the link's ``onclick``
    carries a ``track_pdf('...')`` call, that URL is used instead of the
    ``href``.

    Returns a list of ``(absolute_url, suggested_filename)`` tuples; the
    file name may be empty or ``None`` when it cannot be derived.
    """
    table = soup.find('table', {'id': 'datatable-default'})
    if not table:
        print("[WARNING] Could not find data table, searching entire page...")
        search_area = soup
    else:
        search_area = table

    pdf_links = []
    for link in search_area.find_all('a', href=True):
        href = link['href']
        lowered = href.lower()
        if 'dl.asp' not in lowered and '.pdf' not in lowered:
            continue

        # Preferred source: the URL handed to track_pdf() in onclick.
        onclick = link.get('onclick', '')
        tracked = None
        if 'track_pdf' in onclick:
            tracked = re.search(r"track_pdf\(['\"]([^'\"]+)['\"]", onclick)
        if tracked:
            # Resolve against the page URL so a relative track_pdf() target
            # can be fetched too; absolute URLs pass through unchanged.
            direct_pdf_url = urljoin(base_url, tracked.group(1))
            filename = os.path.basename(urlparse(direct_pdf_url).path)
            pdf_links.append((direct_pdf_url, filename))
            continue

        # Fallback: the href itself (usually the dl.asp wrapper).
        full_url = urljoin(base_url, href)
        pdf_links.append((full_url, extract_filename_from_url(full_url)))

    return pdf_links

 def get_volume_links(soup, base_url):
    """Return the volume pages linked from the publication table.

    Every ``<a href>`` inside the ``datatable-default`` table whose ``href``
    contains ``volum=`` yields an ``(absolute_url, link_text)`` tuple.  An
    empty list is returned when the table is missing.
    """
    table = soup.find('table', {'id': 'datatable-default'})
    if not table:
        return []

    return [
        (urljoin(base_url, anchor['href']), anchor.get_text(strip=True))
        for anchor in table.find_all('a', href=True)
        if 'volum=' in anchor['href']
    ]

 def download_pdf(url, output_dir, suggested_filename, session):
    """Download one PDF into ``output_dir``.

    The file name is taken, in this order, from the URL, the
    ``Content-Disposition`` header, ``suggested_filename``, or a
    timestamp-based default; ``.pdf`` is appended when missing and the name
    is sanitised.  A file that already exists is not downloaded again.

    Returns ``(True, filename)`` on success or skip, ``(False, None)`` on
    any error (which is printed).
    """
    response = None
    temp_path = None
    try:
        response = session.get(url, headers=HEADERS, timeout=60, stream=True)
        response.raise_for_status()

        # Determine filename: URL param > Content-Disposition > suggested
        filename = extract_filename_from_url(url)
        if not filename or filename.lower() == 'dl.asp':
            filename = extract_filename_from_response(response)
        if not filename:
            filename = suggested_filename
        if not filename:
            filename = f"download_{int(time.time())}.pdf"
        if not filename.lower().endswith('.pdf'):
            filename += '.pdf'

        filename = sanitize_filename(filename)
        output_path = os.path.join(output_dir, filename)

        if os.path.exists(output_path):
            print(f"    [SKIP] Already exists: {filename}")
            return True, filename

        # Write to a side file and rename only when complete.  Otherwise an
        # interrupted transfer would leave a truncated .pdf that the
        # "already exists" check above accepts on the next run.
        temp_path = output_path + '.part'
        with open(temp_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
        os.replace(temp_path, output_path)
        temp_path = None

        return True, filename
    except Exception as e:
        print(f"    [ERROR] Download failed: {e}")
        return False, None
    finally:
        # Remove an unfinished side file, if any.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass
        # stream=True keeps the connection open until the response is closed.
        if response is not None:
            response.close()

 def _collect_volume_pdf_links(volume_links, session):
    """Visit each volume page and return all ``(pdf_url, suggested_name)`` found.

    Links without a suggested name get one derived from the volume title.
    Volume pages that cannot be fetched are skipped.
    """
    pdf_links = []
    for i, (vol_url, vol_name) in enumerate(volume_links, 1):
        print(f"\n[{i}/{len(volume_links)}] Scanning: {vol_name}")

        vol_soup = get_soup(vol_url, session)
        if not vol_soup:
            continue

        for pdf_url, suggested_name in get_pdf_links_from_page(vol_soup, vol_url):
            if not suggested_name:
                suggested_name = f"{sanitize_filename(vol_name)}.pdf"
            pdf_links.append((pdf_url, suggested_name))

        time.sleep(0.5)
    return pdf_links


 def _reorganize_and_download(base_output_dir, output_dir, pub_title, pub_url, pdf_links, session, show_names=False):
    """Tidy up earlier downloads, then fetch every link in ``pdf_links``.

    With ``show_names`` the progress line includes the first 50 characters
    of the suggested file name.  Returns how many PDFs were saved or were
    already present.
    """
    moved_count = move_existing_pdfs_to_collection(
        base_output_dir, output_dir, pub_title, pub_url, pdf_links
    )
    print(f"    Existing PDFs moved from base folder: {moved_count}")
    folder_moved_count = move_existing_collection_folders_to_collection(
        base_output_dir, output_dir, pub_title, pub_url, pdf_links
    )
    print(f"    Existing PDFs moved from matching folders: {folder_moved_count}")
    orphan_moved_count = organize_existing_pdf_groups(base_output_dir)
    print(f"    Other orphan PDFs grouped from base folder: {orphan_moved_count}")

    downloaded = 0
    for i, (pdf_url, suggested_name) in enumerate(pdf_links, 1):
        if show_names:
            print(f"\n    [{i}/{len(pdf_links)}] Downloading: {suggested_name[:50]}...")
        else:
            print(f"\n    [{i}/{len(pdf_links)}] Downloading...")
        success, filename = download_pdf(pdf_url, output_dir, suggested_name, session)
        if success and filename:
            print(f"    Saved as: {filename}")
            downloaded += 1
        # Short pause between requests.
        time.sleep(0.5)
    return downloaded


 def main():
    """Command-line entry point: download all PDFs of one publication.

    The publication URL comes from the first command-line argument or, if
    absent, from an interactive prompt (exits with status 1 on empty
    input).  The page is fetched, checked against the URL slug, and its
    PDFs - found directly on the page or on its volume pages - are saved
    into a per-publication folder under ``BASE_OUTPUT_DIR``.
    """
    if len(sys.argv) >= 2:
        pub_url = sys.argv[1]
    else:
        print("=" * 70)
        print("Biblioteca Digitală - PDF Downloader")
        print("=" * 70)
        print("\nExemple de URL-uri:")
        print("  https://biblioteca-digitala.ro/?pub=7758-revista-romana-de-sociologie")
        print("  https://biblioteca-digitala.ro/?pub=6464-studii-si-cercetari-de-chimie")
        print()
        pub_url = input("Introdu URL-ul publicației: ").strip()

        if not pub_url:
            print("[ERROR] Nu ai introdus niciun URL!")
            sys.exit(1)

    print("=" * 70)
    print("Biblioteca Digitală - PDF Downloader")
    print("=" * 70)
    print(f"Publication URL: {pub_url}")

    session = requests.Session()

    # Step 1: Get publication page
    print("\n[1] Fetching publication page...")
    soup = get_soup(pub_url, session)
    if not soup:
        print("[FATAL] Could not load publication page")
        return

    pub_title = get_publication_title(soup)
    print(f"    Publication: {pub_title}")

    # Stop early if the page does not look like the one that was asked for.
    if not publication_title_matches_url(pub_title, pub_url):
        expected_slug = get_publication_slug(pub_url)
        print("[FATAL] Pagina primita nu pare sa corespunda URL-ului introdus.")
        print(f"        URL slug: {expected_slug}")
        print(f"        Titlu pagina: {pub_title}")
        print("        Oprire ca sa nu descarce in folderul gresit.")
        return

    # Create per-publication output directory
    base_output_dir = BASE_OUTPUT_DIR
    output_dir = get_collection_dir(base_output_dir, pub_title, pub_url)
    os.makedirs(output_dir, exist_ok=True)
    print(f"    Base output directory: {base_output_dir}")
    print(f"    Collection directory: {output_dir}")

    # Step 2: PDFs are either linked on the page itself or on volume pages
    pdf_links = get_pdf_links_from_page(soup, pub_url)

    if pdf_links:
        print(f"\n[2] Found {len(pdf_links)} PDF links directly on page")
        show_names = False
    else:
        print("\n[2] No direct PDF links found, checking volume pages...")
        volume_links = get_volume_links(soup, pub_url)
        print(f"    Found {len(volume_links)} volumes")

        pdf_links = _collect_volume_pdf_links(volume_links, session)
        print(f"\n[3] Found {len(pdf_links)} PDF links in volume pages")
        show_names = True

    total_pdfs = _reorganize_and_download(
        base_output_dir, output_dir, pub_title, pub_url, pdf_links, session,
        show_names=show_names,
    )

    # Summary
    if total_pdfs == 0:
        print("\n[INFO] Nu am gasit linkuri PDF descarcabile pentru aceasta publicatie.")
        print("       Daca site-ul afiseaza stelute negre in coloana de download,")
        print("       publicația are doar metadate/articole, fara fisiere PDF disponibile.")
        # Do not leave an empty collection folder behind (best effort).
        try:
            if os.path.isdir(output_dir) and not os.listdir(output_dir):
                os.rmdir(output_dir)
                print(f"\n[INFO] No PDFs found; empty collection directory removed: {output_dir}")
        except OSError:
            pass

    print("\n" + "=" * 70)
    print("DOWNLOAD COMPLETE")
    print("=" * 70)
    print(f"Total PDFs downloaded: {total_pdfs}")
    print(f"Output directory: {os.path.abspath(output_dir)}")

 # Run the downloader only when this file is executed as a script.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	Generic PDF downloader for biblioteca-digitala.ro
	Works with any publication page.

	Usage:
	python download_biblioteca_digitala.py "https://biblioteca-digitala.ro/?pub=7758-revista-romana-de-sociologie"
	python download_biblioteca_digitala.py "https://biblioteca-digitala.ro/?pub=6464-studii-si-cercetari-de-chimie"
	"""

	import requests
	from bs4 import BeautifulSoup
	import os
	import re
	import shutil
	import sys
	import time
	import unicodedata
	from urllib.parse import urljoin, urlparse, parse_qs, unquote

	# HTTP headers passed to every session.get() call in this file.
	HEADERS = {
	    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
	    # "*/*" restored: the wildcard media range had lost its asterisks.
	    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
	    'Accept-Language': 'ro-RO,ro;q=0.9,en;q=0.8',
	}

	# Root folder; one sub-folder per publication is created below it.
	BASE_OUTPUT_DIR = r"G:\DOWNLOAD BIBLIOTECA"
	# Generic words ignored when a page title is compared with the URL slug.
	TITLE_MATCH_STOPWORDS = {
	    'anul', 'cercetari', 'cercetare', 'colectia', 'de', 'din', 'jurnal',
	    'periodic', 'publicatie', 'revista', 'roman', 'romana', 'romane',
	    'romanesc', 'romanesti', 'si', 'studii'
	}

	def get_soup(url, session):
	    """Fetch ``url`` with ``session`` and return it parsed as HTML.

	    Any error is printed and ``None`` is returned.
	    """
	    try:
	        response = session.get(url, headers=HEADERS, timeout=30)
	        response.raise_for_status()
	        return BeautifulSoup(response.content, 'html.parser')
	    except Exception as e:
	        print(f"  [ERROR] Failed to fetch {url}: {e}")
	        return None

	def sanitize_filename(name):
	    """Return ``name`` made safe for use as a single file name.

	    Illegal Windows characters and whitespace runs become ``_``; leading
	    and trailing dots/underscores are stripped.  The result is at most 200
	    characters long; when shortened, the extension is kept.
	    """
	    max_length = 200
	    name = re.sub(r'[<>:"/\\|?*]', '_', name)
	    name = re.sub(r'\s+', '_', name)
	    name = name.strip('._')
	    if len(name) <= max_length:
	        return name

	    stem, ext = os.path.splitext(name)
	    # No usable stem/extension split: fall back to a plain cut.
	    if not stem or len(ext) >= max_length:
	        return name[:max_length]
	    return stem[:max_length - len(ext)] + ext

	def strip_diacritics(text):
	    """Return ``text`` (``None`` counts as ``''``) reduced to plain ASCII."""
	    return unicodedata.normalize('NFKD', text or '').encode('ascii', 'ignore').decode('ascii')

	def sanitize_folder_name(name):
	    """Turn a publication title into a readable Windows folder name.

	    Returns ``"Unknown Publication"`` when nothing usable is left.
	    """
	    name = strip_diacritics(name)
	    name = re.sub(r'[<>:"/\\|?*]', ' ', name)
	    name = re.sub(r'[_-]+', ' ', name)
	    name = re.sub(r'\s+', ' ', name)
	    name = name.strip(' ._')
	    if not name:
	        return "Unknown Publication"
	    # Strip again after the cut: Windows drops trailing spaces and dots
	    # from directory names.
	    return name.title()[:200].rstrip(' ._')

	def slugify_text(text):
	    """Return a lowercase ASCII slug of ``text`` with ``-`` as separator."""
	    text = strip_diacritics(text).lower()
	    text = re.sub(r'[^a-z0-9]+', '-', text)
	    return text.strip('-')

	def get_publication_slug(pub_url):
	    """Return the title part of ``?pub=10323-some-title`` (``some-title``).

	    A value without the numeric prefix is returned unchanged; a missing
	    ``pub`` parameter yields ``''``.
	    """
	    query_params = parse_qs(urlparse(pub_url).query)
	    pub_value = query_params.get('pub', [''])[0]
	    match = re.match(r'\d+-(.+)', pub_value)
	    if match:
	        return match.group(1)
	    return pub_value

	def get_significant_tokens(text):
	    """Return slug words of ``text`` with 4+ characters that are not stopwords."""
	    return [
	        token for token in slugify_text(text).split('-')
	        if len(token) >= 4 and token not in TITLE_MATCH_STOPWORDS
	    ]

	def publication_title_matches_url(pub_title, pub_url):
	    """Tell whether the fetched page title plausibly belongs to ``pub_url``.

	    ``True`` when one of the last two significant slug words occurs in the
	    title, or when there is nothing to compare.
	    """
	    pub_slug = get_publication_slug(pub_url)
	    if not pub_slug or not pub_title or pub_title == "Unknown_Publication":
	        return True

	    url_tokens = get_significant_tokens(pub_slug)
	    title_tokens = set(get_significant_tokens(pub_title))
	    if not url_tokens or not title_tokens:
	        return True

	    # [-2:] also covers a slug with a single significant word.
	    return any(token in title_tokens for token in url_tokens[-2:])

	def get_collection_dir(base_output_dir, pub_title, pub_url=None):
	    """Return the per-publication download directory.

	    Falls back to the URL slug when the title is missing or a placeholder.
	    """
	    folder_source = pub_title
	    if not folder_source or folder_source == "Unknown_Publication":
	        folder_source = get_publication_slug(pub_url or '') or "Unknown Publication"
	    return os.path.join(base_output_dir, sanitize_folder_name(folder_source))

	def infer_collection_prefix(filename):
	    """Infer the collection prefix from a PDF file name.

	    Returns the slug part before the first issue marker or year, the whole
	    slug when there is none, or ``None`` for an empty slug.
	    """
	    stem = os.path.splitext(os.path.basename(filename or ''))[0]
	    key = slugify_text(stem)
	    if not key:
	        return None

	    # Alternation bars restored: escaped as "\|" they only matched a
	    # literal "|" and the pattern could never fire.
	    match = re.match(
	        r'^(.+?)(?:-(?:no|nr|numar|numarul|tom|vol|volum)-?\d|-(?:1[5-9]\d{2}|20\d{2})(?:$|-))',
	        key
	    )
	    if match:
	        return match.group(1).strip('-')

	    return key

	def build_existing_pdf_prefixes(pub_title, pub_url, pdf_links):
	    """Build the slug prefixes (8+ characters, sorted) that identify this publication's PDFs."""
	    prefixes = set()

	    title_slug = slugify_text(pub_title)
	    if title_slug:
	        prefixes.add(title_slug)

	    # "\s*/\s*" restored so that "A/B" splits as well as "A / B".
	    for title_part in re.split(r'\s*/\s*', pub_title or ''):
	        title_part_slug = slugify_text(title_part)
	        if title_part_slug:
	            prefixes.add(title_part_slug)

	    pub_slug = slugify_text(get_publication_slug(pub_url))
	    if pub_slug:
	        prefixes.add(pub_slug)

	    for pdf_url, suggested_name in pdf_links:
	        filename = extract_filename_from_url(pdf_url) or suggested_name
	        prefix = infer_collection_prefix(filename)
	        if prefix:
	            prefixes.add(prefix)

	    return sorted(prefix for prefix in prefixes if len(prefix) >= 8)

	def make_unique_path(path):
	    """Return ``path``, or ``<root>_<n><ext>`` with the first free ``n`` if it exists."""
	    if not os.path.exists(path):
	        return path

	    root, ext = os.path.splitext(path)
	    counter = 1
	    while True:
	        candidate = f"{root}_{counter}{ext}"
	        if not os.path.exists(candidate):
	            return candidate
	        counter += 1

	def get_existing_collection_dir_for_prefix(base_output_dir, prefix):
	    """Find the existing collection folder that matches ``prefix``.

	    An exact slug match wins, then the shortest folder name.  Without a
	    match, the path of a new folder named after the prefix is returned.
	    """
	    matches = []

	    with os.scandir(base_output_dir) as scan:
	        for entry in scan:
	            if not entry.is_dir():
	                continue

	            folder_key = slugify_text(entry.name)
	            if (
	                folder_key == prefix
	                or folder_key.startswith(prefix + '-')
	                or folder_key.endswith('-' + prefix)
	                or prefix.startswith(folder_key + '-')
	            ):
	                matches.append(entry.path)

	    if matches:
	        matches.sort(key=lambda path: (slugify_text(os.path.basename(path)) != prefix, len(os.path.basename(path))))
	        return matches[0]

	    return os.path.join(base_output_dir, sanitize_folder_name(prefix.replace('-', ' ')))

	def organize_existing_pdf_groups(base_output_dir):
	    """Group PDFs left directly in the base folder into collection folders.

	    Returns the number of files moved (0 when the base folder is missing).
	    """
	    if not os.path.isdir(base_output_dir):
	        return 0

	    groups = {}
	    with os.scandir(base_output_dir) as scan:
	        for entry in scan:
	            if not entry.is_file() or not entry.name.lower().endswith('.pdf'):
	                continue

	            prefix = infer_collection_prefix(entry.name)
	            if prefix and len(prefix) >= 8:
	                groups.setdefault(prefix, []).append(entry.path)

	    moved_count = 0
	    for prefix, file_paths in sorted(groups.items()):
	        destination_dir = get_existing_collection_dir_for_prefix(base_output_dir, prefix)
	        os.makedirs(destination_dir, exist_ok=True)

	        for file_path in file_paths:
	            filename = os.path.basename(file_path)
	            destination = make_unique_path(os.path.join(destination_dir, filename))
	            shutil.move(file_path, destination)
	            print(f"    [MOVE] Existing orphan PDF moved to {os.path.basename(destination_dir)}: {filename}")
	            moved_count += 1

	    return moved_count

	def move_existing_collection_folders_to_collection(base_output_dir, collection_dir, pub_title, pub_url, pdf_links):
	    """Merge PDFs from other matching folders into ``collection_dir``.

	    Source folders that end up empty are removed.  Returns the number of
	    files moved.
	    """
	    if not os.path.isdir(base_output_dir):
	        return 0

	    prefixes = build_existing_pdf_prefixes(pub_title, pub_url, pdf_links)
	    if not prefixes:
	        return 0

	    collection_dir = os.path.abspath(collection_dir)
	    moved_count = 0

	    # Snapshot first: the directory tree is modified below and os.scandir
	    # does not specify what it yields when that happens mid-iteration.
	    with os.scandir(base_output_dir) as scan:
	        folders = [(entry.name, entry.path) for entry in scan if entry.is_dir()]

	    for folder_name, folder_path in folders:
	        source_dir = os.path.abspath(folder_path)
	        if os.path.normcase(source_dir) == os.path.normcase(collection_dir):
	            continue

	        folder_key = slugify_text(folder_name)
	        if not any(
	            folder_key == prefix
	            or folder_key.startswith(prefix + '-')
	            or folder_key.endswith('-' + prefix)
	            or prefix.startswith(folder_key + '-')
	            for prefix in prefixes
	        ):
	            continue

	        os.makedirs(collection_dir, exist_ok=True)
	        with os.scandir(source_dir) as scan:
	            pdf_files = [
	                (item.name, item.path)
	                for item in scan
	                if item.is_file() and item.name.lower().endswith('.pdf')
	            ]

	        for pdf_name, pdf_path in pdf_files:
	            destination = make_unique_path(os.path.join(collection_dir, pdf_name))
	            shutil.move(pdf_path, destination)
	            print(f"    [MOVE] Existing PDF moved from {folder_name}: {pdf_name}")
	            moved_count += 1

	        # Best effort: rmdir only succeeds when the folder is empty.
	        try:
	            os.rmdir(source_dir)
	        except OSError:
	            pass

	    return moved_count

	def move_existing_pdfs_to_collection(base_output_dir, collection_dir, pub_title, pub_url, pdf_links):
	    """Move matching PDFs from the base folder into ``collection_dir``.

	    Returns the number of files moved.
	    """
	    if not os.path.isdir(base_output_dir):
	        return 0

	    prefixes = build_existing_pdf_prefixes(pub_title, pub_url, pdf_links)
	    if not prefixes:
	        return 0

	    os.makedirs(collection_dir, exist_ok=True)
	    moved_count = 0

	    # List the PDFs before moving any of them out of the scanned folder.
	    with os.scandir(base_output_dir) as scan:
	        pdf_files = [
	            (item.name, item.path)
	            for item in scan
	            if item.is_file() and item.name.lower().endswith('.pdf')
	        ]

	    for pdf_name, pdf_path in pdf_files:
	        file_key = slugify_text(os.path.splitext(pdf_name)[0])
	        if not any(file_key == prefix or file_key.startswith(prefix + '-') for prefix in prefixes):
	            continue

	        destination = make_unique_path(os.path.join(collection_dir, pdf_name))
	        shutil.move(pdf_path, destination)
	        print(f"    [MOVE] Existing PDF moved: {pdf_name}")
	        moved_count += 1

	    return moved_count

	def extract_filename_from_url(url):
	    """Return the file name from ``dl.asp?filename=...`` or the URL path, else ``None``."""
	    parsed = urlparse(url)

	    # Try to get from query parameter 'filename'
	    query_params = parse_qs(parsed.query)
	    if 'filename' in query_params:
	        filename = query_params['filename'][0]
	        return unquote(filename)

	    # Fallback: last path segment, unless it is the dl.asp script itself
	    path = parsed.path
	    if path:
	        filename = os.path.basename(path)
	        if filename and filename.lower() != 'dl.asp':
	            return unquote(filename)

	    return None

	def extract_filename_from_response(response):
	    """Return the file name announced in ``Content-Disposition``, or ``None``.

	    The extended ``filename*=charset'lang'value`` form (RFC 5987) is
	    preferred over plain ``filename=``.  Names that are empty or start
	    with ``dl.asp`` are rejected.
	    """
	    cd = response.headers.get('Content-Disposition', '')
	    if not cd:
	        return None

	    def _usable(candidate):
	        candidate = candidate.strip()
	        if candidate and not candidate.lower().startswith('dl.asp'):
	            return candidate
	        return None

	    # Extended form first; matched separately so that "filename*=" is
	    # never read as a plain parameter whose value is the charset.
	    extended = re.search(r"filename\*\s*=\s*([^'\s;]*)'[^']*'([^;\n]+)", cd, re.IGNORECASE)
	    if extended:
	        charset = extended.group(1) or 'utf-8'
	        raw_value = extended.group(2).strip().strip('"')
	        try:
	            decoded = unquote(raw_value, encoding=charset, errors='replace')
	        except LookupError:
	            # Unknown charset label: decode as UTF-8 instead.
	            decoded = unquote(raw_value)
	        filename = _usable(decoded)
	        if filename:
	            return filename

	    # Plain form, optionally quoted.
	    plain = re.search(r'filename\s*=\s*([\'"]?)([^\'";\n]+)\1', cd, re.IGNORECASE)
	    if plain:
	        filename = _usable(unquote(plain.group(2)))
	        if filename:
	            return filename

	    return None

	def get_publication_title(soup):
	    """Return the text of ``<h2 class="text-color-light">`` or ``"Unknown_Publication"``."""
	    h2 = soup.find('h2', class_='text-color-light')
	    if h2:
	        return h2.get_text(strip=True)
	    return "Unknown_Publication"

	def get_pdf_links_from_page(soup, base_url):
	    """Collect ``(absolute_url, suggested_filename)`` for every PDF link on a page.

	    Links are searched in the ``datatable-default`` table, or in the whole
	    page (with a warning) when it is missing.  A ``track_pdf('...')`` URL
	    in ``onclick`` takes precedence over the ``href``.
	    """
	    pdf_links = []

	    table = soup.find('table', {'id': 'datatable-default'})
	    if not table:
	        print("[WARNING] Could not find data table, searching entire page...")
	        search_area = soup
	    else:
	        search_area = table

	    for link in search_area.find_all('a', href=True):
	        href = link['href']

	        # Only dl.asp?filename=... wrappers or direct .pdf links count
	        if 'dl.asp' not in href.lower() and '.pdf' not in href.lower():
	            continue

	        onclick = link.get('onclick', '')
	        if 'track_pdf' in onclick:
	            match = re.search(r"track_pdf\(['\"]([^'\"]+)['\"]", onclick)
	            if match:
	                # Resolve against the page URL so a relative target can be
	                # fetched; absolute URLs pass through unchanged.
	                direct_pdf_url = urljoin(base_url, match.group(1))
	                filename = os.path.basename(urlparse(direct_pdf_url).path)
	                pdf_links.append((direct_pdf_url, filename))
	                continue

	        # Fallback: use href (dl.asp URL)
	        full_url = urljoin(base_url, href)
	        filename = extract_filename_from_url(full_url)
	        pdf_links.append((full_url, filename))

	    return pdf_links

	def get_volume_links(soup, base_url):
	    """List the volume sub-pages linked from a publication page.

	    Scans anchors inside the ``datatable-default`` table and keeps those
	    whose ``href`` contains ``volum=``.

	    Returns:
	        List of ``(absolute_url, link_text)`` tuples in page order; an empty
	        list when the table is missing.
	    """
	    table = soup.find('table', {'id': 'datatable-default'})
	    if not table:
	        return []

	    return [
	        (urljoin(base_url, anchor['href']), anchor.get_text(strip=True))
	        for anchor in table.find_all('a', href=True)
	        if 'volum=' in anchor['href']
	    ]

	def download_pdf(url, output_dir, suggested_filename, session):
	    """Download one PDF into ``output_dir``.

	    The target filename is chosen in this order: filename derived from the
	    URL, Content-Disposition header, ``suggested_filename``, and finally a
	    timestamp-based name. A ``.pdf`` extension is appended when missing and
	    the name is passed through ``sanitize_filename``.

	    The body is streamed into a temporary ``<name>.part`` file and renamed
	    onto the final path only after the transfer completes, so an
	    interrupted download never leaves a truncated file that a later run
	    would skip as "already exists".

	    Args:
	        url: URL of the PDF (direct link or ``dl.asp`` wrapper).
	        output_dir: Existing directory to save into.
	        suggested_filename: Fallback filename; may be empty or ``None``.
	        session: ``requests.Session``-like object used for the GET.

	    Returns:
	        ``(True, filename)`` when the file was saved or already existed,
	        ``(False, None)`` when the download failed (the error is printed).
	    """
	    response = None
	    partial_path = None
	    try:
	        response = session.get(url, headers=HEADERS, timeout=60, stream=True)
	        response.raise_for_status()

	        # Determine filename: URL param > Content-Disposition > suggested
	        filename = extract_filename_from_url(url)

	        if not filename or filename.lower() == 'dl.asp':
	            filename = extract_filename_from_response(response)

	        if not filename:
	            filename = suggested_filename

	        if not filename:
	            filename = f"download_{int(time.time())}.pdf"

	        if not filename.lower().endswith('.pdf'):
	            filename += '.pdf'

	        filename = sanitize_filename(filename)
	        output_path = os.path.join(output_dir, filename)

	        # Check if already exists
	        if os.path.exists(output_path):
	            print(f" [SKIP] Already exists: {filename}")
	            return True, filename

	        # Stream to a side file first; only a complete transfer is
	        # promoted to the final name.
	        partial_path = output_path + '.part'
	        with open(partial_path, 'wb') as f:
	            for chunk in response.iter_content(chunk_size=8192):
	                f.write(chunk)
	        os.replace(partial_path, output_path)
	        partial_path = None

	        return True, filename
	    except Exception as e:
	        # Best-effort boundary: one failed file must not abort the batch.
	        print(f" [ERROR] Download failed: {e}")
	        return False, None
	    finally:
	        # With stream=True the connection stays checked out until the
	        # response is closed, including on the skip and error paths.
	        if response is not None:
	            response.close()
	        # Remove the leftovers of an interrupted transfer.
	        if partial_path is not None and os.path.exists(partial_path):
	            try:
	                os.remove(partial_path)
	            except OSError:
	                pass

	def main():
	"""Command-line entry point for downloading one publication's PDFs.

	Reads the publication URL from the first command-line argument, or
	prompts for it interactively when none is given (exits with status 1 on
	an empty answer). Fetches the publication page, checks the page title
	against the URL via publication_title_matches_url, and creates the
	collection directory returned by get_collection_dir under
	BASE_OUTPUT_DIR. PDF links are taken from the publication page itself
	or, when it has none, from each linked volume page. Each link is passed
	to download_pdf, and a summary is printed at the end.
	"""
	# Prefer the URL given on the command line; otherwise show a banner
	# with example URLs and prompt for one.
	if len(sys.argv) >= 2:
	pub_url = sys.argv[1]
	else:
	print("=" * 70)
	print("Biblioteca Digitală - PDF Downloader")
	print("=" * 70)
	print("\nExemple de URL-uri:")
	print(" https://biblioteca-digitala.ro/?pub=7758-revista-romana-de-sociologie")
	print(" https://biblioteca-digitala.ro/?pub=6464-studii-si-cercetari-de-chimie")
	print()
	pub_url = input("Introdu URL-ul publicației: ").strip()

	# An empty URL is reported and ends the process with exit status 1.
	if not pub_url:
	print("[ERROR] Nu ai introdus niciun URL!")
	sys.exit(1)

	print("=" * 70)
	print("Biblioteca Digitală - PDF Downloader")
	print("=" * 70)
	print(f"Publication URL: {pub_url}")

	# One session is reused for every page fetch and file download below.
	session = requests.Session()

	# Step 1: Get publication page
	print(f"\n[1] Fetching publication page...")
	soup = get_soup(pub_url, session)
	if not soup:
	print("[FATAL] Could not load publication page")
	return

	pub_title = get_publication_title(soup)
	print(f" Publication: {pub_title}")

	# Stop when the helper reports that the fetched page's title does not
	# correspond to the requested URL, so files are not saved into the wrong
	# folder. (Matching rules live in publication_title_matches_url, defined
	# elsewhere in this file.)
	if not publication_title_matches_url(pub_title, pub_url):
	expected_slug = get_publication_slug(pub_url)
	print("[FATAL] Pagina primita nu pare sa corespunda URL-ului introdus.")
	print(f" URL slug: {expected_slug}")
	print(f" Titlu pagina: {pub_title}")
	print(" Oprire ca sa nu descarce in folderul gresit.")
	return

	# Create per-publication output directory
	base_output_dir = BASE_OUTPUT_DIR
	output_dir = get_collection_dir(base_output_dir, pub_title, pub_url)
	os.makedirs(output_dir, exist_ok=True)
	print(f" Base output directory: {base_output_dir}")
	print(f" Collection directory: {output_dir}")

	# Step 2: Check if PDFs are directly on page or need to go to volumes
	pdf_links = get_pdf_links_from_page(soup, pub_url)

	# Incremented whenever download_pdf reports success with a filename; that
	# includes files it skipped because they already exist on disk.
	total_pdfs = 0

	if pdf_links:
	# PDFs are directly on the publication page
	print(f"\n[2] Found {len(pdf_links)} PDF links directly on page")

	# NOTE(review): the three helpers below are defined elsewhere in this
	# file; judging by their names and the messages printed here they
	# relocate previously downloaded PDFs before new downloads start —
	# confirm against their definitions.
	moved_count = move_existing_pdfs_to_collection(
	base_output_dir, output_dir, pub_title, pub_url, pdf_links
	)
	print(f" Existing PDFs moved from base folder: {moved_count}")
	folder_moved_count = move_existing_collection_folders_to_collection(
	base_output_dir, output_dir, pub_title, pub_url, pdf_links
	)
	print(f" Existing PDFs moved from matching folders: {folder_moved_count}")
	orphan_moved_count = organize_existing_pdf_groups(base_output_dir)
	print(f" Other orphan PDFs grouped from base folder: {orphan_moved_count}")

	# Download each link in page order, pausing 0.5 s between requests.
	for i, (pdf_url, suggested_name) in enumerate(pdf_links, 1):
	print(f"\n [{i}/{len(pdf_links)}] Downloading...")
	success, filename = download_pdf(pdf_url, output_dir, suggested_name, session)
	if success and filename:
	print(f" Saved as: {filename}")
	total_pdfs += 1
	time.sleep(0.5)
	else:
	# Need to check volume pages for PDFs
	print(f"\n[2] No direct PDF links found, checking volume pages...")
	volume_links = get_volume_links(soup, pub_url)
	print(f" Found {len(volume_links)} volumes")

	# Accumulates the links found across all volume pages.
	pdf_links = []

	for i, (vol_url, vol_name) in enumerate(volume_links, 1):
	print(f"\n[{i}/{len(volume_links)}] Scanning: {vol_name}")

	# A volume page that fails to load is skipped; get_soup has already
	# printed the error.
	vol_soup = get_soup(vol_url, session)
	if not vol_soup:
	continue

	vol_pdf_links = get_pdf_links_from_page(vol_soup, vol_url)

	# When a link carries no filename, fall back to one built from the
	# volume's link text.
	for pdf_url, suggested_name in vol_pdf_links:
	if not suggested_name:
	suggested_name = f"{sanitize_filename(vol_name)}.pdf"
	pdf_links.append((pdf_url, suggested_name))

	time.sleep(0.5)

	print(f"\n[3] Found {len(pdf_links)} PDF links in volume pages")
	# Same sequence of helper calls as in the direct-links branch above.
	moved_count = move_existing_pdfs_to_collection(
	base_output_dir, output_dir, pub_title, pub_url, pdf_links
	)
	print(f" Existing PDFs moved from base folder: {moved_count}")
	folder_moved_count = move_existing_collection_folders_to_collection(
	base_output_dir, output_dir, pub_title, pub_url, pdf_links
	)
	print(f" Existing PDFs moved from matching folders: {folder_moved_count}")
	orphan_moved_count = organize_existing_pdf_groups(base_output_dir)
	print(f" Other orphan PDFs grouped from base folder: {orphan_moved_count}")

	# Every suggested_name here is non-empty (filled in above), so slicing
	# it for the progress line is safe.
	for i, (pdf_url, suggested_name) in enumerate(pdf_links, 1):
	print(f"\n [{i}/{len(pdf_links)}] Downloading: {suggested_name[:50]}...")
	success, filename = download_pdf(pdf_url, output_dir, suggested_name, session)
	if success and filename:
	print(f" Saved as: {filename}")
	total_pdfs += 1
	time.sleep(0.5)

	# Summary
	if total_pdfs == 0:
	print("\n[INFO] Nu am gasit linkuri PDF descarcabile pentru aceasta publicatie.")
	print(" Daca site-ul afiseaza stelute negre in coloana de download,")
	print(" publicația are doar metadate/articole, fara fisiere PDF disponibile.")
	# Best-effort cleanup: remove the collection directory only when it is
	# empty; filesystem errors are deliberately ignored.
	try:
	if os.path.isdir(output_dir) and not os.listdir(output_dir):
	os.rmdir(output_dir)
	print(f"\n[INFO] No PDFs found; empty collection directory removed: {output_dir}")
	except OSError:
	pass

	print("\n" + "=" * 70)
	print("DOWNLOAD COMPLETE")
	print("=" * 70)
	print(f"Total PDFs downloaded: {total_pdfs}")
	print(f"Output directory: {os.path.abspath(output_dir)}")

	# Run the downloader only when this file is executed as a script, not
	# when it is imported as a module.
	if __name__ == "__main__":
	main()
No results found