Created
May 29, 2025 21:43
-
-
Save me-suzy/167e23b7f1d7e950953cd3b393bc23d8 to your computer and use it in GitHub Desktop.
teleport pro 4346.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import deque
import os
import re
import time
from urllib.parse import urljoin, unquote, urlparse

import requests
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager
def setup_driver():
    """Create and return a headless Chrome WebDriver.

    The chromedriver binary is resolved (and downloaded if needed) by
    webdriver_manager, so no manual driver installation is required.
    """
    options = Options()
    options.add_argument("--headless")
    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )
def is_valid_url(url):
    """Return True for crawlable links: uyuyuuy pages inside the
    'secolul-20' section, excluding bare fragment anchors ('...#')."""
    if url.endswith('#'):
        return False
    parts = urlparse(url)
    return parts.netloc == 'uyuyuuy' and 'secolul-20' in parts.path.lower()
def clean_filename(filename):
    """Drop characters that are illegal in Windows file names."""
    forbidden = '<>:"/\\|?*'
    return ''.join(ch for ch in filename if ch not in forbidden)
def get_all_links(driver, base_url):
    """Load *base_url* in the browser and collect all valid section links.

    Returns a set of hrefs accepted by is_valid_url, or an empty list when
    the page fails to load within 10 seconds (callers only iterate the
    result, so the set/list mismatch is harmless).
    """
    print(f"Colectare link-uri de la: {base_url}")
    driver.get(base_url)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        # Give client-side scripts a moment to render link elements.
        time.sleep(2)
    except TimeoutException:
        print(f"Timeout la încărcarea paginii: {base_url}")
        return []

    anchors = driver.find_elements(By.TAG_NAME, "a")
    print(f"Elemente <a> găsite: {len(anchors)}")

    found = set()
    for anchor in anchors:
        href = anchor.get_attribute('href')
        if href and is_valid_url(href):
            print(f"Link găsit: {href}")
            found.add(href)
    print(f"Total link-uri valide găsite: {len(found)}")
    return found
def is_pdf_url(url):
    """Heuristic PDF test: the URL ends in '.pdf' or carries a
    'filename=' query parameter (dl.asp-style download links)."""
    lowered = url.lower()
    return lowered.endswith('.pdf') or 'filename=' in lowered
def extract_filename_from_url(url):
    """Extract a file name from a dl.asp-style download URL.

    Prefers the ``filename=`` query parameter, matched case-insensitively
    for consistency with is_pdf_url (which lowercases before testing for
    'filename='); previously 'Filename=' URLs passed is_pdf_url but fell
    through to the fallback here. Falls back to the last path segment with
    the query string stripped. The result is percent-decoded.
    """
    match = re.search(r'filename=([^&]+)', url, re.IGNORECASE)
    if match:
        return unquote(match.group(1))
    # Fallback: last path component, without any query string.
    return unquote(url.split('/')[-1].split('?')[0])
def download_pdf(url, folder):
    """Download one PDF from *url* into *folder* with a tqdm progress bar.

    Skips the download when the target file already exists. Returns True on
    success or skip, False on an HTTP error status or a network failure.

    Fixes vs. the original:
    - the skip/success messages printed the literal text "(unknown)"
      instead of the file name (apparently broken f-string interpolations);
    - the HTTP response is now closed deterministically via ``with``;
    - the status code is checked before reading response headers.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer': 'https://uyuyuuy/?iopl',
        'Accept': 'application/pdf,*/*'
    }
    try:
        filename = os.path.join(folder, clean_filename(extract_filename_from_url(url)))
        if os.path.exists(filename):
            print(f"Fișierul {filename} există deja. Skip descărcare.")
            return True
        print(f"Descărcare PDF: {url}")
        # Stream the body in chunks and close the connection when done.
        with requests.get(url, headers=headers, stream=True, timeout=30) as response:
            if response.status_code != 200:
                print(f"❌ Nu s-a putut descărca PDF-ul de la {url}. Cod de status: {response.status_code}")
                return False
            total_size = int(response.headers.get('content-length', 0))
            with open(filename, 'wb') as f, tqdm(
                desc=filename,
                total=total_size,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as progress_bar:
                for data in response.iter_content(chunk_size=8192):
                    progress_bar.update(f.write(data))
        print(f"✅ Descărcat cu succes: {filename}")
        return True
    except requests.RequestException as e:
        print(f"❌ Eroare la descărcarea {url}: {str(e)}")
        return False
def process_links(driver, base_url, folder):
    """Breadth-first crawl starting at *base_url*, downloading every PDF
    link discovered along the way into *folder*.

    Fixes vs. the original:
    - the frontier was a list drained with ``pop(0)`` (O(n) per pop); it is
      now a ``collections.deque`` with O(1) ``popleft``;
    - newly found pages were only filtered against *visited*, so the same
      URL could be enqueued many times; a ``queued`` mirror set now gives
      O(1) membership tests against the pending frontier too.
    """
    visited = set()
    queue = deque([base_url])
    queued = {base_url}  # mirror of the queue for O(1) membership checks
    downloaded_pdfs = set()
    # Known-good PDF attempted up front, before the crawl proper.
    specific_pdf = "https://uyuyuuy/reviste/secolul-20/dl.asp?filename=1961_nr-01-secolul-20-revista-de-literatura-universala.pdf"
    if is_pdf_url(specific_pdf):
        print(f"Încercare descărcare PDF specific: {specific_pdf}")
        download_pdf(specific_pdf, folder)
        downloaded_pdfs.add(specific_pdf)
    while queue:
        current_url = queue.popleft()
        if current_url in visited:
            continue
        visited.add(current_url)
        print(f"\n📄 Procesare pagină: {current_url}")
        links = get_all_links(driver, current_url)
        pdf_links = [link for link in links if is_pdf_url(link)]
        other_links = [link for link in links
                       if not is_pdf_url(link)
                       and link not in visited
                       and link not in queued]
        print(f"🔍 Link-uri PDF găsite: {len(pdf_links)}")
        for pdf in pdf_links:
            print(f" - {pdf}")
        for link in pdf_links:
            if link not in downloaded_pdfs:
                # Only mark as done on success so failures can be retried
                # if the link reappears on a later page.
                if download_pdf(link, folder):
                    downloaded_pdfs.add(link)
        queue.extend(other_links)
        queued.update(other_links)
        print(f"📊 Status: {len(downloaded_pdfs)} PDF-uri descărcate, {len(visited)} pagini vizitate, {len(queue)} pagini în așteptare")
        time.sleep(2)  # be polite to the server between page loads
def main():
    """Entry point: prepare the download folder, start the browser, run the
    crawl, and always quit the driver — even on an unexpected error."""
    base_url = "https://uyuyuuy/?iopl"
    download_folder = r"g:\Downloads2"
    # exist_ok avoids the check-then-create race of the original
    # `if not os.path.exists(...): os.makedirs(...)` pattern.
    os.makedirs(download_folder, exist_ok=True)
    driver = setup_driver()
    try:
        print(f"🚀 Începere procesare de la URL-ul: {base_url}")
        process_links(driver, base_url, download_folder)
        print("✅ Procesul de căutare și descărcare a fost finalizat.")
        print(f"📁 Fișierele au fost salvate în: {download_folder}")
    except Exception as e:
        # Top-level boundary: report and fall through to cleanup.
        print(f"❌ Eroare neașteptată: {str(e)}")
    finally:
        driver.quit()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment