Skip to content

Instantly share code, notes, and snippets.

@me-suzy
Created May 29, 2025 21:43
Show Gist options
  • Save me-suzy/167e23b7f1d7e950953cd3b393bc23d8 to your computer and use it in GitHub Desktop.
Save me-suzy/167e23b7f1d7e950953cd3b393bc23d8 to your computer and use it in GitHub Desktop.
teleport pro 4346.py
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urljoin, unquote, urlparse
import os
import requests
from tqdm import tqdm
import re
import time
def setup_driver():
    """Build and return a headless Chrome WebDriver.

    The matching chromedriver binary is resolved (and cached) by
    webdriver-manager, so no manual driver install is needed.
    """
    opts = Options()
    opts.add_argument("--headless")
    driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path), options=opts)
def is_valid_url(url):
    """Accept only links on host 'uyuyuuy' inside the 'secolul-20' section.

    Bare fragment links (ending in '#') are rejected so the crawler does
    not revisit the same page under an anchor alias.
    """
    if url.endswith('#'):
        return False
    parts = urlparse(url)
    if parts.netloc != 'uyuyuuy':
        return False
    return 'secolul-20' in parts.path.lower()
def clean_filename(filename):
    """Strip the characters Windows forbids in file names (< > : " / \\ | ? *)."""
    return filename.translate(str.maketrans('', '', '<>:"/\\|?*'))
def get_all_links(driver, base_url):
    """Load *base_url* in *driver* and collect every valid section link.

    Waits up to 10 s for the <body> element, then sleeps briefly so
    JS-inserted anchors have time to render before scraping.

    Returns:
        set[str]: de-duplicated hrefs accepted by ``is_valid_url``.
        An empty set when the page fails to load within the timeout
        (was ``[]`` — fixed so the return type is consistent with the
        success path).
    """
    print(f"Colectare link-uri de la: {base_url}")
    driver.get(base_url)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        time.sleep(2)  # give client-side scripts a moment to inject links
    except TimeoutException:
        print(f"Timeout la încărcarea paginii: {base_url}")
        return set()
    links = set()
    elements = driver.find_elements(By.TAG_NAME, "a")
    print(f"Elemente <a> găsite: {len(elements)}")
    for element in elements:
        href = element.get_attribute('href')
        if href and is_valid_url(href):
            print(f"Link găsit: {href}")
            links.add(href)
    print(f"Total link-uri valide găsite: {len(links)}")
    return links
def is_pdf_url(url):
    """True when the URL is a direct .pdf link or a download link carrying a filename= parameter."""
    lowered = url.lower()
    return lowered.endswith('.pdf') or 'filename=' in lowered
def extract_filename_from_url(url):
    """Derive a file name from a download URL.

    Prefers the ``filename=`` query parameter (as used by dl.asp links);
    otherwise falls back to the last path segment with any query string
    removed. The result is percent-decoded either way.
    """
    match = re.search(r'filename=([^&]+)', url)
    if match:
        return unquote(match.group(1))
    tail = url.rsplit('/', 1)[-1]
    return unquote(tail.partition('?')[0])
def download_pdf(url, folder):
    """Download one PDF from *url* into *folder*, showing a tqdm progress bar.

    Skips the download when the target file already exists.

    Returns:
        bool: True on success (or skip), False on HTTP error / request failure.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer': 'https://uyuyuuy/?iopl',
        'Accept': 'application/pdf,*/*'
    }
    try:
        filename = os.path.join(folder, clean_filename(extract_filename_from_url(url)))
        if os.path.exists(filename):
            # Fixed garbled "(unknown)" placeholder: report the actual path.
            print(f"Fișierul {filename} există deja. Skip descărcare.")
            return True
        print(f"Descărcare PDF: {url}")
        # `with` closes the streamed connection even on error (was leaked).
        with requests.get(url, headers=headers, stream=True, timeout=30) as response:
            if response.status_code != 200:
                print(f"❌ Nu s-a putut descărca PDF-ul de la {url}. Cod de status: {response.status_code}")
                return False
            total_size = int(response.headers.get('content-length', 0))
            with open(filename, 'wb') as f, tqdm(
                desc=filename,
                total=total_size,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as progress_bar:
                for data in response.iter_content(chunk_size=8192):
                    progress_bar.update(f.write(data))
        # Fixed garbled "(unknown)" placeholder: report the actual path.
        print(f"✅ Descărcat cu succes: {filename}")
        return True
    except requests.RequestException as e:
        print(f"❌ Eroare la descărcarea {url}: {str(e)}")
        return False
def process_links(driver, base_url, folder):
    """Breadth-first crawl from *base_url*, downloading every PDF link found.

    Non-PDF section links are queued for later visits; each PDF is
    downloaded at most once. A known seed PDF is attempted up front.
    Fixed: new links are now also checked against the pending queue, so
    ``to_visit`` no longer grows with duplicates (the old code only
    filtered against ``visited``).
    """
    visited = set()
    to_visit = [base_url]
    queued = {base_url}  # mirrors to_visit for O(1) duplicate checks
    downloaded_pdfs = set()
    specific_pdf = "https://uyuyuuy/reviste/secolul-20/dl.asp?filename=1961_nr-01-secolul-20-revista-de-literatura-universala.pdf"
    if is_pdf_url(specific_pdf):
        print(f"Încercare descărcare PDF specific: {specific_pdf}")
        download_pdf(specific_pdf, folder)
        downloaded_pdfs.add(specific_pdf)
    while to_visit:
        current_url = to_visit.pop(0)
        if current_url in visited:
            continue
        visited.add(current_url)
        print(f"\n📄 Procesare pagină: {current_url}")
        links = get_all_links(driver, current_url)
        pdf_links = [link for link in links if is_pdf_url(link)]
        other_links = [link for link in links
                       if not is_pdf_url(link)
                       and link not in visited
                       and link not in queued]
        print(f"🔍 Link-uri PDF găsite: {len(pdf_links)}")
        for pdf in pdf_links:
            print(f" - {pdf}")
        for link in pdf_links:
            if link not in downloaded_pdfs:
                if download_pdf(link, folder):
                    downloaded_pdfs.add(link)
        to_visit.extend(other_links)
        queued.update(other_links)
        print(f"📊 Status: {len(downloaded_pdfs)} PDF-uri descărcate, {len(visited)} pagini vizitate, {len(to_visit)} pagini în așteptare")
        time.sleep(2)  # be polite to the server between page loads
def main():
    """Entry point: set up the driver, crawl, and always quit the browser.

    Downloads are saved under ``g:\\Downloads2`` (created if missing).
    """
    base_url = "https://uyuyuuy/?iopl"
    download_folder = r"g:\Downloads2"
    # exist_ok avoids the check-then-create race of the old exists()+makedirs().
    os.makedirs(download_folder, exist_ok=True)
    driver = setup_driver()
    try:
        print(f"🚀 Începere procesare de la URL-ul: {base_url}")
        process_links(driver, base_url, download_folder)
        print("✅ Procesul de căutare și descărcare a fost finalizat.")
        print(f"📁 Fișierele au fost salvate în: {download_folder}")
    except Exception as e:
        print(f"❌ Eroare neașteptată: {str(e)}")
    finally:
        driver.quit()  # always release the browser, even on failure


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment