Created October 4, 2025 18:18
6687668asll.py
import os
import time
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Configuration
base_url = "https://www.3253453.ro/"
download_folder = "g:\\Downloads2"  # Main download folder

# Create the main download folder if it does not exist
if not os.path.exists(download_folder):
    os.makedirs(download_folder)
    print(f"Created main download folder: {download_folder}")

# Extract the domain name for the subfolder (e.g. "3253453")
parsed_url = urlparse(base_url)
domain = parsed_url.netloc.replace('www.', '').split('.')[0]  # Cleanup: "3253453"
sub_folder = os.path.join(download_folder, domain)
os.makedirs(sub_folder, exist_ok=True)
print(f"Created subfolder: {sub_folder}")

# Download a single PDF to the given path
def download_pdf(url, file_path):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    print(f"Trying to download: {url}")
    try:
        response = requests.get(url, headers=headers, stream=True, timeout=10)
        response.raise_for_status()
        print(f"Connection succeeded for {url}, status: {response.status_code}")
        with open(file_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
        print(f"Downloaded successfully: {file_path}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")

# Find and follow links to pages and PDFs
def crawl_and_download(start_url):
    visited_urls = set()

    def crawl(url):
        if url in visited_urls or not url.startswith(base_url):
            return
        visited_urls.add(url)
        print(f"Visiting: {url}")
        try:
            response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}, timeout=10)
            response.raise_for_status()
            print(f"Request succeeded, status: {response.status_code}")
            soup = BeautifulSoup(response.content, 'html.parser')
            # Find and immediately download links to PDFs
            for link in soup.find_all('a', href=True):
                full_url = urljoin(url, link['href'])
                if full_url.lower().endswith('.pdf') and full_url.startswith(base_url):
                    print(f"Found PDF: {full_url}")
                    file_name = full_url.split('/')[-1]
                    file_path = os.path.join(sub_folder, file_name)  # Save into the subfolder
                    download_pdf(full_url, file_path)
            # Find links to other pages/subdirectories and keep crawling
            for link in soup.find_all('a', href=True):
                next_url = urljoin(url, link['href'])
                if next_url.startswith(base_url) and not next_url.lower().endswith('.pdf'):
                    crawl(next_url)
            time.sleep(2)  # Delay to avoid being blocked
        except requests.exceptions.RequestException as e:
            print(f"Error while visiting {url}: {e}")

    # Start crawling from the initial URL
    print(f"Starting crawl from: {start_url}")
    crawl(start_url)
    print("Download finished!")

# Run the script
if __name__ == "__main__":
    crawl_and_download(base_url)
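To try the script, install the two third-party dependencies (everything else is standard library) with pip install requests beautifulsoup4, then run python 6687668asll.py. Note that base_url and download_folder are hard-coded at the top of the file, so adjust them before pointing the crawler at a different site or drive.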