Created
May 23, 2025 17:26
-
-
Save me-suzy/e1a12af5c7912f2d2ad7ab63ee7eb5c4 to your computer and use it in GitHub Desktop.
Sorteaza articole categorii in functie de data articolelor BEBE 2024 categories ordine descrescatoare
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
from datetime import datetime | |
def extract_date(text):
    """Return the date found in *text* as a ``datetime``.

    The first ``On <Month> <day>, <year>`` phrase in *text* is used. Two
    known month misspellings ("Aprlie", "Maylie") are repaired before the
    phrase is parsed with the ``'%B %d, %Y'`` format.

    Args:
        text: String to search (for example, the HTML of one article).

    Returns:
        The parsed ``datetime``, or ``datetime.min`` when no phrase is found
        or the phrase cannot be parsed (a warning is printed in that case).
    """
    found = re.search(r'On\s+([A-Za-z]+(?:lie)?\s+\d{1,2},\s+\d{4})', text)
    if found is None:
        return datetime.min

    raw_date = found.group(1)

    # Apply the known typo corrections one after another.
    candidate = raw_date
    for typo, fixed in (('Aprlie', 'April'), ('Maylie', 'May')):
        candidate = candidate.replace(typo, fixed)

    try:
        parsed = datetime.strptime(candidate, '%B %d, %Y')
    except ValueError:
        # Report the text as it appeared in the input, not the corrected form.
        print(f"Warning: Could not parse date '{raw_date}', using minimum date.")
        return datetime.min
    return parsed
def sort_articles(html_content, ascending=True):
    """Sort the articles inside the ARTICOL CATEGORIE section by date.

    The span delimited by ``<!-- ARTICOL CATEGORIE START -->`` and
    ``<!-- ARTICOL CATEGORIE FINAL -->`` is located. Every article block in
    it (from a ``<table width="N" border="0">`` tag up to the next empty
    ``<p class="text_obisnuit"></p>``) is extracted, and the section is
    rebuilt from those blocks ordered by ``extract_date``.

    Args:
        html_content: Full HTML document as a string.
        ascending: Oldest article first when true, newest first when false.

    Returns:
        The document with the section rewritten, or ``html_content``
        unchanged when the section markers are missing or the section holds
        no recognizable article block.
    """
    section_pattern = re.compile(
        r'(<!-- ARTICOL CATEGORIE START -->.*?<!-- ARTICOL CATEGORIE FINAL -->)',
        re.DOTALL,
    )
    match = section_pattern.search(html_content)
    if match is None:
        return html_content

    section_to_sort = match.group(1)

    # Extract the individual article blocks.
    article_pattern = re.compile(
        r'(<table width="\d+" border="0">.*?<p class="text_obisnuit"></p>)',
        re.DOTALL,
    )
    articles = article_pattern.findall(section_to_sort)
    if not articles:
        # Rebuilding from an empty list would replace the section with bare
        # markers and silently delete whatever it contained.
        return html_content

    # sorted() is stable, so articles with equal dates keep their relative order.
    sorted_articles = sorted(articles, key=extract_date, reverse=not ascending)

    # Only the matched article blocks are carried over; any other text that
    # sat between the markers is not part of the rebuilt section.
    sorted_section = (
        '<!-- ARTICOL CATEGORIE START -->\n'
        + '\n'.join(sorted_articles)
        + '\n<!-- ARTICOL CATEGORIE FINAL -->'
    )
    return html_content.replace(section_to_sort, sorted_section)
def process_file(file_path, ascending=True):
    """Sort the articles of one HTML file and write the result back in place.

    Args:
        file_path: Path of the HTML file; it is read and then overwritten.
        ascending: Sort direction, forwarded to ``sort_articles``.

    Any exception raised while reading, sorting or writing is caught and
    reported with ``print``; nothing is re-raised and nothing is returned.
    """
    try:
        # errors='ignore' drops bytes that are not valid UTF-8 on read, so
        # they are absent from the rewritten file as well.
        with open(file_path, encoding='utf-8', errors='ignore') as source:
            sorted_html = sort_articles(source.read(), ascending)

        with open(file_path, mode='w', encoding='utf-8') as target:
            target.write(sorted_html)

        print(f"Articolele au fost sortate cu succes în {file_path} (ordine {'crescătoare' if ascending else 'descrescătoare'})")
    except Exception as e:
        print(f"Eroare la procesarea fișierului {file_path}: {e}")
if __name__ == "__main__":
    # Directory that contains the HTML files to process.
    directory = r"d:\3\Input"

    if os.path.isdir(directory):
        # Sort in descending order (most recent date first).
        ascending_order = False

        # Every directory entry whose name ends in '.html' is processed in place.
        html_names = (name for name in os.listdir(directory) if name.endswith('.html'))
        for filename in html_names:
            process_file(os.path.join(directory, filename), ascending_order)

        # NOTE(review): the flattened source does not show whether this final
        # message belonged to the directory-exists branch or ran
        # unconditionally; it is kept in this branch - confirm.
        print("Procesarea tuturor fișierelor HTML a fost finalizată.")
    else:
        # The configured directory does not exist; report it and do nothing.
        print(f"Eroare: Directorul '{directory}' nu există.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment