Intersection Principal NOU 2.py
import os
import re
from collections import defaultdict

# Define directories
base_dir = r'e:\Carte\BB\17 - Site Leadership\Principal 2022'
ro_dir = os.path.join(base_dir, 'ro')
en_dir = os.path.join(base_dir, 'en')
# Function to extract canonical from HTML content
def extract_canonical(content):
    match = re.search(r'<link rel="canonical" href="https://neculaifantanaru\.com/(en/)?([^"]+)" />', content)
    if match:
        return (match.group(1) or '') + match.group(2)
    return None
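
# Example (hypothetical markup, a minimal sketch of what the regex expects):
#   extract_canonical('<link rel="canonical" href="https://neculaifantanaru.com/en/some-article.html" />')
#   -> 'en/some-article.html'  (a RO page without the "en/" prefix yields 'some-article.html')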
# Function to extract flags section
def extract_flags(content):
    flags_start = content.find('<!-- FLAGS_1 -->')
    flags_end = content.find('<!-- FLAGS -->', flags_start)
    if flags_start != -1 and flags_end != -1:
        return content[flags_start:flags_end + len('<!-- FLAGS -->')]
    return None
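
# The FLAGS block is assumed to look roughly like this (illustrative sketch, not copied from the site):
#   <!-- FLAGS_1 -->
#   <li><a cunt_code="+40" href="https://neculaifantanaru.com/article.html">...</a></li>
#   <li><a cunt_code="+1" href="https://neculaifantanaru.com/en/article.html">...</a></li>
#   <!-- FLAGS -->
# extract_flags() returns everything from the opening marker through the closing marker, inclusive.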
# Function to replace canonical in content
def replace_canonical(content, new_href):
    return re.sub(r'<link rel="canonical" href="[^"]+" />', f'<link rel="canonical" href="{new_href}" />', content)
# Function to replace a specific flag link in flags (improved regex for \+1 and +1)
def replace_flag_link(flags, code, new_href):
    # Handle both \+1 and +1, \+40 and +40
    if code == r'\+1':
        pattern = r'<li><a cunt_code="(\\?\+1)" href="[^"]+">'
    elif code == r'\+40':
        pattern = r'<li><a cunt_code="(\\?\+40)" href="[^"]+">'
    else:
        pattern = rf'<li><a cunt_code="{code}" href="[^"]+">'
    replacement = f'<li><a cunt_code="{code.replace(chr(92), "")}" href="{new_href}">'  # Remove backslash from output
    return re.sub(pattern, replacement, flags, count=1)
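
# Example (hypothetical FLAGS fragment, a minimal sketch):
#   replace_flag_link('<li><a cunt_code="+1" href="https://neculaifantanaru.com/en/old.html">',
#                     r'\+1', 'https://neculaifantanaru.com/en/new.html')
#   -> '<li><a cunt_code="+1" href="https://neculaifantanaru.com/en/new.html">'
#   Only the first matching "+1" entry is rewritten, and the backslash never appears in the output.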
# Function to update file
def update_file(file_path, new_content):
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(new_content)
# Function to extract flag link (improved to handle \+1 and +1)
def extract_flag_link(flags, code):
    if code == '+1':
        match = re.search(r'<li><a cunt_code="(\\?\+1)" href="([^"]+)"', flags)
    elif code == '+40':
        match = re.search(r'<li><a cunt_code="(\\?\+40)" href="([^"]+)"', flags)
    else:
        # Escape the code and keep two groups so match.group(2) is always the href
        match = re.search(rf'<li><a cunt_code="({re.escape(code)})" href="([^"]+)"', flags)
    return match.group(2) if match else None
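
# Example (same hypothetical fragment):
#   extract_flag_link('<li><a cunt_code="+40" href="https://neculaifantanaru.com/article.html">', '+40')
#   -> 'https://neculaifantanaru.com/article.html'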
# Function to fix double .html.html
def fix_double_html(url):
    return url.replace('.html.html', '.html')
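
# Example: fix_double_html('https://neculaifantanaru.com/en/article.html.html')
#          -> 'https://neculaifantanaru.com/en/article.html'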
# STEP 1: canonical = file name
print('================================================================================')
print('STEP 1: CANONICAL = FILE NAME')
print('============================================================')
canonical_fixed_ro = 0
canonical_fixed_en = 0

# Process RO files
ro_files = [f for f in os.listdir(ro_dir) if f.endswith('.html')]
for filename in ro_files:
    file_path = os.path.join(ro_dir, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    canonical = extract_canonical(content)
    expected_canonical = filename[:-5]  # without .html, case-sensitive
    expected_href = f'https://neculaifantanaru.com/{expected_canonical}.html'
    if canonical != expected_canonical + '.html':
        new_content = replace_canonical(content, expected_href)
        update_file(file_path, new_content)
        canonical_fixed_ro += 1
        print(f'Fixed RO: {filename} canonical → {expected_canonical}.html')

# Process EN files
en_files = [f for f in os.listdir(en_dir) if f.endswith('.html')]
for filename in en_files:
    file_path = os.path.join(en_dir, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    canonical = extract_canonical(content)
    expected_canonical = f'en/{filename[:-5]}.html'  # en/ + name without .html
    expected_href = f'https://neculaifantanaru.com/{expected_canonical}'
    if canonical != expected_canonical:
        new_content = replace_canonical(content, expected_href)
        update_file(file_path, new_content)
        canonical_fixed_en += 1
        print(f'Fixed EN: {filename} canonical → {expected_canonical}')

print(f'✅ Canonicals fixed: RO={canonical_fixed_ro}, EN={canonical_fixed_en}, TOTAL={canonical_fixed_ro + canonical_fixed_en}')
# STEP 2: FLAGS = canonical (within the same file)
print('\nSTEP 2: FLAGS = CANONICAL (within the same file)')
print('============================================================')
flags_fixed_ro = 0
flags_fixed_en = 0

# Process RO files for their own flag (+40)
for filename in ro_files:
    file_path = os.path.join(ro_dir, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    canonical = extract_canonical(content)
    flags = extract_flags(content)
    if flags and canonical:
        current_link = extract_flag_link(flags, '+40')
        expected_href = f'https://neculaifantanaru.com/{filename[:-5]}.html'
        if current_link and current_link != expected_href:
            new_flags = replace_flag_link(flags, r'\+40', expected_href)
            new_content = content.replace(flags, new_flags)
            update_file(file_path, new_content)
            flags_fixed_ro += 1
            print(f'Fixed RO own flag: {filename} → {expected_href}')

# Process EN files for their own flag (+1)
for filename in en_files:
    file_path = os.path.join(en_dir, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    canonical = extract_canonical(content)
    flags = extract_flags(content)
    if flags and canonical:
        current_link = extract_flag_link(flags, '+1')
        expected_href = f'https://neculaifantanaru.com/en/{filename[:-5]}.html'
        if current_link:
            # Fix double .html.html
            fixed_link = fix_double_html(current_link)
            # A doubled ".html.html" counts as wrong even when it normalizes to the expected URL
            if fixed_link != expected_href or '.html.html' in current_link:
                new_flags = replace_flag_link(flags, r'\+1', expected_href)
                new_content = content.replace(flags, new_flags)
                update_file(file_path, new_content)
                flags_fixed_en += 1
                if '.html.html' in current_link:
                    print(f'Fixed EN own flag (doubled .html): {filename} → {expected_href}')
                else:
                    print(f'Fixed EN own flag: {filename} → {expected_href}')

print(f'✅ FLAGS fixed: RO={flags_fixed_ro}, EN={flags_fixed_en}, TOTAL={flags_fixed_ro + flags_fixed_en}')
# STEP 3: synchronize the RO ↔ EN cross-references
print('\nSTEP 3: SYNCHRONIZE RO ↔ EN CROSS-REFERENCES')
print('============================================================')

# Build file sets for validation
ro_files_set = set(ro_files)
en_files_set = set(en_files)

# Build mappings (strict validation)
ro_to_en_map = {}
en_to_ro_map = {}
invalid_links = []

# First pass: try to match based on existing flags (strict validation)
for filename in ro_files:
    file_path = os.path.join(ro_dir, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    flags = extract_flags(content)
    if flags:
        en_link = extract_flag_link(flags, '+1')
        if en_link:
            # Extract the file name from the URL
            en_match = re.search(r'https://neculaifantanaru\.com/en/([^"]+)\.html', en_link)
            if en_match:
                en_name = en_match.group(1) + '.html'
                # Check that the EN file actually exists
                if en_name in en_files_set:
                    ro_to_en_map[filename] = en_name
                else:
                    invalid_links.append(f"RO {filename}: link to non-existent EN file {en_name}")
for filename in en_files:
    file_path = os.path.join(en_dir, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    flags = extract_flags(content)
    if flags:
        ro_link = extract_flag_link(flags, '+40')
        if ro_link:
            # Extract the file name from the URL
            ro_match = re.search(r'https://neculaifantanaru\.com/([^"]+)\.html', ro_link)
            if ro_match:
                ro_name = ro_match.group(1) + '.html'
                # Check that the RO file actually exists
                if ro_name in ro_files_set:
                    en_to_ro_map[filename] = ro_name
                else:
                    invalid_links.append(f"EN {filename}: link to non-existent RO file {ro_name}")
# Find bidirectional matches
bidirectional_pairs = []
for ro_file, en_file in ro_to_en_map.items():
    if en_file in en_to_ro_map and en_to_ro_map[en_file] == ro_file:
        bidirectional_pairs.append((ro_file, en_file))

# Find files with no common links (no valid counterpart in either direction)
orphaned_files = []
for ro_file in ro_files:
    if ro_file not in ro_to_en_map:
        # Check whether any EN file points to this RO file
        pointing_ens = [en_f for en_f, ro_f in en_to_ro_map.items() if ro_f == ro_file]
        if not pointing_ens:
            orphaned_files.append(f"RO {ro_file}: no valid counterpart")
for en_file in en_files:
    if en_file not in en_to_ro_map:
        # Check whether any RO file points to this EN file
        pointing_ros = [ro_f for ro_f, en_f in ro_to_en_map.items() if en_f == en_file]
        if not pointing_ros:
            orphaned_files.append(f"EN {en_file}: no valid counterpart")

# Fallback: match by file-name similarity (case-insensitive)
unmatched_ro = [f for f in ro_files if f not in [pair[0] for pair in bidirectional_pairs]]
unmatched_en = [f for f in en_files if f not in [pair[1] for pair in bidirectional_pairs]]
similarity_pairs = []
for ro_filename in unmatched_ro:
    ro_base = ro_filename[:-5].lower()
    for en_filename in unmatched_en:
        en_base = en_filename[:-5].lower()
        if ro_base == en_base or ro_base.replace('-', ' ') == en_base.replace('-', ' '):
            if en_filename not in [pair[1] for pair in similarity_pairs]:
                similarity_pairs.append((ro_filename, en_filename))
                break

all_pairs = bidirectional_pairs + similarity_pairs
print(f'Found {len(bidirectional_pairs)} bidirectional pairs and {len(similarity_pairs)} pairs by name similarity')
# Correct the cross-references
cross_fixed = 0
for ro_filename, en_filename in all_pairs:
    # Correct the RO file: point +1 at the mapped EN file
    file_path = os.path.join(ro_dir, ro_filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    flags = extract_flags(content)
    if flags:
        current_link = extract_flag_link(flags, '+1')
        expected_href = f'https://neculaifantanaru.com/en/{en_filename[:-5]}.html'
        if current_link != expected_href:
            new_flags = replace_flag_link(flags, r'\+1', expected_href)
            new_content = content.replace(flags, new_flags)
            update_file(file_path, new_content)
            cross_fixed += 1
            print(f'Fixed RO {ro_filename}: EN link → {en_filename}')

    # Correct the EN file: point +40 at the mapped RO file
    file_path = os.path.join(en_dir, en_filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    flags = extract_flags(content)
    if flags:
        current_link = extract_flag_link(flags, '+40')
        expected_href = f'https://neculaifantanaru.com/{ro_filename[:-5]}.html'
        if current_link != expected_href:
            new_flags = replace_flag_link(flags, r'\+40', expected_href)
            new_content = content.replace(flags, new_flags)
            update_file(file_path, new_content)
            cross_fixed += 1
            print(f'Fixed EN {en_filename}: RO link → {ro_filename}')

print(f'✅ Cross-references fixed: {cross_fixed}')
# Report invalid links and files without counterparts
print('\n🚨 FILES WITH PROBLEMS:')
print('============================================================')
if invalid_links:
    print('Links to non-existent files:')
    for link in invalid_links:
        print(f'  {link}')

final_unmatched_ro = [f for f in ro_files if f not in [pair[0] for pair in all_pairs]]
final_unmatched_en = [f for f in en_files if f not in [pair[1] for pair in all_pairs]]
if final_unmatched_ro or final_unmatched_en:
    print('\nFiles without valid counterparts:')
    for ro_file in final_unmatched_ro:
        print(f'  RO {ro_file}: no valid EN counterpart')
    for en_file in final_unmatched_en:
        print(f'  EN {en_file}: no valid RO counterpart')
if not invalid_links and not final_unmatched_ro and not final_unmatched_en:
    print('✅ All files have valid counterparts!')
# Final results
print('\n================================================================================')
print('FINAL RESULTS')
print('================================================================================')
print(f'Step 1 - canonicals fixed: {canonical_fixed_ro + canonical_fixed_en}')
print(f'Step 2 - FLAGS → canonical: {flags_fixed_ro + flags_fixed_en}')
print(f'Step 3 - cross-references: {cross_fixed}')
print(f'🎉 TOTAL: {canonical_fixed_ro + canonical_fixed_en + flags_fixed_ro + flags_fixed_en + cross_fixed}')