Created
September 3, 2025 09:29
-
-
Save me-suzy/382bca569881cfc4b79a068d6c5d5b83 to your computer and use it in GitHub Desktop.
Find and replace 234.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re
def replace_headers_in_html(file_path):
    """Normalize the leading article heading of one HTML file in place.

    Two rewrites are applied, in order, to the markup that directly follows
    ``<div itemprop="articleBody">``:

    1. ``<!-- SASA-1 -->`` followed by ``<h3 class="text_obisnuit2">X</h3>``
       becomes ``<!-- SASA-1 -->`` followed by
       ``<h2 class="text_obisnuit2"><em>X</em></h2>``.
    2. ``<h2 class="text_obisnuit2"><em>X</em></h2>`` with no marker comment
       in front of it gets ``<!-- SASA-1 -->`` inserted before it.

    In both cases the whitespace between the div, the marker comment and the
    heading is rewritten to one fixed layout. The file is written back only
    when at least one substitution was made; otherwise a "no matches" message
    and the first 500 characters of the file are printed.

    Args:
        file_path: Path of the HTML file to process. It is read and written
            as UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Both cases produce exactly the same markup, so one template serves both.
    # "\n" and "\1" are expanded by re.subn, not by Python (raw string).
    replacement = r'<div itemprop="articleBody">\n\n<!-- SASA-1 -->\n <h2 class="text_obisnuit2"><em>\1</em></h2>'

    # Case 1: marker comment + <h3> heading -> marker comment + <h2><em> heading.
    # \s* tolerates any whitespace between the pieces; DOTALL lets the
    # non-greedy group capture heading text that spans several lines.
    pattern1 = r'<div itemprop="articleBody">\s*<!--\s*SASA-1\s*-->\s*<h3 class="text_obisnuit2">(.*?)</h3>'
    content, count1 = re.subn(pattern1, replacement, content, flags=re.DOTALL)

    # Case 2: <h2><em> heading directly after the div, without the marker.
    # Output of case 1 cannot match here because the marker comment sits
    # between the div and the heading, so no heading is rewritten twice.
    pattern2 = r'<div itemprop="articleBody">\s*<h2 class="text_obisnuit2"><em>(.*?)</em></h2>'
    content, count2 = re.subn(pattern2, replacement, content, flags=re.DOTALL)

    if count1 == 0 and count2 == 0:
        print(f"No matches found in {file_path}")
        # Show the start of the file to help diagnose why nothing matched.
        snippet = content[:500]  # First 500 characters
        print(f"Content snippet: {snippet}")
        return

    # Something changed: write the modified content back to the same file.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)
    print(f"Replaced {count1} occurrence(s) of pattern 1 and {count2} occurrence(s) of pattern 2 in {file_path}")
def process_html_files(directory):
    """Run replace_headers_in_html on every ``.html`` file under *directory*.

    The tree is traversed recursively with ``os.walk``. Each matching path is
    printed before it is processed. The extension test is case-sensitive, so
    only names ending in lowercase ``.html`` are picked up.

    Args:
        directory: Root folder to search.
    """
    for root, _, files in os.walk(directory):
        for filename in files:
            if not filename.endswith('.html'):
                continue
            file_path = os.path.join(root, filename)
            print(f"Processing {file_path}")
            replace_headers_in_html(file_path)
# Root folder whose .html files are rewritten when this file is run as a script.
directory = r"E:\Carte\BB\17 - Site Leadership\Principal 2022\en"

# Start the bulk rewrite only on direct execution, so importing this module
# does not modify files on disk as a side effect.
if __name__ == "__main__":
    process_html_files(directory)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment