Created
July 3, 2024 21:16
-
-
Save dps/df17228930e772621698f23cf6e23b1c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from bs4 import BeautifulSoup, Comment | |
import re | |
# Typographic characters and the ASCII text that stands in for each of them.
_ASCII_REPLACEMENTS = {
    '\u00a0': ' ',     # No-break space (would otherwise be deleted, gluing words together)
    '\u2026': '...',   # Horizontal ellipsis
    '\u2014': '-',     # Em dash
    '\u2013': '-',     # En dash
    '\u2022': '-',     # Bullet
    '\u2122': '(TM)',  # Trademark
    '\u00ae': '(R)',   # Registered trademark
    '\u00a9': '(C)',   # Copyright
    '\u2018': "'",     # Left single quote
    '\u2019': "'",     # Right single quote / apostrophe
    '\u201a': ',',     # Single low-9 quotation mark
    '\u201c': '"',     # Left double quotation mark
    '\u201d': '"',     # Right double quotation mark
    '\u00ab': '"',     # Left-pointing double angle quotation mark
    '\u00bb': '"',     # Right-pointing double angle quotation mark
    '\u2039': "'",     # Single left-pointing angle quotation mark
    '\u203a': "'",     # Single right-pointing angle quotation mark
    '\u20ac': 'EUR',   # Euro sign
    '\u00a3': 'GBP',   # Pound sign
    '\u00a5': 'JPY',   # Yen sign
    '\u00a7': 'S',     # Section sign
    '\u00b6': 'P',     # Pilcrow sign
    '\u2020': '+',     # Dagger
    '\u2021': '++',    # Double dagger
    '\u2030': 'o/oo',  # Per mille sign
}
_ASCII_TABLE = str.maketrans(_ASCII_REPLACEMENTS)

# Anything outside printable ASCII, except tab / newline / carriage return:
# those are kept so that words separated only by a line break are not fused
# and <pre> blocks keep their layout.
_NON_ASCII_RE = re.compile(r'[^\x20-\x7E\t\n\r]')

# Matches a charset attribute naming UTF-8 in any letter case ("utf-8", "UTF-8", "utf8").
_UTF8_CHARSET_RE = re.compile(r'^\s*utf-?8\s*$', re.IGNORECASE)

# Elements dropped from the document together with everything inside them.
_REMOVED_TAGS = ('script', 'style', 'nav', 'img', 'svg', 'figure', 'form', 'button')

# Sectioning elements that are renamed to plain <div>.
_SECTIONING_TAGS = ['header', 'main', 'article', 'section', 'footer', 'nav', 'aside']


def _to_ascii(html_str):
    """Return html_str with typographic characters replaced and other non-ASCII removed."""
    return _NON_ASCII_RE.sub('', html_str.translate(_ASCII_TABLE))


def clean_html(file_path):
    """Strip an HTML file down to ASCII-only, mostly-textual markup, in place.

    The file at ``file_path`` is read as UTF-8, cleaned, and overwritten.
    Cleaning consists of:

    * removing script, style, nav, img, svg, figure, form and button elements,
      HTML comments, and ``<link type="text/css">`` tags;
    * replacing common typographic characters (curly quotes, dashes, currency
      signs, ...) with ASCII stand-ins and deleting any other character that
      is not printable ASCII or tab/newline/carriage return;
    * renaming sectioning elements (header, main, article, ...) to ``div``;
    * making sure a ``<meta charset="utf-8">`` tag exists, creating ``<head>``
      when the document has none.

    NOTE(review): the character replacement runs on the serialized markup, so
    curly quotes inside attribute values are turned into straight quotes as
    well - confirm that is acceptable for the input documents.

    Args:
        file_path: Path of the HTML file to rewrite.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    # One pass per tag name: each find_all() then sees the tree as left by the
    # previous removals and never returns elements that were already destroyed
    # along with a removed ancestor of a different kind.
    for tag_name in _REMOVED_TAGS:
        for tag in soup.find_all(tag_name):
            tag.decompose()

    # Remove comments
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Remove <link> tags with type="text/css"
    for link in soup.find_all('link', {'type': 'text/css'}):
        link.decompose()

    # Normalize the serialized document to ASCII, then parse it again so the
    # remaining steps work on the cleaned markup.
    soup = BeautifulSoup(_to_ascii(str(soup)), 'html.parser')

    for tag in soup.find_all(_SECTIONING_TAGS):
        tag.name = 'div'

    # Add UTF-8 meta tag if not present (the charset value is compared
    # case-insensitively so an existing "UTF-8" is not duplicated).
    if not soup.find('meta', attrs={'charset': _UTF8_CHARSET_RE}):
        meta_tag = soup.new_tag('meta', charset='utf-8')
        if soup.head:
            soup.head.insert(0, meta_tag)
        else:
            # Create a head tag if it doesn't exist
            head = soup.new_tag('head')
            head.insert(0, meta_tag)
            if soup.html:
                soup.html.insert(0, head)
            else:
                soup.insert(0, head)

    # Write the cleaned HTML back to the file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(str(soup))
def clean_directory(directory):
    """Run clean_html on every ``.html`` file found under ``directory``.

    The tree is walked recursively with os.walk; each matching file name is
    printed before the file is cleaned.

    Args:
        directory: Root of the directory tree to scan.
    """
    for dirpath, _subdirs, filenames in os.walk(directory):
        html_names = (name for name in filenames if name.endswith('.html'))
        for name in html_names:
            print(f"clean {name}")
            clean_html(os.path.join(dirpath, name))
# Root of the directory tree to clean when this script runs; point it at the
# folder that holds your HTML files.
TARGET_DIRECTORY = './situational-awareness.ai/'
clean_directory(TARGET_DIRECTORY)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment