Skip to content

Instantly share code, notes, and snippets.

@dps
Created July 3, 2024 21:16
Show Gist options
  • Save dps/df17228930e772621698f23cf6e23b1c to your computer and use it in GitHub Desktop.
Save dps/df17228930e772621698f23cf6e23b1c to your computer and use it in GitHub Desktop.
import os
from bs4 import BeautifulSoup, Comment
import re
# Elements removed outright (one pass per name, in this order).
_UNWANTED_TAGS = ('script', 'style', 'nav', 'img', 'svg', 'figure', 'form', 'button')

# Sectioning elements that are renamed to a plain <div>.
_SECTIONING_TAGS = ['header', 'main', 'article', 'section', 'footer', 'nav', 'aside']

# Typographic characters and their ASCII stand-ins. Every key is a single
# character, so one str.translate pass is equivalent to chained .replace() calls.
_ASCII_REPLACEMENTS = str.maketrans({
    '\u2019': "'",     # Right single quotation mark
    '\u2018': "'",     # Left single quotation mark
    '\u201a': ',',     # Single low-9 quotation mark
    '\u201c': '"',     # Left double quotation mark
    '\u201d': '"',     # Right double quotation mark
    '\u00ab': '"',     # Left-pointing double angle quotation mark
    '\u00bb': '"',     # Right-pointing double angle quotation mark
    '\u2039': "'",     # Single left-pointing angle quotation mark
    '\u203a': "'",     # Single right-pointing angle quotation mark
    '\u2014': '-',     # Em dash
    '\u2013': '-',     # En dash
    '\u2026': '...',   # Horizontal ellipsis
    # NOTE(review): the original replace call for this entry had a search
    # string that looked identical to its replacement (a no-op); a
    # non-breaking space is presumably what was intended -- confirm.
    '\u00a0': ' ',     # Non-breaking space
    '\u2022': '-',     # Bullet
    '\u2122': '(TM)',  # Trademark
    '\u00ae': '(R)',   # Registered trademark
    '\u00a9': '(C)',   # Copyright
    '\u20ac': 'EUR',   # Euro sign
    '\u00a3': 'GBP',   # Pound sign
    '\u00a5': 'JPY',   # Yen sign
    '\u00a7': 'S',     # Section sign
    '\u00b6': 'P',     # Pilcrow sign
    '\u2020': '+',     # Dagger
    '\u2021': '++',    # Double dagger
    '\u2030': 'o/oo',  # Per mille sign
})

# Everything outside printable ASCII, except tab / LF / CR: deleting line
# breaks would fuse words that are separated only by a newline and would
# flatten <pre> blocks.
_NON_ASCII_RE = re.compile(r'[^\t\n\r\x20-\x7E]')

# Matches a charset attribute naming UTF-8 in any letter case.
_UTF8_CHARSET_RE = re.compile(r'^\s*utf-8\s*$', re.IGNORECASE)


def _strip_unwanted_elements(soup):
    """Remove non-content elements, comments and CSS <link> tags from *soup* in place."""
    for tag_name in _UNWANTED_TAGS:
        for element in soup.find_all(tag_name):
            element.decompose()
    for comment in soup.find_all(string=lambda node: isinstance(node, Comment)):
        comment.extract()
    for link in soup.find_all('link', {'type': 'text/css'}):
        link.decompose()


def _to_ascii(html_str):
    """Return *html_str* with known symbols transliterated and other non-ASCII dropped."""
    return _NON_ASCII_RE.sub('', html_str.translate(_ASCII_REPLACEMENTS))


def _ensure_utf8_meta(soup):
    """Insert <meta charset="utf-8"> into *soup* unless a UTF-8 charset meta exists."""
    if soup.find('meta', attrs={'charset': _UTF8_CHARSET_RE}) is not None:
        return
    meta_tag = soup.new_tag('meta', charset='utf-8')
    if soup.head is not None:
        soup.head.insert(0, meta_tag)
        return
    # No <head> yet: create one, and attach it to <html> when that exists,
    # otherwise to the top of the document.
    head = soup.new_tag('head')
    head.insert(0, meta_tag)
    if soup.html is not None:
        soup.html.insert(0, head)
    else:
        soup.insert(0, head)


def clean_html(file_path: str) -> None:
    """Simplify the HTML file at *file_path* and overwrite it in place.

    Steps, in order:

    1. Parse the file (read as UTF-8) with the ``html.parser`` backend.
    2. Delete script, style, nav, img, svg, figure, form and button elements,
       all comments, and ``<link type="text/css">`` tags.
    3. Serialise the tree, replace common typographic characters with ASCII
       stand-ins, and drop every other character outside printable ASCII
       (tabs and line breaks are kept).
    4. Re-parse the result, rename sectioning elements to ``<div>`` and make
       sure a ``<meta charset="utf-8">`` tag is present.
    5. Write the document back to *file_path* as UTF-8.

    Args:
        file_path: Path of the HTML file to rewrite.

    Raises:
        OSError: If the file cannot be opened for reading or writing.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    _strip_unwanted_elements(soup)

    # The character clean-up works on the serialised markup, so the tree has
    # to be parsed again afterwards.
    soup = BeautifulSoup(_to_ascii(str(soup)), 'html.parser')

    for tag in soup.find_all(_SECTIONING_TAGS):
        tag.name = 'div'

    _ensure_utf8_meta(soup)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(str(soup))
def clean_directory(directory):
    """Run ``clean_html`` on every ``.html`` file found under *directory*.

    The tree is traversed with ``os.walk``; for each matching file the bare
    file name is printed before the file is cleaned.
    """
    for parent, _subdirs, names in os.walk(directory):
        html_names = (name for name in names if name.endswith('.html'))
        for name in html_names:
            print(f"clean {name}")
            clean_html(os.path.join(parent, name))
# Script entry: clean every .html file under the directory below, in place.
# Change the path to point at your own folder of HTML files.
clean_directory(directory='./situational-awareness.ai/')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment