dps · July 3, 2024 21:16
diff --git a/clean.py b/clean.py
 import os
 from bs4 import BeautifulSoup, Comment
 import re


 def clean_html(file_path):
    """Strip non-content markup from an HTML file and rewrite it as ASCII, in place.

    The file at ``file_path`` is parsed with BeautifulSoup's ``html.parser``
    and then:

    1. ``script``, ``style``, ``nav``, ``img``, ``svg``, ``figure``, ``form``
       and ``button`` elements, comments, and ``<link type="text/css">`` tags
       are removed;
    2. typographic characters are replaced by ASCII look-alikes and every
       other non-ASCII or control character is dropped (tab, LF and CR are
       kept so words separated only by a line break are not glued together);
    3. ``header``, ``main``, ``article``, ``section``, ``footer``, ``nav`` and
       ``aside`` elements are renamed to ``div``;
    4. a ``<meta charset="utf-8">`` tag is added unless one already exists.

    Args:
        file_path: Path of the HTML file; it is read and overwritten as UTF-8.

    Returns:
        None. The cleaned document is written back to ``file_path``.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        soup = BeautifulSoup(source, 'html.parser')

    _strip_unwanted_markup(soup)

    # Serialise, clean the text, then re-parse so the remaining steps operate
    # on the ASCII-only document.
    soup = BeautifulSoup(_to_ascii(str(soup)), 'html.parser')

    for tag in soup.find_all(_SECTIONING_TAGS):
        tag.name = 'div'

    _ensure_utf8_meta(soup)

    with open(file_path, 'w', encoding='utf-8') as target:
        target.write(str(soup))


 # Elements removed outright, in removal order.
 _REMOVED_TAGS = ['script', 'style', 'nav', 'img', 'svg', 'figure', 'form', 'button']

 # Elements renamed to a plain <div>.
 _SECTIONING_TAGS = ['header', 'main', 'article', 'section', 'footer', 'nav', 'aside']

 # Single-pass transliteration table. Keys are written as escapes so that
 # invisible characters (notably the no-break space) cannot be silently
 # flattened by an editor or a copy/paste.
 _ASCII_TRANSLATION = str.maketrans({
    '\u00a0': ' ',     # No-break space
    '\u2019': "'",     # Right single quotation mark
    '\u2018': "'",     # Left single quotation mark
    '\u201a': ',',     # Single low-9 quotation mark
    '\u201c': '"',     # Left double quotation mark
    '\u201d': '"',     # Right double quotation mark
    '\u00ab': '"',     # Left-pointing double angle quotation mark
    '\u00bb': '"',     # Right-pointing double angle quotation mark
    '\u2039': "'",     # Single left-pointing angle quotation mark
    '\u203a': "'",     # Single right-pointing angle quotation mark
    '\u2014': '-',     # Em dash
    '\u2013': '-',     # En dash
    '\u2022': '-',     # Bullet
    '\u2026': '...',   # Horizontal ellipsis
    '\u2122': '(TM)',  # Trademark
    '\u00ae': '(R)',   # Registered trademark
    '\u00a9': '(C)',   # Copyright
    '\u20ac': 'EUR',   # Euro sign
    '\u00a3': 'GBP',   # Pound sign
    '\u00a5': 'JPY',   # Yen sign
    '\u00a7': 'S',     # Section sign
    '\u00b6': 'P',     # Pilcrow sign
    '\u2020': '+',     # Dagger
    '\u2021': '++',    # Double dagger
    '\u2030': 'o/oo',  # Per mille sign
 })

 # Everything outside printable ASCII, except tab / LF / CR: those are
 # significant whitespace in HTML text and must survive the clean-up.
 _NON_ASCII_RE = re.compile(r'[^\t\n\r\x20-\x7E]')


 def _strip_unwanted_markup(soup):
    """Remove non-content elements, comments and CSS links from *soup* in place."""
    # One query per tag name, so elements nested in a later-removed parent
    # (e.g. an <img> inside a <figure>) are gone before the parent is destroyed.
    for tag_name in _REMOVED_TAGS:
        for element in soup.find_all(tag_name):
            element.decompose()

    # ``find_all(string=...)`` is the current spelling of the deprecated
    # ``findAll(text=...)``.
    for comment in soup.find_all(string=lambda node: isinstance(node, Comment)):
        comment.extract()

    for link in soup.find_all('link', {'type': 'text/css'}):
        link.decompose()


 def _to_ascii(text):
    """Return *text* with typographic characters transliterated and the rest stripped."""
    return _NON_ASCII_RE.sub('', text.translate(_ASCII_TRANSLATION))


 def _is_utf8(value):
    """Return True when a ``charset`` attribute value names UTF-8 (any letter case)."""
    return isinstance(value, str) and value.strip().lower() == 'utf-8'


 def _ensure_utf8_meta(soup):
    """Insert ``<meta charset="utf-8">`` into *soup* unless one is already present."""
    # Case-insensitive check: ``<meta charset="UTF-8">`` must not get a duplicate.
    if soup.find('meta', charset=_is_utf8):
        return

    meta_tag = soup.new_tag('meta', charset='utf-8')
    if soup.head:
        soup.head.insert(0, meta_tag)
        return

    # No <head>: create one and put it at the top of <html>, or of the
    # document itself when there is no <html> element either.
    head = soup.new_tag('head')
    head.insert(0, meta_tag)
    if soup.html:
        soup.html.insert(0, head)
    else:
        soup.insert(0, head)

 def clean_directory(directory):
    """Run ``clean_html`` on every ``.html`` file found beneath *directory*.

    The tree is walked recursively with ``os.walk``. For each file whose name
    ends in ``.html`` a ``clean <name>`` line is printed and the file is
    passed to ``clean_html``; all other files are left untouched.

    Args:
        directory: Root directory to search.
    """
    for folder, _subdirs, names in os.walk(directory):
        for name in names:
            if not name.endswith('.html'):
                continue
            print(f"clean {name}")
            clean_html(os.path.join(folder, name))

 # Script entry: clean, in place, every .html file found under the directory
 # named below; edit the path to target a different directory.
 # NOTE: there is no ``if __name__ == "__main__"`` guard, so this call runs
 # whenever the module is executed or imported.
 clean_directory('./situational-awareness.ai/')
	import os
	from bs4 import BeautifulSoup, Comment
	import re


	def clean_html(file_path):
	    """Strip non-content markup from an HTML file and rewrite it as ASCII, in place.

	    The file at ``file_path`` is parsed with BeautifulSoup's ``html.parser``
	    and then:

	    1. ``script``, ``style``, ``nav``, ``img``, ``svg``, ``figure``, ``form``
	       and ``button`` elements, comments, and ``<link type="text/css">`` tags
	       are removed;
	    2. typographic characters are replaced by ASCII look-alikes and every
	       other non-ASCII or control character is dropped (tab, LF and CR are
	       kept so words separated only by a line break are not glued together);
	    3. ``header``, ``main``, ``article``, ``section``, ``footer``, ``nav`` and
	       ``aside`` elements are renamed to ``div``;
	    4. a ``<meta charset="utf-8">`` tag is added unless one already exists.

	    Args:
	        file_path: Path of the HTML file; it is read and overwritten as UTF-8.

	    Returns:
	        None. The cleaned document is written back to ``file_path``.
	    """
	    with open(file_path, 'r', encoding='utf-8') as source:
	        soup = BeautifulSoup(source, 'html.parser')

	    _strip_unwanted_markup(soup)

	    # Serialise, clean the text, then re-parse so the remaining steps operate
	    # on the ASCII-only document.
	    soup = BeautifulSoup(_to_ascii(str(soup)), 'html.parser')

	    for tag in soup.find_all(_SECTIONING_TAGS):
	        tag.name = 'div'

	    _ensure_utf8_meta(soup)

	    with open(file_path, 'w', encoding='utf-8') as target:
	        target.write(str(soup))


	# Elements removed outright, in removal order.
	_REMOVED_TAGS = ['script', 'style', 'nav', 'img', 'svg', 'figure', 'form', 'button']

	# Elements renamed to a plain <div>.
	_SECTIONING_TAGS = ['header', 'main', 'article', 'section', 'footer', 'nav', 'aside']

	# Single-pass transliteration table. Keys are written as escapes so that
	# invisible characters (notably the no-break space) cannot be silently
	# flattened by an editor or a copy/paste.
	_ASCII_TRANSLATION = str.maketrans({
	    '\u00a0': ' ',     # No-break space
	    '\u2019': "'",     # Right single quotation mark
	    '\u2018': "'",     # Left single quotation mark
	    '\u201a': ',',     # Single low-9 quotation mark
	    '\u201c': '"',     # Left double quotation mark
	    '\u201d': '"',     # Right double quotation mark
	    '\u00ab': '"',     # Left-pointing double angle quotation mark
	    '\u00bb': '"',     # Right-pointing double angle quotation mark
	    '\u2039': "'",     # Single left-pointing angle quotation mark
	    '\u203a': "'",     # Single right-pointing angle quotation mark
	    '\u2014': '-',     # Em dash
	    '\u2013': '-',     # En dash
	    '\u2022': '-',     # Bullet
	    '\u2026': '...',   # Horizontal ellipsis
	    '\u2122': '(TM)',  # Trademark
	    '\u00ae': '(R)',   # Registered trademark
	    '\u00a9': '(C)',   # Copyright
	    '\u20ac': 'EUR',   # Euro sign
	    '\u00a3': 'GBP',   # Pound sign
	    '\u00a5': 'JPY',   # Yen sign
	    '\u00a7': 'S',     # Section sign
	    '\u00b6': 'P',     # Pilcrow sign
	    '\u2020': '+',     # Dagger
	    '\u2021': '++',    # Double dagger
	    '\u2030': 'o/oo',  # Per mille sign
	})

	# Everything outside printable ASCII, except tab / LF / CR: those are
	# significant whitespace in HTML text and must survive the clean-up.
	_NON_ASCII_RE = re.compile(r'[^\t\n\r\x20-\x7E]')


	def _strip_unwanted_markup(soup):
	    """Remove non-content elements, comments and CSS links from *soup* in place."""
	    # One query per tag name, so elements nested in a later-removed parent
	    # (e.g. an <img> inside a <figure>) are gone before the parent is destroyed.
	    for tag_name in _REMOVED_TAGS:
	        for element in soup.find_all(tag_name):
	            element.decompose()

	    # ``find_all(string=...)`` is the current spelling of the deprecated
	    # ``findAll(text=...)``.
	    for comment in soup.find_all(string=lambda node: isinstance(node, Comment)):
	        comment.extract()

	    for link in soup.find_all('link', {'type': 'text/css'}):
	        link.decompose()


	def _to_ascii(text):
	    """Return *text* with typographic characters transliterated and the rest stripped."""
	    return _NON_ASCII_RE.sub('', text.translate(_ASCII_TRANSLATION))


	def _is_utf8(value):
	    """Return True when a ``charset`` attribute value names UTF-8 (any letter case)."""
	    return isinstance(value, str) and value.strip().lower() == 'utf-8'


	def _ensure_utf8_meta(soup):
	    """Insert ``<meta charset="utf-8">`` into *soup* unless one is already present."""
	    # Case-insensitive check: ``<meta charset="UTF-8">`` must not get a duplicate.
	    if soup.find('meta', charset=_is_utf8):
	        return

	    meta_tag = soup.new_tag('meta', charset='utf-8')
	    if soup.head:
	        soup.head.insert(0, meta_tag)
	        return

	    # No <head>: create one and put it at the top of <html>, or of the
	    # document itself when there is no <html> element either.
	    head = soup.new_tag('head')
	    head.insert(0, meta_tag)
	    if soup.html:
	        soup.html.insert(0, head)
	    else:
	        soup.insert(0, head)

	def clean_directory(directory):
	    """Run ``clean_html`` on every ``.html`` file found beneath *directory*.

	    The tree is walked recursively with ``os.walk``. For each file whose name
	    ends in ``.html`` a ``clean <name>`` line is printed and the file is
	    passed to ``clean_html``; all other files are left untouched.

	    Args:
	        directory: Root directory to search.
	    """
	    # Nesting restored: the loops and the condition had lost their
	    # indentation, which made the block a syntax error.
	    for root, _, files in os.walk(directory):
	        for file in files:
	            if file.endswith('.html'):
	                print(f"clean {file}")
	                clean_html(os.path.join(root, file))

	# Script entry: clean, in place, every .html file found under the directory
	# named below; edit the path to target a different directory.
	# NOTE: there is no ``if __name__ == "__main__"`` guard, so this call runs
	# whenever the module is executed or imported.
	clean_directory('./situational-awareness.ai/')