greg-randall · July 1, 2025 14:23
diff --git a/index.php b/index.php
 <?php
 /*
 Note that the cleaner formats the cleaned html with PHP's tidy extension (see beautify_html()), not DirtyMarkup.
 
 
 Example input:	
  <div class=WordSection1>	
  <p class=MsoNormal align=center style='text-align:center'><span	
  style='font-size:16.0pt;line-height:107%;font-family:"Abadi Extra Light",sans-serif'>Test	
  Clean<o:p></o:p></span></p>	
  <p class=MsoNormal><span style='font-size:12.0pt;line-height:107%;mso-bidi-font-family:	
  Calibri;mso-bidi-theme-font:minor-latin'>Test Paragraph, qwerty <span	
  class=SpellE>qwerty</span> <span class=SpellE>qwerty</span> <span class=SpellE>qwerty</span>	
  <span class=SpellE>qwerty</span> <span class=SpellE>qwerty</span> <span	
  class=SpellE>qwerty</span> <span class=SpellE>qwerty</span> <span class=SpellE>qwerty</span>.<o:p></o:p></span></p>	
  <p class=MsoNormal><o:p>&nbsp;</o:p></p>	
  <p class=MsoNormal><o:p>&nbsp;</o:p></p>	
  </div>	
 Example Output:	
  <p>Test Clean</p>	
  <p>Test Paragraph, qwerty qwerty qwerty qwerty qwerty qwerty qwerty qwerty qwerty.</p> 
 */

 /* configuration */

 // Set to true to have progress messages echoed inside a <pre> block.
 $debug = false;

 // Attribute names that are kept on elements, e.g. the href in <a href="www.asdf.com">;
 // every other attribute is stripped.
 $allowed_attribute = ['content', 'http-equiv', 'src', 'href', 'alt', 'colspan', 'rowspan', 'id'];

 // Tags that are removed while the content inside them is preserved.
 $tags_to_remove_and_keep_content = ['div', 'span', 'figure', 'font', 'section', 'aside', 'article'];

 // Tags that are removed completely, including all content inside them.
 $tags_to_remove_with_content = ['style', 'script', 'link'];

 // Feature switches.
 $remove_fancy_quotes       = true;  // curly quotes, primes and backticks become ' or "
 $remove_fancy_spaces       = true;  // &nbsp;, &thinsp; and similar become a regular space
 $remove_fancy_dashes       = true;  // em dashes, en dashes and similar become a regular dash
 $remove_empy_td            = false; // false keeps empty table cells, true lets them be removed
 $convert_chars_to_entities = true;  // re-serialises the input through DOMDocument so characters in text come out as html entities, e.g. & as &amp;
 $decode_safelinks          = true;  // decodes Microsoft SafeLinks URLs

 /* end configuration */

 if ($debug) {
    echo "<pre><strong>DEBUG MODE ON</strong>\n\n";
 }

 // Work out what to clean: $html_to_process receives the markup and
 // $is_processing_needed records whether anything usable was submitted.
 $html_to_process = '';
 $is_processing_needed = false;

 // Check for file upload first. The file upload takes priority.
 if (isset($_FILES['htmlFile']) && $_FILES['htmlFile']['error'] === UPLOAD_ERR_OK) {
    if ($debug) { echo "DEBUG: File upload detected.\n"; }
    // file_get_contents() returns false when the temp file cannot be read;
    // only accept a real string so the string functions further down never see false.
    $uploaded_html = file_get_contents($_FILES['htmlFile']['tmp_name']);
    if ($uploaded_html !== false) {
        $html_to_process = $uploaded_html;
        $is_processing_needed = true;
    } elseif ($debug) {
        echo "DEBUG: Uploaded file could not be read.\n";
    }
 }
 // If no file was uploaded, check for input in the textarea.
 // is_string() rejects array-style submissions (input[]=...), which the string handling cannot process.
 elseif (!empty($_POST["input"]) && is_string($_POST["input"])) {
    if ($debug) { echo "DEBUG: Textarea input detected.\n"; }
    $html_to_process = $_POST["input"];
    $is_processing_needed = true;
 }

 if ($debug) {
    echo "DEBUG: is_processing_needed = " . ($is_processing_needed ? 'true' : 'false') . "\n";
    if ($is_processing_needed) {
        echo "DEBUG: Initial HTML to process:\n" . htmlspecialchars(substr($html_to_process, 0, 1000)) . "...\n\n";
    }
 }


 if (!$is_processing_needed) { // If there's no content to clean, show the form to upload or paste content.
   ?>
   Clean:
    <form action="index.php" method="post" enctype="multipart/form-data">
    Select HTML file to upload:
    <input type="file" name="htmlFile" id="htmlFile">
    <br><br>
    Or paste HTML code:
    <br>
    <textarea name="input" rows="50" cols="160"></textarea><br><br>
    <input type="submit" value="Clean HTML" name="submit">
    </form>
    <?php 
    if ($debug) { echo "</pre>"; }
    exit();


  } else { //if there is content let's process it
    
    // DOMDocument is noisy on real-world markup, so from here on only fatal and parse errors are reported.
    error_reporting(E_ERROR | E_PARSE);

    $html = $html_to_process;

    // The input counts as a full document only when both an "<html" and a "<body" tag are present.
    $has_html_tag = stripos($html, "<html") !== false;
    $has_body_tag = stripos($html, "<body") !== false;
    $html_fragment = ($has_html_tag && $has_body_tag) ? "full" : "fragment";
    if ($debug) {
        echo "DEBUG: html_fragment set to: $html_fragment\n";
    }

    if ($convert_chars_to_entities) {
        // Round trip through DOMDocument so characters in the text come back as html entities.
        // The xml prolog prefix is the usual hint that makes the parser read the input as UTF-8;
        // @ silences the warnings malformed markup would otherwise raise.
        $doc = new DOMDocument();
        $parse_flags = LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD;
        @$doc->loadHTML('<?xml encoding="UTF-8">' . $html, $parse_flags);
        $html = $doc->saveHTML();
        if ($debug) {
            $preview = htmlspecialchars(substr($html, 0, 1000));
            echo "DEBUG: After loading into DOMDocument and saving:\n" . $preview . "...\n\n";
        }
    }

    if ($remove_fancy_spaces) {
        // Each Unicode space is listed next to its numeric / named entity.
        // The characters themselves (U+2000-U+200A and the zero-width space U+200B) are
        // written as \u{...} escapes: as literal characters they are invisible and easily
        // turn into plain spaces or vanish when the file is copied or re-encoded.
        // NOTE(review): the code points are inferred from the entity that follows each
        // literal - confirm against the original file.
        $fancy_spaces = [
            "\u{2000}", "&#8192;",
            "\u{2001}", "&#8193;",
            "\u{2002}", "&#8194;", "&ensp;",
            "\u{2003}", "&#8195;", "&emsp;",
            "\u{2004}", "&#8196;",
            "\u{2005}", "&#8197;",
            "\u{2006}", "&#8198;",
            "\u{2007}", "&#8199;",
            "\u{2008}", "&#8200;",
            "\u{2009}", "&#8201;", "&thinsp;",
            "\u{200A}", "&#8202;",
            "\u{200B}", "&#8203;",
            "&#160;", "&nbsp;",
        ];
        $html = str_ireplace($fancy_spaces, " ", $html); // change fancy spaces to regular spaces
        $html = preg_replace("/\s+/", " ", $html); // collapse every run of whitespace into one space
        if ($debug) { echo "DEBUG: After removing fancy spaces.\n"; }
    }

    if (!$remove_empy_td) {
        // Park a marker in cells that would otherwise be empty so the empty-element sweep
        // that follows the DOM pass leaves them alone; the marker is stripped again afterwards.
        $html = preg_replace("/\> ?<\/td>/", ">~~..~~</td>", $html);
        $html = preg_replace("/\> ?<\/th>/", ">~~..~~</th>", $html);
    }

    // Parse the markup; no implied html/body wrapper and no default doctype are added.
    $dom = new DOMDocument();
    @$dom->loadHTML($html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
    $xpath = new DOMXPath($dom);

    // Visit every element and drop each attribute that is not on the allow list.
    $elements = $xpath->query("//*");
    foreach ($elements as $element) {
        // Collect the names first so removals cannot disturb the walk over the live attribute map.
        $attribute_names = [];
        foreach ($element->attributes as $attribute) {
            $attribute_names[] = $attribute->name;
        }
        foreach ($attribute_names as $name) {
            if (in_array($name, $allowed_attribute)) {
                continue;
            }
            $element->removeAttribute($name);
        }
    }

    // Pass 1: elements listed in $tags_to_remove_with_content vanish together with their content.
    $xpath_tags_to_nuke = array_map(
        function ($tag) { return "//$tag"; },
        $tags_to_remove_with_content
    );
    $nuke_query = implode(" | ", $xpath_tags_to_nuke);
    if ($debug) {
        echo "DEBUG: XPath query for nuking tags: " . $nuke_query . "\n";
    }
    foreach ($xpath->query($nuke_query) as $remove) {
        $remove->parentNode->removeChild($remove);
    }

    // Pass 2: elements listed in $tags_to_remove_and_keep_content are dissolved,
    // their children take their place in the parent.
    $xpath_tags_to_remove = array_map(
        function ($tag) { return "//$tag"; },
        $tags_to_remove_and_keep_content
    );
    $remove_query = implode(" | ", $xpath_tags_to_remove);
    if ($debug) {
        echo "DEBUG: XPath query for removing wrapper tags: " . $remove_query . "\n";
    }
    foreach ($xpath->query($remove_query) as $remove) {
        $parent = $remove->parentNode;
        // Hoist the children one by one, keeping their order, to just before the wrapper.
        for ($child = $remove->firstChild; $child !== null; $child = $remove->firstChild) {
            $parent->insertBefore($remove->removeChild($child), $remove);
        }
        $parent->removeChild($remove);
    }


    // Sweep out empty elements: no child elements, no attributes and no non-blank text.
    // The query is re-run until nothing matches, because deleting a node can leave its parent empty.
    // <br> and <hr> are exempt: they never have content, so they would match every time.
    $empty_query = "//*[not(*) and not(@*) and not(text()[normalize-space()]) and not(self::br) and not(self::hr)]";
    while (($node_list = $xpath->query($empty_query)) && $node_list->length) {
        foreach ($node_list as $node) {
            $node->parentNode->removeChild($node);
        }
    }

    // Remove every comment node.
    $commentNodes = $xpath->query('//comment()');
    foreach ($commentNodes as $commentNode) {
        $commentNode->parentNode->removeChild($commentNode);
    }

    // Serialise the cleaned DOM back to a string; the remaining steps are plain text replacements.
    $clean = $dom->saveHTML();
    if ($debug) { echo "DEBUG: After DOM processing and tag removal:\n" . htmlspecialchars(substr($clean, 0, 1000)) . "...\n\n"; }


    if ($remove_fancy_quotes) {
        // Entity names are case-sensitive: &prime; is a single prime, &Prime; a double prime.
        // A case-insensitive replace turned &Prime; into an apostrophe before the
        // double-quote pass could see it, so str_replace() is used for both passes.
        // U+2019 / U+2018 are the right / left single curly quotes.
        $clean = str_replace(
            ["&iuml;&iquest;&frac12;", "&lsquo;", "&rsquo;", "&#8216;", "&#8217;", "&apos;", "&prime;", "&#8242;", "\u{2019}", "\u{2018}", "`"],
            "'",
            $clean
        );
        // U+201D / U+201C are the right / left double curly quotes; two apostrophes in a row
        // are also turned into one double quote.
        // NOTE(review): "&quot;" is rewritten inside attribute values as well, where a raw
        // double quote may end the attribute early - confirm this is acceptable.
        $clean = str_replace(
            ["&ldquo;", "&rdquo;", "&#8220;", "&#8221;", "&quot;", "&Prime;", "&#8243;", "\u{201D}", "\u{201C}", "''"],
            '"',
            $clean
        );
    }
    if ($remove_fancy_dashes) {
        // U+2011 non-breaking hyphen, U+2012 figure dash, U+2013 en dash, U+2014 em dash, U+2015 horizontal bar.
        $clean = str_replace(
            ["&#8208;", "\u{2011}", "&#8209;", "\u{2012}", "&#8210;", "\u{2013}", "&#8211;", "&ndash;", "\u{2014}", "&#8212;", "&mdash;", "\u{2015}", "&#8213;"],
            "-",
            $clean
        );
    }

    if (!$remove_empy_td) {
        // Take the empty-cell markers back out now that the empty-element sweep is over.
        $clean = preg_replace("/~~\.\.~~/", "", $clean);
    }

    if ($decode_safelinks) {
        $clean = decodeSafeLinks($clean);
    }
    
    if ($debug) {
        echo "DEBUG: Final clean before beautify:\n" . htmlspecialchars(substr($clean, 0, 1000)) . "...\n\n";
        echo "</pre>";
    }

    // Format once and reuse the result: the textarea shows it escaped, the preview renders it
    // (with a visible border added to plain <table> tags).
    $beautified = beautify_html($clean, $html_fragment);

    echo "
    <textarea rows=\"10\" cols=\"160\">".htmlspecialchars($beautified)."</textarea>
    <br><br><button id='copyButton'>Copy to clipboard</button>
    <br><br><hr><br><h1>Cleaned Code Preview</h1><br><hr><br>\n". str_ireplace("<table>","<table border='2'>", $beautified)."\n
    <script>
        // Select the button
        var button = document.querySelector('#copyButton');
    
        // Add a click event listener to the button
        button.addEventListener('click', function() {
            // Select the textarea
            var textarea = document.querySelector('textarea');
    
            // Copy the contents of the textarea to the clipboard
            textarea.select();
            document.execCommand('copy');

            // Optional: visual feedback
            button.textContent = 'Copied!';
            setTimeout(() => { button.textContent = 'Copy to clipboard'; }, 2000);
        });
    </script>";
 } //displays the cleaned html directly in the page for easy copy and paste


 /**
  * Replaces quoted Microsoft SafeLinks URLs with the target URL they wrap.
  *
  * A quoted https://<region>.safelinks.protection.outlook.com/?url=... value is matched,
  * its url parameter is URL-decoded and written back as a double-quoted, HTML-escaped value.
  *
  * @param string $html Markup that may contain SafeLinks URLs.
  * @return string The markup with SafeLinks URLs unwrapped, or the input unchanged
  *                when the regex engine reports an error.
  */
 function decodeSafeLinks($html) {
    // The host prefix names the region (nam12, eur03, ...), so any alphanumeric prefix is accepted.
    // The captured url parameter stops at the first & or quote so it cannot run past the closing quote.
    $pattern = '/["\']https:\/\/[a-z0-9]+\.safelinks\.protection\.outlook\.com\/\?url=([^&"\']+)&[^"\']*["\']/';

    $callback = function ($matches) {
        $target = urldecode($matches[1]);
        // The decoded URL may contain & or quote characters; escape it so it stays inside the quoted value.
        return '"' . htmlspecialchars($target, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . '"';
    };

    // preg_replace_callback() returns null on a PCRE error; keep the original markup in that case.
    $result = preg_replace_callback($pattern, $callback, $html);
    return $result === null ? $html : $result;
 }



 /**
  * Pretty-prints markup with the tidy extension.
  *
  * @param string $html          Markup to format.
  * @param string $html_fragment 'fragment' to output the body content only; any other value
  *                              formats a whole document.
  * @return string The formatted markup, or $html unchanged when tidy is not available
  *                or cannot parse the input.
  */
 function beautify_html($html, $html_fragment)
 {
    // The debug switch lives in the global scope; treat a missing value as "off".
    $debug = !empty($GLOBALS['debug']);

    $options = [
        'indent' => true,
        'indent-spaces' => 2,
        'clean' => true,
        'output-xhtml' => true,
        'hide-comments' => true,
        'wrap' => 0,
    ];

    if ($html_fragment === 'fragment') {
        if ($debug) { echo "DEBUG: Beautifying as fragment (show-body-only).\n"; }
        $options['show-body-only'] = true;
    } else {
        if ($debug) { echo "DEBUG: Beautifying as full document.\n"; }
    }

    // Without ext-tidy "new tidy()" is a fatal error; hand the markup back unformatted instead.
    if (!class_exists('tidy', false)) {
        if ($debug) { echo "DEBUG: tidy extension not loaded, returning markup unformatted.\n"; }
        return $html;
    }

    $tidy = new tidy();
    // parseString() reports failure through its return value.
    if (!$tidy->parseString($html, $options, 'utf8')) {
        return $html;
    }
    $tidy->cleanRepair();
    return (string)$tidy;
 }
 ?>
	<?php
	/*
	Note that the cleaner formats the cleaned html with PHP's tidy extension (see beautify_html()), not DirtyMarkup.


	Example input:
	<div class=WordSection1>
	<p class=MsoNormal align=center style='text-align:center'><span
	style='font-size:16.0pt;line-height:107%;font-family:"Abadi Extra Light",sans-serif'>Test
	Clean<o:p></o:p></span></p>
	<p class=MsoNormal><span style='font-size:12.0pt;line-height:107%;mso-bidi-font-family:
	Calibri;mso-bidi-theme-font:minor-latin'>Test Paragraph, qwerty <span
	class=SpellE>qwerty</span> <span class=SpellE>qwerty</span> <span class=SpellE>qwerty</span>
	<span class=SpellE>qwerty</span> <span class=SpellE>qwerty</span> <span
	class=SpellE>qwerty</span> <span class=SpellE>qwerty</span> <span class=SpellE>qwerty</span>.<o:p></o:p></span></p>
	<p class=MsoNormal><o:p> </o:p></p>
	<p class=MsoNormal><o:p> </o:p></p>
	</div>
	Example Output:
	<p>Test Clean</p>
	<p>Test Paragraph, qwerty qwerty qwerty qwerty qwerty qwerty qwerty qwerty qwerty.</p>
	*/

	/* configuration */

	// Set to true to have progress messages echoed inside a <pre> block.
	$debug = false;

	// Attribute names that are kept on elements, e.g. the href in <a href="www.asdf.com">;
	// every other attribute is stripped.
	$allowed_attribute = ['content', 'http-equiv', 'src', 'href', 'alt', 'colspan', 'rowspan', 'id'];

	// Tags that are removed while the content inside them is preserved.
	$tags_to_remove_and_keep_content = ['div', 'span', 'figure', 'font', 'section', 'aside', 'article'];

	// Tags that are removed completely, including all content inside them.
	$tags_to_remove_with_content = ['style', 'script', 'link'];

	// Feature switches.
	$remove_fancy_quotes       = true;  // curly quotes, primes and backticks become ' or "
	$remove_fancy_spaces       = true;  // &nbsp;, &thinsp; and similar become a regular space
	$remove_fancy_dashes       = true;  // em dashes, en dashes and similar become a regular dash
	$remove_empy_td            = false; // false keeps empty table cells, true lets them be removed
	$convert_chars_to_entities = true;  // re-serialises the input through DOMDocument so characters in text come out as html entities, e.g. & as &amp;
	$decode_safelinks          = true;  // decodes Microsoft SafeLinks URLs

	/* end configuration */

	if ($debug) {
	    echo "<pre><strong>DEBUG MODE ON</strong>\n\n";
	}

	// Work out what to clean: $html_to_process receives the markup and
	// $is_processing_needed records whether anything usable was submitted.
	$html_to_process = '';
	$is_processing_needed = false;

	// Check for file upload first. The file upload takes priority.
	if (isset($_FILES['htmlFile']) && $_FILES['htmlFile']['error'] === UPLOAD_ERR_OK) {
	    if ($debug) { echo "DEBUG: File upload detected.\n"; }
	    // file_get_contents() returns false when the temp file cannot be read;
	    // only accept a real string so the string functions further down never see false.
	    $uploaded_html = file_get_contents($_FILES['htmlFile']['tmp_name']);
	    if ($uploaded_html !== false) {
	        $html_to_process = $uploaded_html;
	        $is_processing_needed = true;
	    } elseif ($debug) {
	        echo "DEBUG: Uploaded file could not be read.\n";
	    }
	}
	// If no file was uploaded, check for input in the textarea.
	// is_string() rejects array-style submissions (input[]=...), which the string handling cannot process.
	elseif (!empty($_POST["input"]) && is_string($_POST["input"])) {
	    if ($debug) { echo "DEBUG: Textarea input detected.\n"; }
	    $html_to_process = $_POST["input"];
	    $is_processing_needed = true;
	}

	if ($debug) {
	    echo "DEBUG: is_processing_needed = " . ($is_processing_needed ? 'true' : 'false') . "\n";
	    if ($is_processing_needed) {
	        echo "DEBUG: Initial HTML to process:\n" . htmlspecialchars(substr($html_to_process, 0, 1000)) . "...\n\n";
	    }
	}


	if (!$is_processing_needed) { // If there's no content to clean, show the form to upload or paste content.
	?>
	Clean:
	<form action="index.php" method="post" enctype="multipart/form-data">
	Select HTML file to upload:
	<input type="file" name="htmlFile" id="htmlFile">
	<br><br>
	Or paste HTML code:
	<br>
	<textarea name="input" rows="50" cols="160"></textarea><br><br>
	<input type="submit" value="Clean HTML" name="submit">
	</form>
	<?php
	if ($debug) { echo "</pre>"; }
	exit();


	} else { //if there is content let's process it

	// DOMDocument is noisy on real-world markup, so from here on only fatal and parse errors are reported.
	// The flags are combined with a plain bitwise OR; a backslash in front of the | is a syntax error.
	error_reporting(E_ERROR | E_PARSE);

	$html = $html_to_process;

	// The input counts as a full document only when both an "<html" and a "<body" tag are present.
	if (stripos($html, "<html") !== false && stripos($html, "<body") !== false) {
	    $html_fragment = "full";
	} else {
	    $html_fragment = "fragment";
	}
	if ($debug) { echo "DEBUG: html_fragment set to: $html_fragment\n"; }

	if ($convert_chars_to_entities) {
	    // Round trip through DOMDocument so characters in the text come back as html entities.
	    // The xml prolog prefix is the usual hint that makes the parser read the input as UTF-8;
	    // @ silences the warnings malformed markup would otherwise raise.
	    $doc = new DOMDocument();
	    @$doc->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
	    $html = $doc->saveHTML();
	    if ($debug) { echo "DEBUG: After loading into DOMDocument and saving:\n" . htmlspecialchars(substr($html, 0, 1000)) . "...\n\n"; }
	}

	if ($remove_fancy_spaces) {
	    // Each Unicode space is listed next to its numeric / named entity.
	    // The characters themselves (U+2000-U+200A and the zero-width space U+200B) are
	    // written as \u{...} escapes: as literal characters they are invisible and easily
	    // turn into plain spaces or vanish when the file is copied or re-encoded.
	    // NOTE(review): the code points and entity names are reconstructed from the
	    // numeric ranges - confirm against the original file.
	    $fancy_spaces = [
	        "\u{2000}", "&#8192;",
	        "\u{2001}", "&#8193;",
	        "\u{2002}", "&#8194;", "&ensp;",
	        "\u{2003}", "&#8195;", "&emsp;",
	        "\u{2004}", "&#8196;",
	        "\u{2005}", "&#8197;",
	        "\u{2006}", "&#8198;",
	        "\u{2007}", "&#8199;",
	        "\u{2008}", "&#8200;",
	        "\u{2009}", "&#8201;", "&thinsp;",
	        "\u{200A}", "&#8202;",
	        "\u{200B}", "&#8203;",
	        "&#160;", "&nbsp;",
	    ];
	    $html = str_ireplace($fancy_spaces, " ", $html); // change fancy spaces to regular spaces
	    $html = preg_replace("/\s+/", " ", $html); // collapse every run of whitespace into one space
	    if ($debug) { echo "DEBUG: After removing fancy spaces.\n"; }
	}

	if (!$remove_empy_td) {
	    // Park a marker in cells that would otherwise be empty so the empty-element sweep
	    // that follows the DOM pass leaves them alone; the marker is stripped again afterwards.
	    $html = preg_replace("/\> ?<\/td>/", ">~~..~~</td>", $html);
	    $html = preg_replace("/\> ?<\/th>/", ">~~..~~</th>", $html);
	}

	// Parse the markup; no implied html/body wrapper and no default doctype are added.
	// The two libxml flags are combined with a plain bitwise OR.
	$dom = new DOMDocument();
	@$dom->loadHTML($html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
	$xpath = new DOMXPath($dom);

	// Visit every element and drop each attribute that is not on the allow list.
	$elements = $xpath->query("//*");
	foreach ($elements as $element) {
	    // Walk the attributes backwards so removing one does not shift the ones still to be checked.
	    for ($i = $element->attributes->length; --$i >= 0; ) {
	        $name = $element->attributes->item($i)->name;
	        if (!in_array($name, $allowed_attribute)) {
	            $element->removeAttribute($name);
	        }
	    }
	}

	// First, nuke the tags that should be completely removed (tag and content).
	// The individual "//tag" paths are joined with the XPath union operator " | ";
	// a backslash in the separator makes the expression invalid, so nothing would be removed.
	$xpath_tags_to_nuke = [];
	foreach ($tags_to_remove_with_content as $tag) {
	    $xpath_tags_to_nuke[] = "//$tag";
	}
	$nuke_query = implode(" | ", $xpath_tags_to_nuke);
	if ($debug) { echo "DEBUG: XPath query for nuking tags: " . $nuke_query . "\n"; }
	foreach ($xpath->query($nuke_query) as $remove) {
	    $remove->parentNode->removeChild($remove);
	}

	// Then, remove wrapper tags but preserve their content.
	$xpath_tags_to_remove = [];
	foreach ($tags_to_remove_and_keep_content as $tag) {
	    $xpath_tags_to_remove[] = "//$tag";
	}
	$remove_query = implode(" | ", $xpath_tags_to_remove);
	if ($debug) { echo "DEBUG: XPath query for removing wrapper tags: " . $remove_query . "\n"; }
	foreach ($xpath->query($remove_query) as $remove) {
	    // Move all tag content to its parent node just before it, keeping the order.
	    while ($remove->hasChildNodes()) {
	        $child = $remove->removeChild($remove->firstChild);
	        $remove->parentNode->insertBefore($child, $remove);
	    }
	    $remove->parentNode->removeChild($remove);
	}


	// Sweep out empty elements: no child elements, no attributes and no non-blank text.
	// The query is re-run until nothing matches, because deleting a node can leave its parent empty.
	// <br> and <hr> are exempt: they never have content, so they would match every time.
	$empty_query = "//*[not(*) and not(@*) and not(text()[normalize-space()]) and not(self::br) and not(self::hr)]";
	while (($node_list = $xpath->query($empty_query)) && $node_list->length) {
	    foreach ($node_list as $node) {
	        $node->parentNode->removeChild($node);
	    }
	}

	// Remove every comment node.
	$commentNodes = $xpath->query('//comment()');
	foreach ($commentNodes as $commentNode) {
	    $commentNode->parentNode->removeChild($commentNode);
	}

	// Serialise the cleaned DOM back to a string; the remaining steps are plain text replacements.
	$clean = $dom->saveHTML();
	if ($debug) { echo "DEBUG: After DOM processing and tag removal:\n" . htmlspecialchars(substr($clean, 0, 1000)) . "...\n\n"; }


	if ($remove_fancy_quotes) {
	    // The needles are entity names plus the characters themselves (as \u{...} escapes).
	    // Entity names are case-sensitive: &prime; is a single prime, &Prime; a double prime,
	    // so the replacement is case-sensitive too.
	    // U+2019 / U+2018 are the right / left single curly quotes.
	    $clean = str_replace(
	        ["&iuml;&iquest;&frac12;", "&lsquo;", "&rsquo;", "&#8216;", "&#8217;", "&apos;", "&prime;", "&#8242;", "\u{2019}", "\u{2018}", "`"],
	        "'",
	        $clean
	    );
	    // U+201D / U+201C are the right / left double curly quotes; two apostrophes in a row
	    // are also turned into one double quote.
	    // NOTE(review): "&quot;" is rewritten inside attribute values as well, where a raw
	    // double quote may end the attribute early - confirm this is acceptable.
	    $clean = str_replace(
	        ["&ldquo;", "&rdquo;", "&#8220;", "&#8221;", "&quot;", "&Prime;", "&#8243;", "\u{201D}", "\u{201C}", "''"],
	        '"',
	        $clean
	    );
	}
	if ($remove_fancy_dashes) {
	    // U+2011 non-breaking hyphen, U+2012 figure dash, U+2013 en dash, U+2014 em dash, U+2015 horizontal bar.
	    $clean = str_replace(
	        ["&#8208;", "\u{2011}", "&#8209;", "\u{2012}", "&#8210;", "\u{2013}", "&#8211;", "&ndash;", "\u{2014}", "&#8212;", "&mdash;", "\u{2015}", "&#8213;"],
	        "-",
	        $clean
	    );
	}

	if (!$remove_empy_td) {
	    // Take the empty-cell markers back out now that the empty-element sweep is over.
	    $clean = preg_replace("/~~\.\.~~/", "", $clean);
	}

	if ($decode_safelinks) {
	    $clean = decodeSafeLinks($clean);
	}

	if ($debug) {
	    echo "DEBUG: Final clean before beautify:\n" . htmlspecialchars(substr($clean, 0, 1000)) . "...\n\n";
	    echo "</pre>";
	}

	// Format once and reuse the result: the textarea shows it escaped, the preview renders it
	// (with a visible border added to plain <table> tags).
	$beautified = beautify_html($clean, $html_fragment);

	echo "
	<textarea rows=\"10\" cols=\"160\">".htmlspecialchars($beautified)."</textarea>
	<br><br><button id='copyButton'>Copy to clipboard</button>
	<br><br><hr><br><h1>Cleaned Code Preview</h1><br><hr><br>\n". str_ireplace("<table>","<table border='2'>", $beautified)."\n
	<script>
	// Select the button
	var button = document.querySelector('#copyButton');

	// Add a click event listener to the button
	button.addEventListener('click', function() {
	// Select the textarea
	var textarea = document.querySelector('textarea');

	// Copy the contents of the textarea to the clipboard
	textarea.select();
	document.execCommand('copy');

	// Optional: visual feedback
	button.textContent = 'Copied!';
	setTimeout(() => { button.textContent = 'Copy to clipboard'; }, 2000);
	});
	</script>";
	} //displays the cleaned html directly in the page for easy copy and paste


	/**
	 * Replaces quoted Microsoft SafeLinks URLs with the target URL they wrap.
	 *
	 * A quoted https://<region>.safelinks.protection.outlook.com/?url=... value is matched,
	 * its url parameter is URL-decoded and written back as a double-quoted, HTML-escaped value.
	 *
	 * @param string $html Markup that may contain SafeLinks URLs.
	 * @return string The markup with SafeLinks URLs unwrapped, or the input unchanged
	 *                when the regex engine reports an error.
	 */
	function decodeSafeLinks($html) {
	    // The host prefix names the region (nam12, eur03, ...), so any alphanumeric prefix is accepted.
	    // The captured url parameter stops at the first & or quote so it cannot run past the closing quote.
	    $pattern = '/["\']https:\/\/[a-z0-9]+\.safelinks\.protection\.outlook\.com\/\?url=([^&"\']+)&[^"\']*["\']/';

	    $callback = function ($matches) {
	        $target = urldecode($matches[1]);
	        // The decoded URL may contain & or quote characters; escape it so it stays inside the quoted value.
	        return '"' . htmlspecialchars($target, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . '"';
	    };

	    // preg_replace_callback() returns null on a PCRE error; keep the original markup in that case.
	    $result = preg_replace_callback($pattern, $callback, $html);
	    return $result === null ? $html : $result;
	}



	/**
	 * Pretty-prints markup with the tidy extension.
	 *
	 * @param string $html          Markup to format.
	 * @param string $html_fragment 'fragment' to output the body content only; any other value
	 *                              formats a whole document.
	 * @return string The formatted markup, or $html unchanged when tidy is not available
	 *                or cannot parse the input.
	 */
	function beautify_html($html, $html_fragment)
	{
	    // The debug switch lives in the global scope; treat a missing value as "off".
	    $debug = !empty($GLOBALS['debug']);

	    $options = [
	        'indent' => true,
	        'indent-spaces' => 2,
	        'clean' => true,
	        'output-xhtml' => true,
	        'hide-comments' => true,
	        'wrap' => 0,
	    ];

	    if ($html_fragment === 'fragment') {
	        if ($debug) { echo "DEBUG: Beautifying as fragment (show-body-only).\n"; }
	        $options['show-body-only'] = true;
	    } else {
	        if ($debug) { echo "DEBUG: Beautifying as full document.\n"; }
	    }

	    // Without ext-tidy "new tidy()" is a fatal error; hand the markup back unformatted instead.
	    if (!class_exists('tidy', false)) {
	        if ($debug) { echo "DEBUG: tidy extension not loaded, returning markup unformatted.\n"; }
	        return $html;
	    }

	    $tidy = new tidy();
	    // parseString() reports failure through its return value.
	    if (!$tidy->parseString($html, $options, 'utf8')) {
	        return $html;
	    }
	    $tidy->cleanRepair();
	    return (string)$tidy;
	}
	?>
No results found