Last active
July 1, 2025 14:23
-
-
Save greg-randall/05adf2268c82c89543c159bc2742fce7 to your computer and use it in GitHub Desktop.
HTML Cleaner. Paste dirty HTML into a field (or upload an HTML file), and it removes nearly all HTML attributes (except the ones you want to keep -- src, href, alt, and a few others) and formats the result. Uses PHP's DOMDocument parser.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?php | |
| /* | |
| Note that the cleaner formats the cleaned HTML with PHP's tidy extension (see beautify_html). | |
| Example input: | |
| <div class=WordSection1> | |
| <p class=MsoNormal align=center style='text-align:center'><span | |
| style='font-size:16.0pt;line-height:107%;font-family:"Abadi Extra Light",sans-serif'>Test | |
| Clean<o:p></o:p></span></p> | |
| <p class=MsoNormal><span style='font-size:12.0pt;line-height:107%;mso-bidi-font-family: | |
| Calibri;mso-bidi-theme-font:minor-latin'>Test Paragraph, qwerty <span | |
| class=SpellE>qwerty</span> <span class=SpellE>qwerty</span> <span class=SpellE>qwerty</span> | |
| <span class=SpellE>qwerty</span> <span class=SpellE>qwerty</span> <span | |
| class=SpellE>qwerty</span> <span class=SpellE>qwerty</span> <span class=SpellE>qwerty</span>.<o:p></o:p></span></p> | |
| <p class=MsoNormal><o:p> </o:p></p> | |
| <p class=MsoNormal><o:p> </o:p></p> | |
| </div> | |
| Example Output: | |
| <p>Test Clean</p> | |
| <p>Test Paragraph, qwerty qwerty qwerty qwerty qwerty qwerty qwerty qwerty qwerty.</p> | |
| */ | |
/* configuration */

// Set to true to print DEBUG trace lines while the script runs.
$debug = false;

// Attributes that are kept on elements, e.g. the href in <a href="www.asdf.com">.
// Every attribute not named here is stripped.
$allowed_attribute = [
    'content',
    'http-equiv',
    'src',
    'href',
    'alt',
    'colspan',
    'rowspan',
    'id',
];

// Wrapper tags: the tag itself is dropped, whatever it contains is kept.
$tags_to_remove_and_keep_content = [
    'div',
    'span',
    'figure',
    'font',
    'section',
    'aside',
    'article',
];

// Tags that are deleted together with everything inside them.
$tags_to_remove_with_content = [
    'style',
    'script',
    'link',
];

// Replace curly / typographic quotes with plain ' and ".
$remove_fancy_quotes = true;

// Replace non-breaking and other special spaces with a regular space.
$remove_fancy_spaces = true;

// Replace em dashes, en dashes and similar with a regular hyphen.
$remove_fancy_dashes = true;

// false keeps empty table cells; true lets the empty-tag cleanup remove them.
$remove_empy_td = false;

// When true, the input is first round-tripped through DOMDocument
// (load + saveHTML) before any other processing.
$convert_chars_to_entities = true;

// Decode Microsoft SafeLinks URLs back to the address they wrap.
$decode_safelinks = true;

/* end configuration */
// Everything printed in debug mode is wrapped in a <pre> that is closed
// further down, once the request has been handled.
if ($debug) {
    echo "<pre><strong>DEBUG MODE ON</strong>\n\n";
}

// Raw HTML submitted by the user, and whether there is anything to clean.
$html_to_process = '';
$is_processing_needed = false;

// An uploaded file takes priority over pasted text.
if (isset($_FILES['htmlFile']) && $_FILES['htmlFile']['error'] === UPLOAD_ERR_OK) {
    if ($debug) { echo "DEBUG: File upload detected.\n"; }
    $uploaded_html = file_get_contents($_FILES['htmlFile']['tmp_name']);
    if ($uploaded_html !== false) {
        $html_to_process = $uploaded_html;
        $is_processing_needed = true;
    } elseif ($debug) {
        // A failed read must not hand `false` to the cleaner.
        echo "DEBUG: Uploaded file could not be read.\n";
    }
}

// No usable upload: fall back to the textarea. is_string() rejects array
// submissions (input[]=...), which the string functions used later cannot handle.
if (!$is_processing_needed && !empty($_POST["input"]) && is_string($_POST["input"])) {
    if ($debug) { echo "DEBUG: Textarea input detected.\n"; }
    $html_to_process = $_POST["input"];
    $is_processing_needed = true;
}

if ($debug) {
    echo "DEBUG: is_processing_needed = " . ($is_processing_needed ? 'true' : 'false') . "\n";
    if ($is_processing_needed) {
        // substr() is byte-based and may cut a multibyte character in half;
        // ENT_SUBSTITUTE keeps htmlspecialchars() from returning an empty string then.
        echo "DEBUG: Initial HTML to process:\n"
            . htmlspecialchars(substr($html_to_process, 0, 1000), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8')
            . "...\n\n";
    }
}
if (!$is_processing_needed) {
    // Nothing was submitted: show the upload / paste form and stop.
    ?>
Clean:
<form action="index.php" method="post" enctype="multipart/form-data">
    Select HTML file to upload:
    <input type="file" name="htmlFile" id="htmlFile">
    <br><br>
    Or paste HTML code:
    <br>
    <textarea name="input" rows="50" cols="160"></textarea><br><br>
    <input type="submit" value="Clean HTML" name="submit">
</form>
<?php
    if ($debug) { echo "</pre>"; }
    exit();
} else {
    // Content was submitted: clean it and print the result.

    // DOMDocument is noisy on malformed markup; only surface real errors.
    error_reporting(E_ERROR | E_PARSE);
    // Buffer libxml parse warnings instead of silencing loadHTML() with @.
    libxml_use_internal_errors(true);

    $parse_flags = LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD;

    // Short, HTML-escaped excerpt for debug output. ENT_SUBSTITUTE keeps the
    // excerpt visible even when substr() cuts a multibyte character in half.
    $debug_excerpt = static function ($markup) {
        return htmlspecialchars(substr($markup, 0, 1000), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    };

    $html = $html_to_process;

    // A full document contains both <html> and <body>; anything else is
    // treated as a fragment (this only affects how the result is beautified).
    $html_fragment = (stripos($html, "<html") !== false && stripos($html, "<body") !== false)
        ? "full"
        : "fragment";
    if ($debug) { echo "DEBUG: html_fragment set to: $html_fragment\n"; }

    if ($convert_chars_to_entities) {
        // Round-trip through DOMDocument. The XML encoding prefix is the usual
        // hint that the input is UTF-8; per the original author's note this
        // pass writes characters in the text back as HTML entities.
        $doc = new DOMDocument();
        $doc->loadHTML('<?xml encoding="UTF-8">' . $html, $parse_flags);
        libxml_clear_errors();
        $html = $doc->saveHTML();
        if ($debug) { echo "DEBUG: After loading into DOMDocument and saving:\n" . $debug_excerpt($html) . "...\n\n"; }
    }

    if ($remove_fancy_spaces) {
        // Special spaces, spelled as escapes so that no invisible character
        // has to survive copy/paste of this file.
        $fancy_spaces = array_fill_keys([
            "\u{00A0}", // no-break space
            "\u{1680}", // ogham space mark
            "\u{2000}", "\u{2001}", "\u{2002}", "\u{2003}", "\u{2004}", "\u{2005}",
            "\u{2006}", "\u{2007}", "\u{2008}", "\u{2009}", "\u{200A}", // en/em/thin/hair spaces
            "\u{200B}", // zero width space
            "\u{202F}", // narrow no-break space
            "\u{205F}", // medium mathematical space
            "\u{3000}", // ideographic space
            "\u{FEFF}", // zero width no-break space / BOM
        ], ' ');

        // Literal characters first ...
        $html = strtr($html, $fancy_spaces);
        // ... then the same characters written as named or numeric entities
        // (&nbsp;, &#160;, &#xA0;, ...), which is what the markup contains
        // after the round-trip above. Every other entity is left untouched.
        $without_space_entities = preg_replace_callback(
            '/&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);/',
            static function (array $match) use ($fancy_spaces) {
                $character = html_entity_decode($match[0], ENT_QUOTES | ENT_HTML5, 'UTF-8');
                return isset($fancy_spaces[$character]) ? $fancy_spaces[$character] : $match[0];
            },
            $html
        );
        if ($without_space_entities !== null) {
            $html = $without_space_entities;
        }

        $html = preg_replace("/\s+/", " ", $html); // collapse runs of whitespace into one space
        if ($debug) { echo "DEBUG: After removing fancy spaces.\n"; }
    }

    if (!$remove_empy_td) {
        // Put a placeholder into empty cells so the empty-tag cleanup below
        // keeps them; the placeholder is stripped again after serialisation.
        $html = preg_replace('/> ?<\/(td|th)>/', '>~~..~~</$1>', $html);
    }

    // NOTE(review): when $convert_chars_to_entities is false this parse gets no
    // encoding hint, so non-ASCII input may be misread -- confirm before
    // disabling that option.
    $dom = new DOMDocument();
    $dom->loadHTML($html, $parse_flags);
    libxml_clear_errors();
    $xpath = new DOMXPath($dom);

    // Strip every attribute that is not on the allow-list. Walk backwards
    // because removing an attribute shifts the ones after it.
    foreach ($xpath->query("//*") as $element) {
        for ($i = $element->attributes->length - 1; $i >= 0; $i--) {
            $name = $element->attributes->item($i)->name;
            if (!in_array($name, $allowed_attribute, true)) {
                $element->removeAttribute($name);
            }
        }
    }

    // Turns a list of tag names into an XPath union such as "//style | //script".
    $build_query = static function (array $tags) {
        $parts = [];
        foreach ($tags as $tag) {
            $parts[] = "//$tag";
        }
        return implode(" | ", $parts);
    };

    // First, drop the tags that go away together with their content.
    $nuke_query = $build_query($tags_to_remove_with_content);
    if ($debug) { echo "DEBUG: XPath query for nuking tags: " . $nuke_query . "\n"; }
    if ($nuke_query !== '') { // an empty tag list would be an invalid (empty) query
        foreach ($xpath->query($nuke_query) as $doomed) {
            if ($doomed->parentNode !== null) {
                $doomed->parentNode->removeChild($doomed);
            }
        }
    }

    // Then unwrap the wrapper tags: move their children in front of them and
    // delete the now-empty wrapper.
    $remove_query = $build_query($tags_to_remove_and_keep_content);
    if ($debug) { echo "DEBUG: XPath query for removing wrapper tags: " . $remove_query . "\n"; }
    if ($remove_query !== '') {
        foreach ($xpath->query($remove_query) as $wrapper) {
            if ($wrapper->parentNode === null) {
                continue;
            }
            while ($wrapper->hasChildNodes()) {
                $child = $wrapper->removeChild($wrapper->firstChild);
                $wrapper->parentNode->insertBefore($child, $wrapper);
            }
            $wrapper->parentNode->removeChild($wrapper);
        }
    }

    // Remove elements that have no children, no attributes and no visible text
    // (<br> excepted). Repeat, because removing a node can leave its parent empty.
    $empty_query = "//*[not(*) and not(@*) and not(text()[normalize-space()]) and not(self::br)]";
    while (($node_list = $xpath->query($empty_query)) && $node_list->length) {
        foreach ($node_list as $node) {
            $node->parentNode->removeChild($node);
        }
    }

    // Remove all comments.
    foreach ($xpath->query('//comment()') as $comment_node) {
        $comment_node->parentNode->removeChild($comment_node);
    }

    // Typographic quotes and dashes are normalised on the DOM, not on the
    // serialised string: there a replaced quote could land inside an
    // attribute value as a raw " and break the markup. In the DOM the
    // characters are already decoded and the serialiser escapes them.
    $char_map = [];
    if ($remove_fancy_quotes) {
        $char_map += [
            "\u{FFFD}" => "'", // replacement character (kept from the original list)
            "\u{2018}" => "'", // left single quotation mark
            "\u{2019}" => "'", // right single quotation mark
            "\u{2032}" => "'", // prime
            "`"        => "'",
            "\u{201C}" => '"', // left double quotation mark
            "\u{201D}" => '"', // right double quotation mark
            "\u{2033}" => '"', // double prime
        ];
    }
    if ($remove_fancy_dashes) {
        $char_map += [
            "\u{2010}" => "-", // hyphen
            "\u{2011}" => "-", // non-breaking hyphen
            "\u{2012}" => "-", // figure dash
            "\u{2013}" => "-", // en dash
            "\u{2014}" => "-", // em dash
            "\u{2015}" => "-", // horizontal bar
        ];
    }
    if ($char_map) {
        $straighten = static function ($text) use ($char_map, $remove_fancy_quotes) {
            $text = strtr($text, $char_map);
            // Two consecutive single quotes become one double quote, as before.
            return $remove_fancy_quotes ? str_replace("''", '"', $text) : $text;
        };

        foreach ($xpath->query('//text()') as $text_node) {
            $fixed = $straighten($text_node->nodeValue);
            if ($fixed !== $text_node->nodeValue) {
                $text_node->nodeValue = $fixed;
            }
        }

        foreach ($xpath->query('//*[@*]') as $element) {
            for ($i = $element->attributes->length - 1; $i >= 0; $i--) {
                $attribute = $element->attributes->item($i);
                // URLs are left alone: rewriting characters would change the address.
                if ($attribute->name === 'href' || $attribute->name === 'src') {
                    continue;
                }
                $fixed = $straighten($attribute->value);
                if ($fixed !== $attribute->value) {
                    // setAttribute() stores the value literally.
                    $element->setAttribute($attribute->name, $fixed);
                }
            }
        }
    }

    $clean = $dom->saveHTML();
    if ($debug) { echo "DEBUG: After DOM processing and tag removal:\n" . $debug_excerpt($clean) . "...\n\n"; }

    if (!$remove_empy_td) {
        // Take the empty-cell placeholders back out.
        $clean = str_replace("~~..~~", "", $clean);
    }

    if ($decode_safelinks) {
        $clean = decodeSafeLinks($clean);
    }

    if ($debug) {
        echo "DEBUG: Final clean before beautify:\n" . $debug_excerpt($clean) . "...\n\n";
        echo "</pre>";
    }

    // Beautify once and reuse the result for both the textarea and the preview.
    $pretty = beautify_html($clean, $html_fragment);

    // The textarea shows the escaped source for copy and paste; the preview
    // below it renders the cleaned markup as-is (tables get a border so that
    // their structure is visible).
    echo "
<textarea rows=\"10\" cols=\"160\">" . htmlspecialchars($pretty, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . "</textarea>
<br><br><button id='copyButton'>Copy to clipboard</button>
<br><br><hr><br><h1>Cleaned Code Preview</h1><br><hr><br>\n" . str_ireplace("<table>", "<table border='2'>", $pretty) . "\n
<script>
// Select the button
var button = document.querySelector('#copyButton');
// Add a click event listener to the button
button.addEventListener('click', function() {
// Select the textarea
var textarea = document.querySelector('textarea');
// Copy the contents of the textarea to the clipboard
textarea.select();
document.execCommand('copy');
// Optional: visual feedback
button.textContent = 'Copied!';
setTimeout(() => { button.textContent = 'Copy to clipboard'; }, 2000);
});
</script>";
}
/**
 * Replaces quoted Microsoft SafeLinks URLs with the URL they wrap.
 *
 * A SafeLinks URL has the form
 * https://<region>.safelinks.protection.outlook.com/?url=<encoded target>&data=...
 * Every quoted occurrence is replaced by the decoded target in double quotes.
 *
 * @param string $html Markup that may contain SafeLinks URLs.
 * @return string The markup with SafeLinks URLs unwrapped; the input is
 *                returned unchanged if the regex engine reports an error.
 */
function decodeSafeLinks($html)
{
    // - Any region prefix is accepted (nam12, eur03, apc01, ...), not only "nam".
    // - The captured target must not contain a quote, so a match can never run
    //   past the end of the attribute it started in.
    $pattern = '/["\']https:\/\/[a-z0-9-]+\.safelinks\.protection\.outlook\.com\/\?url=([^&"\']+)&[^"\']*["\']/i';

    $decoded = preg_replace_callback(
        $pattern,
        static function (array $matches) {
            // The decoded target goes back into an attribute value, so it has
            // to be HTML-escaped: a decoded " or & would otherwise break, or
            // inject into, the surrounding markup.
            $target = urldecode($matches[1]);
            return '"' . htmlspecialchars($target, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . '"';
        },
        $html
    );

    // preg_replace_callback() returns null on a PCRE error; keep the input then.
    return $decoded === null ? $html : $decoded;
}
/**
 * Pretty-prints HTML with the tidy extension.
 *
 * @param string $html          Markup to format.
 * @param string $html_fragment 'fragment' to output only the body contents;
 *                              any other value formats a full document.
 * @return string The formatted markup, or the input unchanged when the tidy
 *                extension is not loaded or tidy cannot parse the input.
 */
function beautify_html($html, $html_fragment)
{
    // The debug flag lives in the global scope; treat "not set" as off.
    $debug = !empty($GLOBALS['debug']);

    $options = [
        'indent' => true,
        'indent-spaces' => 2,
        'clean' => true,
        'output-xhtml' => true,
        'hide-comments' => true,
        'wrap' => 0, // 0 disables line wrapping
    ];

    if ($html_fragment === 'fragment') {
        if ($debug) { echo "DEBUG: Beautifying as fragment (show-body-only).\n"; }
        $options['show-body-only'] = true;
    } else {
        if ($debug) { echo "DEBUG: Beautifying as full document.\n"; }
    }

    // Formatting is cosmetic: without the tidy extension, return the cleaned
    // markup as it is instead of dying with "Class tidy not found".
    if (!class_exists('tidy')) {
        if ($debug) { echo "DEBUG: tidy extension not available; returning unformatted HTML.\n"; }
        return $html;
    }

    $tidy = new tidy();
    if (!$tidy->parseString($html, $options, 'utf8')) {
        if ($debug) { echo "DEBUG: tidy could not parse the input; returning unformatted HTML.\n"; }
        return $html;
    }
    $tidy->cleanRepair();

    return (string) $tidy;
}
| ?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment