Skip to content

Instantly share code, notes, and snippets.

@greg-randall
Last active July 1, 2025 14:23
Show Gist options
  • Select an option

  • Save greg-randall/05adf2268c82c89543c159bc2742fce7 to your computer and use it in GitHub Desktop.

Select an option

Save greg-randall/05adf2268c82c89543c159bc2742fce7 to your computer and use it in GitHub Desktop.
HTML Cleaner. Paste dirty html into a field, and it removes nearly all HTML attributes (except the ones you want -- src, href, alt, and a couple others), and formats. Using proper DOMDocument PHP parser.
<?php
/*
Note that the cleaner formats the cleaned HTML with PHP's tidy extension (see beautify_html at the bottom of this file).
Example input:
<div class=WordSection1>
<p class=MsoNormal align=center style='text-align:center'><span
style='font-size:16.0pt;line-height:107%;font-family:"Abadi Extra Light",sans-serif'>Test
Clean<o:p></o:p></span></p>
<p class=MsoNormal><span style='font-size:12.0pt;line-height:107%;mso-bidi-font-family:
Calibri;mso-bidi-theme-font:minor-latin'>Test Paragraph, qwerty <span
class=SpellE>qwerty</span> <span class=SpellE>qwerty</span> <span class=SpellE>qwerty</span>
<span class=SpellE>qwerty</span> <span class=SpellE>qwerty</span> <span
class=SpellE>qwerty</span> <span class=SpellE>qwerty</span> <span class=SpellE>qwerty</span>.<o:p></o:p></span></p>
<p class=MsoNormal><o:p>&nbsp;</o:p></p>
<p class=MsoNormal><o:p>&nbsp;</o:p></p>
</div>
Example Output:
<p>Test Clean</p>
<p>Test Paragraph, qwerty qwerty qwerty qwerty qwerty qwerty qwerty qwerty qwerty.</p>
*/
/* configuration */

// Set to true to print DEBUG lines while the input is processed.
$debug = false;

// Attributes that are kept on elements (e.g. the href in <a href="www.asdf.com">);
// every attribute not named here is stripped.
$allowed_attribute = ['content', 'http-equiv', 'src', 'href', 'alt', 'colspan', 'rowspan', 'id'];

// Wrapper tags: the tag itself is removed, its children are kept in place.
$tags_to_remove_and_keep_content = ['div', 'span', 'figure', 'font', 'section', 'aside', 'article'];

// Tags that are removed together with everything inside them.
$tags_to_remove_with_content = ['style', 'script', 'link'];

// Replace curly quotes and similar characters with plain ' and ".
$remove_fancy_quotes = true;

// Replace &nbsp;, &thinsp; and other special spaces with a regular space.
$remove_fancy_spaces = true;

// Replace em dashes, en dashes and similar with a plain hyphen.
$remove_fancy_dashes = true;

// false: empty table cells are protected from the empty-tag sweep; true: they are removed with it.
$remove_empy_td = false;

// Round-trip the input through DOMDocument first so characters in text are written
// as HTML entities (e.g. & becomes &amp;).
$convert_chars_to_entities = true;

// Unwrap Microsoft SafeLinks URLs back to the original link.
$decode_safelinks = true;

/* end configuration */
// Open the debug <pre> wrapper; it is closed again by whichever branch renders the page.
if ($debug) {
    echo "<pre><strong>DEBUG MODE ON</strong>\n\n";
}

// Work out what, if anything, was submitted. After this section:
//   $html_to_process      - the raw HTML string to clean ('' when nothing usable was sent)
//   $is_processing_needed - true only when $html_to_process holds submitted content
$html_to_process = '';
$is_processing_needed = false;

// An uploaded file takes priority over the textarea.
if (isset($_FILES['htmlFile']) && $_FILES['htmlFile']['error'] === UPLOAD_ERR_OK) {
    if ($debug) { echo "DEBUG: File upload detected.\n"; }
    $uploaded_html = file_get_contents($_FILES['htmlFile']['tmp_name']);
    // file_get_contents() returns false on failure; never hand that on as if it were HTML.
    if ($uploaded_html !== false) {
        $html_to_process = $uploaded_html;
        $is_processing_needed = true;
    } elseif ($debug) {
        echo "DEBUG: Uploaded file could not be read.\n";
    }
}

// No usable upload: fall back to the textarea. is_string() rejects array input
// (e.g. input[]=...), which would otherwise reach the string functions further down.
if (!$is_processing_needed && isset($_POST["input"]) && is_string($_POST["input"]) && !empty($_POST["input"])) {
    if ($debug) { echo "DEBUG: Textarea input detected.\n"; }
    $html_to_process = $_POST["input"];
    $is_processing_needed = true;
}

if ($debug) {
    echo "DEBUG: is_processing_needed = " . ($is_processing_needed ? 'true' : 'false') . "\n";
    if ($is_processing_needed) {
        echo "DEBUG: Initial HTML to process:\n" . htmlspecialchars(substr($html_to_process, 0, 1000)) . "...\n\n";
    }
}
if (!$is_processing_needed) { // Nothing was submitted: show the upload/paste form and stop.
?>
Clean:
<form action="index.php" method="post" enctype="multipart/form-data">
Select HTML file to upload:
<input type="file" name="htmlFile" id="htmlFile">
<br><br>
Or paste HTML code:
<br>
<textarea name="input" rows="50" cols="160"></textarea><br><br>
<input type="submit" value="Clean HTML" name="submit">
</form>
<?php
    if ($debug) { echo "</pre>"; }
    exit();
} else { // Content was submitted: clean it and render the result.
    // DOMDocument is noisy on real-world markup, so only fatal and parse errors are reported.
    error_reporting(E_ERROR | E_PARSE);
    // Let libxml collect its parser complaints internally instead of silencing each call with "@".
    libxml_use_internal_errors(true);

    $html = $html_to_process;

    // Input containing both <html and <body is treated as a whole document, anything else
    // as a fragment; beautify_html() uses this to decide whether to print only the body.
    $html_fragment = (stripos($html, "<html") !== false && stripos($html, "<body") !== false)
        ? "full"
        : "fragment";
    if ($debug) { echo "DEBUG: html_fragment set to: $html_fragment\n"; }

    // Prefixing this processing instruction makes libxml read the markup as UTF-8.
    $encoding_hint = '<?xml encoding="UTF-8">';
    $libxml_flags = LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD;

    if ($convert_chars_to_entities) {
        // Round-trip through DOMDocument so characters in the text come back as HTML entities.
        $doc = new DOMDocument();
        $doc->loadHTML($encoding_hint . $html, $libxml_flags);
        libxml_clear_errors();
        $html = $doc->saveHTML();
        if ($debug) { echo "DEBUG: After loading into DOMDocument and saving:\n" . htmlspecialchars(substr($html, 0, 1000)) . "...\n\n"; }
    }

    if ($remove_fancy_spaces) {
        // Literal characters are spelled as UTF-8 byte escapes so the invisible ones cannot be
        // lost or normalised by an editor or a copy/paste. Matching is case-sensitive because
        // entity names are case-sensitive.
        $fancy_spaces = [
            "\xE2\x80\x80", "&#8192;",             // U+2000 en quad
            "\xE2\x80\x81", "&#8193;",             // U+2001 em quad
            "\xE2\x80\x82", "&#8194;", "&ensp;",   // U+2002 en space
            "\xE2\x80\x83", "&#8195;", "&emsp;",   // U+2003 em space
            "\xE2\x80\x84", "&#8196;",             // U+2004 three-per-em space
            "\xE2\x80\x85", "&#8197;",             // U+2005 four-per-em space
            "\xE2\x80\x86", "&#8198;",             // U+2006 six-per-em space
            "\xE2\x80\x87", "&#8199;",             // U+2007 figure space
            "\xE2\x80\x88", "&#8200;",             // U+2008 punctuation space
            "\xE2\x80\x89", "&#8201;", "&thinsp;", // U+2009 thin space
            "\xE2\x80\x8A", "&#8202;",             // U+200A hair space
            "\xE2\x80\x8B", "&#8203;",             // U+200B zero width space
            "\xC2\xA0", "&#160;", "&nbsp;",        // U+00A0 no-break space
        ];
        $html = str_replace($fancy_spaces, " ", $html);
        $html = preg_replace("/\s+/", " ", $html); // collapse runs of whitespace into one space
        if ($debug) { echo "DEBUG: After removing fancy spaces.\n"; }
    }

    if (!$remove_empy_td) {
        // Put a placeholder into empty cells so the empty-element sweep below leaves them alone;
        // the placeholder is stripped again after serialization.
        $html = preg_replace("/\> ?<\/td>/", ">~~..~~</td>", $html);
        $html = preg_replace("/\> ?<\/th>/", ">~~..~~</th>", $html);
    }

    $dom = new DOMDocument();
    // If the entity pass above was skipped, $html is still raw text and needs the UTF-8 hint
    // here; otherwise libxml would guess the encoding and mangle non-ASCII characters.
    $dom->loadHTML(($convert_chars_to_entities ? '' : $encoding_hint) . $html, $libxml_flags);
    libxml_clear_errors();
    $xpath = new DOMXPath($dom);

    // Drop processing instructions, which includes the encoding hint itself, so the hint
    // never shows up in the cleaned output.
    foreach ($xpath->query('//processing-instruction()') as $instruction) {
        $instruction->parentNode->removeChild($instruction);
    }

    // Strip every attribute that is not on the allow-list. Attributes are walked backwards
    // because removing one shifts the indexes of those after it.
    foreach ($xpath->query("//*") as $element) {
        for ($i = $element->attributes->length - 1; $i >= 0; $i--) {
            $name = $element->attributes->item($i)->name;
            if (!in_array($name, $allowed_attribute, true)) {
                $element->removeAttribute($name);
            }
        }
    }

    // First, remove the tags that go away together with their content.
    $xpath_tags_to_nuke = [];
    foreach ($tags_to_remove_with_content as $tag) {
        $xpath_tags_to_nuke[] = "//$tag";
    }
    $nuke_query = implode(" | ", $xpath_tags_to_nuke);
    if ($debug) { echo "DEBUG: XPath query for nuking tags: " . $nuke_query . "\n"; }
    if ($nuke_query !== '') { // an empty tag list would otherwise produce an invalid XPath expression
        foreach ($xpath->query($nuke_query) as $remove) {
            $remove->parentNode->removeChild($remove);
        }
    }

    // Then unwrap the wrapper tags, keeping their content.
    $xpath_tags_to_remove = [];
    foreach ($tags_to_remove_and_keep_content as $tag) {
        $xpath_tags_to_remove[] = "//$tag";
    }
    $remove_query = implode(" | ", $xpath_tags_to_remove);
    if ($debug) { echo "DEBUG: XPath query for removing wrapper tags: " . $remove_query . "\n"; }
    if ($remove_query !== '') {
        foreach ($xpath->query($remove_query) as $remove) {
            // Move each child in front of the wrapper, then delete the now-empty wrapper.
            while ($remove->hasChildNodes()) {
                $child = $remove->removeChild($remove->firstChild);
                $remove->parentNode->insertBefore($child, $remove);
            }
            $remove->parentNode->removeChild($remove);
        }
    }

    // Remove elements with no child elements, no attributes and no non-blank text. The query is
    // repeated because removing a node can leave its parent empty. br and hr are legitimately
    // empty, so they are exempt.
    $empty_query = "//*[not(*) and not(@*) and not(text()[normalize-space()]) and not(self::br) and not(self::hr)]";
    while (($node_list = $xpath->query($empty_query)) && $node_list->length) {
        foreach ($node_list as $node) {
            $node->parentNode->removeChild($node);
        }
    }

    // Remove all comments.
    foreach ($xpath->query('//comment()') as $commentNode) {
        $commentNode->parentNode->removeChild($commentNode);
    }

    $clean = $dom->saveHTML();
    if ($debug) { echo "DEBUG: After DOM processing and tag removal:\n" . htmlspecialchars(substr($clean, 0, 1000)) . "...\n\n"; }

    // NOTE(review): the replacements below run over the serialized markup, attribute values
    // included, so a quote entity inside an attribute value (e.g. curly quotes in alt text)
    // is turned into a bare quote character and can end the attribute early. Doing this on
    // DOM text nodes instead would avoid that.
    if ($remove_fancy_quotes) {
        // Case-sensitive on purpose: &prime; (single) and &Prime; (double) differ only by case.
        $clean = str_replace(
            [
                "&iuml;&iquest;&frac12;",
                "&lsquo;", "&rsquo;", "&#8216;", "&#8217;", "&apos;", "&prime;", "&#8242;",
                "\xE2\x80\x99", // U+2019 right single quotation mark
                "\xE2\x80\x98", // U+2018 left single quotation mark
                "`",
            ],
            "'",
            $clean
        );
        $clean = str_replace(
            [
                "&ldquo;", "&rdquo;", "&#8220;", "&#8221;", "&quot;", "&Prime;", "&#8243;",
                "\xE2\x80\x9D", // U+201D right double quotation mark
                "\xE2\x80\x9C", // U+201C left double quotation mark
                "''",
            ],
            '"',
            $clean
        );
    }

    if ($remove_fancy_dashes) {
        $clean = str_replace(
            [
                "\xE2\x80\x90", "&#8208;",                       // U+2010 hyphen
                "\xE2\x80\x91", "&#8209;",                       // U+2011 non-breaking hyphen
                "\xE2\x80\x92", "&#8210;",                       // U+2012 figure dash
                "\xE2\x80\x93", "&#8211;", "&ndash;",            // U+2013 en dash
                "\xE2\x80\x94", "&#8212;", "&mdash;",            // U+2014 em dash
                "\xE2\x80\x95", "&#8213;",                       // U+2015 horizontal bar
            ],
            "-",
            $clean
        );
    }

    if (!$remove_empy_td) {
        // Take the empty-cell placeholders back out.
        $clean = str_replace("~~..~~", "", $clean);
    }

    if ($decode_safelinks) {
        $clean = decodeSafeLinks($clean);
    }

    if ($debug) {
        echo "DEBUG: Final clean before beautify:\n" . htmlspecialchars(substr($clean, 0, 1000)) . "...\n\n";
        echo "</pre>";
    }

    // Format once and reuse the result for both the copyable textarea and the live preview.
    $pretty = beautify_html($clean, $html_fragment);

    // NOTE(review): the preview prints the cleaned markup unescaped. Script tags and event
    // attributes are gone by this point, but href/src values are not inspected.
    echo "
<textarea rows=\"10\" cols=\"160\">" . htmlspecialchars($pretty) . "</textarea>
<br><br><button id='copyButton'>Copy to clipboard</button>
<br><br><hr><br><h1>Cleaned Code Preview</h1><br><hr><br>\n" . str_ireplace("<table>", "<table border='2'>", $pretty) . "\n
<script>
// Select the button
var button = document.querySelector('#copyButton');
// Add a click event listener to the button
button.addEventListener('click', function() {
// Select the textarea
var textarea = document.querySelector('textarea');
// Copy the contents of the textarea to the clipboard
textarea.select();
document.execCommand('copy');
// Optional: visual feedback
button.textContent = 'Copied!';
setTimeout(() => { button.textContent = 'Copy to clipboard'; }, 2000);
});
</script>";
} // displays the cleaned html directly in the page for easy copy and paste
/**
 * Replace Microsoft SafeLinks wrapper URLs with the URL they wrap.
 *
 * Matches quoted strings of the form
 *   "https://<host>.safelinks.protection.outlook.com/?url=<percent-encoded target>&..."
 * where <host> is any regional prefix (e.g. nam12, eur01), and substitutes the decoded
 * target. The surrounding quote character is kept as it was, and the decoded URL is
 * HTML-escaped because it is written back into markup.
 *
 * @param string $html Serialized HTML to scan.
 * @return string The HTML with SafeLinks URLs unwrapped; the input unchanged if the regex engine fails.
 */
function decodeSafeLinks($html) {
    // Group 1: opening quote (the closing quote must be the same character).
    // Group 2: the percent-encoded target, i.e. everything after url= up to the next & or quote.
    $pattern = '~(["\'])https://[a-z0-9-]+\.safelinks\.protection\.outlook\.com/\?url=([^&"\']+)(?:&[^"\']*)?\1~i';

    $unwrap = function ($matches) {
        $target = urldecode($matches[2]);
        // The decoded URL may contain &, quotes or angle brackets; escape it for the attribute.
        return $matches[1] . htmlspecialchars($target, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . $matches[1];
    };

    $decoded = preg_replace_callback($pattern, $unwrap, $html);

    // preg_replace_callback() returns null on a PCRE error; keep the original markup in that case.
    return $decoded === null ? $html : $decoded;
}
/**
 * Pretty-print HTML with the tidy extension.
 *
 * @param string $html          The markup to format.
 * @param string $html_fragment 'fragment' to output only the body content; any other value
 *                              formats the markup as a full document.
 * @return string The formatted markup, or $html unchanged when the tidy extension is not
 *                loaded or tidy cannot parse the input.
 */
function beautify_html($html, $html_fragment)
{
    // The debug flag lives in the global configuration; treat it as off when it is not defined.
    $debug = !empty($GLOBALS['debug']);

    $options = [
        'indent' => true,
        'indent-spaces' => 2,
        'clean' => true,
        'output-xhtml' => true,
        'hide-comments' => true,
        'wrap' => 0, // 0 turns line wrapping off
    ];

    if ($html_fragment === 'fragment') {
        if ($debug) { echo "DEBUG: Beautifying as fragment (show-body-only).\n"; }
        $options['show-body-only'] = true;
    } else {
        if ($debug) { echo "DEBUG: Beautifying as full document.\n"; }
    }

    // Without the tidy extension, hand back the unformatted markup instead of dying on
    // an unknown class.
    if (!class_exists('tidy')) {
        if ($debug) { echo "DEBUG: tidy extension not available; returning HTML unformatted.\n"; }
        return (string)$html;
    }

    $tidy = new tidy();
    if (!$tidy->parseString($html, $options, 'utf8')) {
        return (string)$html;
    }
    $tidy->cleanRepair();

    return (string)$tidy;
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment