Last active
September 16, 2024 18:41
-
-
Save lyquix-owner/9dd5eee80b8aaee2bd968e3a48641909 to your computer and use it in GitHub Desktop.
PHP script to automatically clean dirty HTML. Removes unnecessary attributes (e.g. style, id, dir), replaces deprecated tags with valid ones (e.g. <b> to <strong>), and strips undesirable tags (e.g. <font>). We have used this script to safely clean hundreds of blog posts that were littered with inline styling.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Configuration for the HTML cleaner, plus a sample fragment to run it on.

// Tags to rename, as: tag to be replaced => replacement tag.
$replace_tags = [
    'i' => 'em',
    'b' => 'strong',
];

// Tags to strip. Only the tag itself is removed; its text and child tags are preserved.
// 'b' and 'i' also appear in $replace_tags, so they are normally renamed before the
// strip pass runs; they are kept here so the list is complete when used on its own.
$remove_tags = [
    'acronym',
    'applet',
    'b',
    'basefont',
    'big',
    'bgsound',
    'blink',
    'center',
    'del',
    'dir',
    'font',
    'frame',
    'frameset',
    'hgroup',
    'i',
    'ins',
    'kbd',
    'marquee',
    'nobr',
    'noframes',
    'plaintext',
    'samp',
    'small',
    'span',
    'strike',
    'tt',
    'u',
    'var',
];

// Attributes to remove. Applied to every tag.
$remove_attribs = [
    'class',
    'style',
    'lang',
    'width',
    'height',
    'align',
    'hspace',
    'vspace',
    'dir',
];

// Your HTML code (sample input; the stray comma that used to sit between the
// class and style attributes has been removed so the markup is well-formed).
$html = '<p class="large-font" style="color: red"><b>Hello</b> <span style="margin-left: 1em">world!</span><br>How are you doing?</p>';
/**
 * Renames tags in an HTML fragment.
 *
 * The fragment is first round-tripped through DOMDocument, and the tags are
 * then renamed with plain string replacement on the re-serialized markup.
 *
 * @param string $html HTML fragment to process; it is parsed as UTF-8.
 * @param array  $tags Map of tag name to be replaced => replacement tag name.
 *
 * @return string The re-serialized fragment with the listed tags renamed.
 */
function replaceTags($html, $tags) {
    // Wrap the fragment in a <div> so it can be read back from DOMDocument
    // without the <html>, <head> and <body> tags the parser adds.
    // The <meta> prefix declares the input as UTF-8; without a declared charset
    // the parser treats the bytes as ISO-8859-1 and garbles multi-byte characters.
    $wrapped = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"><div>' . $html . '</div>';

    // Collect parser complaints internally instead of raising PHP warnings,
    // then restore whatever error mode the caller had.
    $previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument;
    $dom->loadHTML($wrapped);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    // Serialize the wrapper and cut off its own "<div>" (5 chars) and "</div>" (6 chars).
    $html = substr($dom->saveHTML($dom->getElementsByTagName('div')->item(0)), 5, -6);

    // Rename each tag: bare opening tag, opening tag followed by attributes, closing tag.
    foreach ($tags as $search => $replace) {
        $html = str_replace(
            ['<' . $search . '>', '<' . $search . ' ', '</' . $search . '>'],
            ['<' . $replace . '>', '<' . $replace . ' ', '</' . $replace . '>'],
            $html
        );
    }

    return $html;
}
/**
 * Strips the given tags from an HTML fragment, keeping their contents.
 *
 * Each matching element is unwrapped in the DOM: its children are moved in
 * front of it and the now-empty element is removed. Because the element is
 * dropped as a whole, its attributes disappear with it, and text that merely
 * looks like a tag (inside comments or script bodies) is left untouched.
 *
 * @param string $html HTML fragment to process; it is parsed as UTF-8.
 * @param array  $tags List of tag names to strip.
 *
 * @return string The re-serialized fragment without the listed tags.
 */
function stripTags($html, $tags) {
    // Wrap the fragment in a <div> so it can be read back without the <html>,
    // <head> and <body> tags the parser adds. The <meta> prefix declares the
    // input as UTF-8; without it multi-byte characters are garbled.
    $wrapped = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"><div>' . $html . '</div>';

    // Collect parser complaints internally instead of raising PHP warnings,
    // then restore whatever error mode the caller had.
    $previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument;
    $dom->loadHTML($wrapped);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    $wrapper = $dom->getElementsByTagName('div')->item(0);

    foreach ($tags as $tag) {
        // Search below the wrapper only, so the wrapper itself is never unwrapped.
        // The node list is live, so take a snapshot before modifying the tree.
        $matches = iterator_to_array($wrapper->getElementsByTagName($tag), false);

        foreach ($matches as $node) {
            // Move every child in front of the element, preserving order...
            while ($node->firstChild !== null) {
                $node->parentNode->insertBefore($node->firstChild, $node);
            }
            // ...then drop the emptied element.
            $node->parentNode->removeChild($node);
        }
    }

    // Serialize the wrapper and cut off its own "<div>" (5 chars) and "</div>" (6 chars).
    return substr($dom->saveHTML($wrapper), 5, -6);
}
/**
 * Removes the given attributes from every tag in an HTML fragment.
 *
 * @param string $html    HTML fragment to process; it is parsed as UTF-8.
 * @param array  $attribs List of attribute names to remove.
 *
 * @return string The re-serialized fragment without the listed attributes.
 */
function stripAttributes($html, $attribs) {
    // Wrap the fragment in a <div> so it can be read back without the <html>,
    // <head> and <body> tags the parser adds. The <meta> prefix declares the
    // input as UTF-8; without it multi-byte characters are garbled.
    $wrapped = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"><div>' . $html . '</div>';

    // Collect parser complaints internally instead of raising PHP warnings,
    // then restore whatever error mode the caller had.
    $previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument;
    $dom->loadHTML($wrapped);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    $wrapper = $dom->getElementsByTagName('div')->item(0);

    // Visit every element below the wrapper and drop the unwanted attributes.
    // Checking each element directly avoids building an XPath expression out of
    // the attribute names, so an unusual name cannot produce a broken query.
    foreach ($wrapper->getElementsByTagName('*') as $node) {
        foreach ($attribs as $attrib) {
            if ($node->hasAttribute($attrib)) {
                $node->removeAttribute($attrib);
            }
        }
    }

    // Serialize the wrapper and cut off its own "<div>" (5 chars) and "</div>" (6 chars).
    return substr($dom->saveHTML($wrapper), 5, -6);
}
// Run the three cleaning passes in order: rename deprecated tags, strip the
// unwanted tags, then strip the unwanted attributes. Print the cleaned markup.
$html = stripAttributes(
    stripTags(
        replaceTags($html, $replace_tags),
        $remove_tags
    ),
    $remove_attribs
);
echo $html;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
On line 55, you should remove the , (comma) in the example before the style attribute
<p class="large-font", style="color ...
in the string to clean. Otherwise it really works great!