Skip to content

Instantly share code, notes, and snippets.

@lyquix-owner
Last active September 16, 2024 18:41
Show Gist options
  • Save lyquix-owner/9dd5eee80b8aaee2bd968e3a48641909 to your computer and use it in GitHub Desktop.
Save lyquix-owner/9dd5eee80b8aaee2bd968e3a48641909 to your computer and use it in GitHub Desktop.
PHP script to automatically clean dirty HTML. It removes unnecessary attributes (e.g. style, class, dir), replaces deprecated tags with valid ones (e.g. <b> to <strong>), and strips undesirable tags (e.g. <font>) while keeping their content. We have used this script to safely clean hundreds of blog posts that were littered with inline styling.
<?php
// Tags to be renamed, as a map of tag name => replacement tag name.
$replace_tags = [
    'i' => 'em',
    'b' => 'strong',
];

// Tags to be stripped. Text and children tags will be preserved.
// 'b' and 'i' are also keys of $replace_tags: this script calls replaceTags()
// before stripTags(), so by then those tags have already been renamed.
$remove_tags = [
    'acronym',
    'applet',
    'b',
    'basefont',
    'big',
    'bgsound',
    'blink',
    'center',
    'del',
    'dir',
    'font',
    'frame',
    'frameset',
    'hgroup',
    'i',
    'ins',
    'kbd',
    'marquee',
    'nobr',
    'noframes',
    'plaintext',
    'samp',
    'small',
    'span',
    'strike',
    'tt',
    'u',
    'var',
];

// Attributes to remove. Applied to all tags.
$remove_attribs = [
    'class',
    'style',
    'lang',
    'width',
    'height',
    'align',
    'hspace',
    'vspace',
    'dir',
];

// Your HTML code. Attributes are separated by whitespace only; the original
// sample had a stray comma after class="large-font", which made it malformed.
$html = '<p class="large-font" style="color: red"><b>Hello</b> <span style="margin-left: 1em">world!</span><br>How are you doing?</p>';
/**
 * Renames HTML tags (e.g. <b> to <strong>), keeping their attributes and content.
 *
 * The fragment is first round-tripped through DOMDocument to clean the markup;
 * the renaming itself is a plain string replace on the serialised result, so
 * tag names are matched exactly as they appear in that serialised output.
 *
 * @param string $html HTML fragment to process, treated as UTF-8.
 * @param array  $tags Map of tag name to look for => replacement tag name.
 *
 * @return string The cleaned fragment with the tags renamed.
 */
function replaceTags($html, $tags) {
    // Collect libxml parse errors internally instead of raising a PHP warning
    // for each piece of invalid markup; the caller's setting is restored below.
    $previousErrorMode = libxml_use_internal_errors(true);

    $dom = new DOMDocument();
    // The <meta> declares the input as UTF-8 so non-ASCII text is not decoded
    // with a single-byte fallback encoding. The <div> wrapper is a known
    // container to read the fragment back from, without the <html>, <head>
    // and <body> tags DOMDocument adds.
    $dom->loadHTML(
        '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
        . '<div>' . $html . '</div>'
    );

    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    // Serialise the wrapper's children one by one. Slicing fixed offsets off
    // the serialised wrapper is only correct when it is written out as exactly
    // "<div>" ... "</div>" with nothing around those tags.
    $html = '';
    $wrapper = $dom->getElementsByTagName('div')->item(0);
    if ($wrapper !== null) {
        foreach ($wrapper->childNodes as $child) {
            $html .= $dom->saveHTML($child);
        }
    }

    // Use simple string replace to rename the tags. Opening tags are matched
    // both bare ("<b>") and with attributes ("<b "), so "<b" never matches "<br>".
    foreach ($tags as $search => $replace) {
        $html = str_replace('<' . $search . '>', '<' . $replace . '>', $html);
        $html = str_replace('<' . $search . ' ', '<' . $replace . ' ', $html);
        $html = str_replace('</' . $search . '>', '</' . $replace . '>', $html);
    }

    return $html;
}
/**
 * Strips the given tags from an HTML fragment, keeping their text and children.
 *
 * @param string $html HTML fragment to process, treated as UTF-8.
 * @param array  $tags List of tag names to strip.
 *
 * @return string The fragment without the listed tags.
 */
function stripTags($html, $tags) {
    // Collect libxml parse errors internally instead of raising a PHP warning
    // for each piece of invalid markup; the caller's setting is restored below.
    $previousErrorMode = libxml_use_internal_errors(true);

    $dom = new DOMDocument();
    // The <meta> declares the input as UTF-8 so non-ASCII text is not decoded
    // with a single-byte fallback encoding. The <div> wrapper is a known
    // container to read the fragment back from.
    $dom->loadHTML(
        '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
        . '<div>' . $html . '</div>'
    );

    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    $wrapper = $dom->getElementsByTagName('div')->item(0);
    if ($wrapper === null) {
        return '';
    }

    // Remove every attribute from the tags to be stripped, so each opening tag
    // serialises as a bare "<tag>" that the string replace below can match.
    // Only the wrapper's descendants are searched, never the wrapper itself.
    foreach ($tags as $tag) {
        foreach ($wrapper->getElementsByTagName($tag) as $node) {
            while ($node->attributes->length) {
                $node->removeAttribute($node->attributes->item(0)->name);
            }
        }
    }

    // Serialise the wrapper's children one by one instead of slicing fixed
    // offsets off the serialised wrapper.
    $html = '';
    foreach ($wrapper->childNodes as $child) {
        $html .= $dom->saveHTML($child);
    }

    // Strip the opening and closing tags using string replace.
    foreach ($tags as $tag) {
        $html = str_replace(['<' . $tag . '>', '</' . $tag . '>'], '', $html);
    }

    return $html;
}
/**
 * Removes the given attributes from every tag of an HTML fragment.
 *
 * @param string $html    HTML fragment to process, treated as UTF-8.
 * @param array  $attribs List of attribute names to remove.
 *
 * @return string The fragment without the listed attributes.
 */
function stripAttributes($html, $attribs) {
    // Collect libxml parse errors internally instead of raising a PHP warning
    // for each piece of invalid markup; the caller's setting is restored below.
    $previousErrorMode = libxml_use_internal_errors(true);

    $dom = new DOMDocument();
    // The <meta> declares the input as UTF-8 so non-ASCII text is not decoded
    // with a single-byte fallback encoding. The <div> wrapper is a known
    // container to read the fragment back from.
    $dom->loadHTML(
        '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
        . '<div>' . $html . '</div>'
    );

    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    $wrapper = $dom->getElementsByTagName('div')->item(0);
    if ($wrapper === null) {
        return '';
    }

    // Find all nodes inside the wrapper that carry the attribute and remove it.
    $xPath = new DOMXPath($dom);
    foreach ($attribs as $attrib) {
        $nodes = $xPath->query('.//*[@' . $attrib . ']', $wrapper);
        // query() returns false when the name does not form a valid expression.
        if ($nodes === false) {
            continue;
        }
        foreach ($nodes as $node) {
            $node->removeAttribute($attrib);
        }
    }

    // Serialise the wrapper's children one by one instead of slicing fixed
    // offsets off the serialised wrapper.
    $html = '';
    foreach ($wrapper->childNodes as $child) {
        $html .= $dom->saveHTML($child);
    }

    return $html;
}
// Run the clean-up pipeline on the sample markup: rename tags first, then
// strip the unwanted tags, and finally drop the unwanted attributes.
$html = stripAttributes(
    stripTags(
        replaceTags($html, $replace_tags),
        $remove_tags
    ),
    $remove_attribs
);
// Output the cleaned HTML.
print $html;
@ndeblauw
Copy link

On line 55, you should remove the , (comma) in the example before the style attribute <p class="large-font", style="color ... in the string to clean. Otherwise it really works great!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment