Skip to content

Instantly share code, notes, and snippets.

@vitqst
Forked from xeoncross/clean_html.php
Created August 28, 2018 07:11
Show Gist options
  • Save vitqst/6e320ec60de881352f55a702aaef3e1a to your computer and use it in GitHub Desktop.
Save vitqst/6e320ec60de881352f55a702aaef3e1a to your computer and use it in GitHub Desktop.
Sanitize HTML using PHP's DOMDocument
<?php
/**
 * Clean an HTML string by stripping every attribute from every element and
 * unwrapping every node whose name is not in the provided whitelist (the node
 * itself is dropped, but its already-cleaned children are kept in its place).
 *
 * The function recurses over the parsed tree by calling itself with DOMNode
 * instances; external callers normally pass a string.
 *
 * NOTE(review): no charset hint is passed to loadHTML(), so non-ASCII input
 * may need an explicit encoding declaration - confirm against callers.
 *
 * @see https://github.com/alixaxel/phunction/blob/master/phunction/HTML.php
 * @param string|DOMNode $html      markup to clean, or a node when recursing
 * @param array          $whitelist allowed node names, e.g. array('#text', 'p');
 *                                  '#text' has to be listed for text to survive
 * @return string|DOMNode|null for string input: the cleaned markup, or null
 *                             when the input is empty or cannot be parsed;
 *                             for node input: the detached node when it was
 *                             unwrapped, otherwise null
 */
function clean_html($html, array $whitelist)
{
    if ($html instanceof DOMNode) {
        // Visit children last-to-first: unwrapping a child splices its own
        // children into this list, which only shifts the indexes after it.
        if ($html->hasChildNodes()) {
            for ($i = $html->childNodes->length - 1; $i >= 0; $i--) {
                clean_html($html->childNodes->item($i), $whitelist);
            }
        }

        // Strict comparison so a node name can only match an identical string.
        if (!in_array($html->nodeName, $whitelist, true)) {
            // Not allowed: replace the node with a fragment holding its
            // children. An empty fragment simply removes the node.
            $fragment = $html->ownerDocument->createDocumentFragment();
            while ($html->firstChild !== null) {
                $fragment->appendChild($html->firstChild);
            }

            return $html->parentNode->replaceChild($fragment, $html);
        }

        // Allowed node: keep it, but drop every attribute it carries.
        while ($html->hasAttributes()) {
            $html->removeAttributeNode($html->attributes->item(0));
        }

        return null;
    }

    // Nothing to parse. Also avoids handing loadHTML() an empty source,
    // which it rejects.
    if (!is_scalar($html) || (string) $html === '') {
        return null;
    }

    // Collect parser diagnostics internally instead of emitting warnings for
    // sloppy markup, then put the caller's libxml setting back afterwards.
    $previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $loaded = $dom->loadHTML((string) $html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    if (!$loaded) {
        return null;
    }

    // A document without a root element has nothing to clean.
    if ($dom->documentElement !== null) {
        clean_html($dom->documentElement, $whitelist);
    }

    // Strip the doctype and any html/body wrapper tags from the serialised output.
    return preg_replace('~<(?:!DOCTYPE|/?(?:html|body))[^>]*>\s*~i', '', $dom->saveHTML());
}
<?php
// Node names that are allowed to survive cleaning; every other node is
// unwrapped and every attribute is stripped.
$allowed = array(
    '#text',
    'h3', 'h4', 'h5', 'h6',
    'blockquote', 'q', 'p',
    'pre', 'code', // Code
    'ul', 'ol', 'li',
    'b', 'em', 'i', 'u', 'strike', 'sup', 'sub',
    // Links and images are deliberately left out, because attributes are
    // never kept and these tags are useless without them:
    //'a' => array('href', 'title'), 'img' => array('src', 'alt', 'title'),
);

// Sample input mixing harmless markup with two XSS attempts.
$sample = <<<END
<div id="hello">
Hello World!
<div>
<p><span>text</span> goes here</p>
<a href="javascript:alert(document.location);">XSS</a> and normal text.
<b style="width: expression(alert(document.location));">XSS</b> is bad.
</div>
<br>
</div>
END;

echo clean_html($sample, $allowed);

/* Expected output:
Hello World!
<p>text goes here</p>
XSS and normal text.
<b>XSS</b> is bad.
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment