- 
      
- 
        Save vitqst/6e320ec60de881352f55a702aaef3e1a to your computer and use it in GitHub Desktop. 
    Sanitize HTML using PHP and the DOMDocument
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | <?php | |
/**
 * Sanitize HTML against an element whitelist.
 *
 * Every node whose name is not listed in $whitelist is removed, but its
 * (already cleaned) children are kept in its place. All attributes are
 * stripped from the elements that remain.
 *
 * The function works in two modes, selected by the type of $html:
 *  - string:   the markup is parsed with DOMDocument, cleaned, serialized, and
 *              the DOCTYPE plus any <html>/<body> tags are stripped from the
 *              result;
 *  - DOM node: the node is cleaned in place (this is also how the function
 *              recurses through the tree).
 *
 * Names are compared strictly against DOMNode::nodeName, so the whitelist must
 * contain plain strings such as 'p' or 'b'; include '#text' to keep text nodes.
 *
 * NOTE(review): DOMDocument::loadHTML() is documented to assume ISO-8859-1
 * unless the markup declares a charset, so UTF-8 input may need such a
 * declaration to round-trip correctly -- confirm against callers.
 *
 * @see https://github.com/alixaxel/phunction/blob/master/phunction/HTML.php
 * @param string|DOMNode $html      Markup to clean, or a DOM node to clean in place.
 * @param array          $whitelist Node names that are allowed to remain.
 * @return mixed String mode: the cleaned markup, '' for empty input, or null
 *               when the markup cannot be parsed. Node mode: the detached node
 *               when it was not whitelisted, otherwise null.
 */
function clean_html($html, array $whitelist)
{
    if (is_object($html)) {
        // Clean the children first, walking backwards so that nodes removed or
        // unwrapped at index $i never shift the indexes still to be visited.
        if ($html->hasChildNodes()) {
            for ($i = $html->childNodes->length - 1; $i >= 0; $i--) {
                clean_html($html->childNodes->item($i), $whitelist);
            }
        }

        // Strict comparison: a loose in_array() lets entries such as 0 or true
        // match every tag name on PHP < 8.
        if (!in_array($html->nodeName, $whitelist, true)) {
            $parent = $html->parentNode;

            // A detached node (or a document) has nothing to be unwrapped
            // into; it falls through and only loses its attributes.
            if ($parent !== null) {
                if (!$html->hasChildNodes()) {
                    // Nothing to keep: drop the node instead of replacing it
                    // with an empty document fragment.
                    return $parent->removeChild($html);
                }

                // Move the surviving children into a fragment and put the
                // fragment where the disallowed node used to be.
                $fragment = $html->ownerDocument->createDocumentFragment();

                while ($html->firstChild !== null) {
                    $fragment->appendChild($html->firstChild);
                }

                return $parent->replaceChild($fragment, $html);
            }
        }

        // Allowed node: keep it, but without any attribute.
        while ($html->hasAttributes()) {
            $html->removeAttributeNode($html->attributes->item(0));
        }

        return null;
    }

    // Nothing to parse; loadHTML() rejects an empty string.
    if ($html === null || $html === '') {
        return '';
    }

    // Collect parser diagnostics internally instead of emitting warnings for
    // sloppy markup, and remember the caller's setting so it can be restored.
    $previous = libxml_use_internal_errors(true);

    // loadHTML() is an instance method; calling it statically is an error on PHP 8.
    $dom = new DOMDocument();
    $result = null;

    if ($dom->loadHTML($html)) {
        if ($dom->documentElement !== null) {
            clean_html($dom->documentElement, $whitelist);
        }

        // Strip the DOCTYPE and any <html>/<body> wrapper tags from the output.
        $result = preg_replace('~<(?:!DOCTYPE|/?(?:html|body))[^>]*>\s*~i', '', $dom->saveHTML());
    }

    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    return $result;
}
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | <?php | |
// Node names that survive sanitizing; '#text' is the node name of text nodes.
$whitelist = [
    '#text',
    'h3', 'h4', 'h5', 'h6',
    'blockquote', 'q', 'p',
    'pre', 'code', // Code
    'ul', 'ol', 'li',
    'b', 'em', 'i', 'u', 'strike', 'sup', 'sub',
    // Note what has been removed below, since attributes are not allowed:
    //'a' => array('href', 'title'), 'img' => array('src', 'alt', 'title'),
];

// Sample markup mixing harmless content with two classic XSS vectors.
$string = <<<'END'
<div id="hello">
Hello World!
<div>
<p><span>text</span> goes here</p>
<a href="javascript:alert(document.location);">XSS</a> and normal text.
<b style="width: expression(alert(document.location));">XSS</b> is bad.
</div>
<br>
</div>
END;

echo clean_html($string, $whitelist);

/* Output:
Hello World!
<p>text goes here</p>
XSS and normal text.
<b>XSS</b> is bad.
*/
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment