Skip to content

Instantly share code, notes, and snippets.

@BeMoreDifferent
Created October 17, 2012 15:11
Show Gist options
  • Save BeMoreDifferent/3906071 to your computer and use it in GitHub Desktop.
Save BeMoreDifferent/3906071 to your computer and use it in GitHub Desktop.
small html cleaner class
<?php
/**
*
* @author: Daniel Albrecht [http://bemoredifferent.com]
* @abstract: Class to clean html tags and attributes based on DOMDocument
*
*/
/**
 * Small HTML cleaner built on DOMDocument.
 *
 * The markup is parsed once in the constructor. Afterwards the caller can
 * strip every element that is not whitelisted (removeTags()), strip every
 * attribute that is not whitelisted (removeAttributes()) and serialise the
 * result (returnHTML()).
 */
class filter{
    /** @var string Markup as handed to the parser (see prepareMarkup()). */
    private $html;
    /** @var DOMDocument Parsed document all methods operate on. */
    private $dom;
    /** @var mixed Not used by this class at present. */
    private $xpath;
    /** @var array Default tag whitelist used by removeTags(). */
    private $goodTags;
    /** @var array Distinct lower-cased tag names found right after parsing. */
    private $tagArray;
    /** @var mixed Never assigned by this class. */
    public $tagsInHTML;

    /**
     * Parses the given markup into an internal DOMDocument.
     *
     * @param string $html UTF-8 encoded HTML (fragment or full document).
     */
    public function __construct($html) {
        $this->goodTags = array('h1','h2','h3','h4','h5','h6','span','p','b','a','strong','blockquote','br','img','div','section','article','header','footer','html','body');
        $this->html = $this->prepareMarkup((string) $html);
        $this->dom = new DOMDocument();
        $this->dom->validateOnParse = true;
        $this->dom->preserveWhiteSpace = false;

        // loadHTML() rejects an empty string (warning on PHP 5/7, ValueError
        // on PHP 8), so an empty input simply leaves the document empty.
        if ($this->html !== '') {
            // Remember the caller's libxml error mode and restore it afterwards
            // instead of unconditionally switching it off.
            $previousMode = libxml_use_internal_errors(true);
            $this->dom->loadHTML($this->html);
            // Drop the parse errors buffered while internal errors were on.
            libxml_clear_errors();
            libxml_use_internal_errors($previousMode);
        }

        $this->tagArray = $this->getTagNames();
    }

    //--------------------------------------------
    //------ REMOVE HTML TAGS
    //--------------------------------------------
    /**
     * Removes every element (including its children) whose tag name is not whitelisted.
     *
     * @param array|false $tags Whitelist of tag names (matched case-insensitively).
     *                          false, or any other value loosely equal to false
     *                          such as an empty array, selects the built-in
     *                          default whitelist.
     * @return void
     */
    public function removeTags($tags = false){
        // Loose comparison kept on purpose for backward compatibility.
        $whitelist = ($tags == false) ? $this->goodTags : $tags;

        // Build a separate lookup table. Re-keying the whitelist in place would
        // delete entries whose key already equals their value (e.g. 'p' => 'p').
        $keep = array();
        foreach ((array) $whitelist as $tag) {
            // Tag names collected by getTagNames() are lower-cased.
            $keep[strtolower($tag)] = true;
        }

        foreach ($this->tagArray as $tagName) {
            if (isset($keep[$tagName])) {
                continue;
            }
            $matches = $this->dom->getElementsByTagName($tagName);
            // The node list is live: removing an element also drops any nested
            // elements of the same tag from it, so re-query item(0) until the
            // list is empty instead of counting up to a cached length.
            while (($node = $matches->item(0)) !== null) {
                if ($node->parentNode === null) {
                    break;
                }
                $node->parentNode->removeChild($node);
            }
        }
    }

    //--------------------------------------------
    //------ REMOVE TAG ATTRIBUTES
    //--------------------------------------------
    /**
     * Removes all attributes except the whitelisted ones from every element.
     *
     * @param array|false $attr Whitelist of attribute names (matched
     *                          case-insensitively); false removes all attributes.
     * @return bool Always false (kept unchanged for backward compatibility).
     */
    public function removeAttributes($attr=false){
        $keep = array();
        if ($attr != false) {
            foreach ((array) $attr as $name) {
                $keep[strtolower($name)] = true;
            }
        }

        foreach ($this->dom->getElementsByTagName('*') as $element) {
            $attributes = $element->attributes;
            // Walk backwards because removing an attribute shifts the indexes
            // of the attributes that follow it.
            for ($i = $attributes->length - 1; $i >= 0; --$i) {
                $attribute = $attributes->item($i);
                if (!isset($keep[strtolower($attribute->nodeName)])) {
                    $element->removeAttributeNode($attribute);
                }
            }
        }
        return false;
    }

    //--------------------------------------------
    //------ RETURN CLEAN HTML
    //--------------------------------------------
    /**
     * Serialises the current document to an HTML string.
     *
     * @param bool $mini When truthy, line breaks and leading/trailing line
     *                   whitespace are replaced by single spaces and the space
     *                   between adjacent tags ("> <") is removed.
     * @return string
     */
    public function returnHTML($mini = false){
        $html = $this->dom->saveHTML();
        // Swap the HTML 4.0 Transitional doctype for the short HTML5 one.
        $html = str_replace('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">', '<!DOCTYPE html>', $html);
        if ($mini) {
            $html = preg_replace('/^\s+|\n|\r|\s+$/m', ' ', $html);
            $html = str_replace('> <', '><', $html);
        }
        return $html;
    }

    //--------------------------------------------
    //------ HELPER
    //--------------------------------------------
    /**
     * Collects the distinct, lower-cased tag names present in the document.
     *
     * @return array
     */
    private function getTagNames(){
        $names = array();
        foreach ($this->dom->getElementsByTagName('*') as $node) {
            $names[] = strtolower($node->nodeName);
        }
        return array_values(array_unique($names));
    }

    /**
     * Makes UTF-8 markup safe to hand to DOMDocument::loadHTML().
     *
     * With mbstring available every non-ASCII character is rewritten as a
     * numeric character reference, leaving an ASCII-only string. This replaces
     * utf8_decode(), which turns characters outside ISO-8859-1 into "?" and is
     * deprecated as of PHP 8.2. Without mbstring the previous utf8_decode()
     * behaviour is used for as long as that function exists.
     *
     * @param string $html UTF-8 encoded markup.
     * @return string
     */
    private function prepareMarkup($html){
        if (function_exists('mb_encode_numericentity')) {
            return mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');
        }
        if (function_exists('utf8_decode')) {
            return utf8_decode($html);
        }
        return $html;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment