Created
October 17, 2012 15:11
-
-
Save BeMoreDifferent/3906071 to your computer and use it in GitHub Desktop.
small html cleaner class
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 *
 * @author: Daniel Albrecht [http://bemoredifferent.com]
 * @abstract: Class to clean HTML tags and attributes, based on DOMDocument
 *
 */
/**
 * Small HTML cleaner built on DOMDocument.
 *
 * The markup is parsed once in the constructor. removeTags() and
 * removeAttributes() then mutate the parsed document in place, and
 * returnHTML() serialises the current state of the document.
 *
 * NOTE: whitelisting attributes such as "href" or "src" does not inspect
 * their values (e.g. "javascript:" URLs are left untouched).
 */
class filter
{
    /** @var string Markup handed to the parser (non-ASCII characters entity-encoded). */
    private $html;

    /** @var DOMDocument Parsed document that all methods operate on. */
    private $dom;

    /** @var mixed Declared by the original class; never assigned here. */
    private $xpath;

    /** @var array Default whitelist of tag names used by removeTags(). */
    private $goodTags;

    /** @var array Lower-cased, de-duplicated tag names found at construction time. */
    private $tagArray;

    /** @var mixed Public for backward compatibility; never assigned by this class. */
    public $tagsInHTML;

    /**
     * Parses the given markup.
     *
     * @param string $html UTF-8 encoded HTML (a fragment or a full document).
     */
    public function __construct($html)
    {
        $this->goodTags = array(
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'p', 'b', 'a', 'strong',
            'blockquote', 'br', 'img', 'div', 'section', 'article', 'header',
            'footer', 'html', 'body',
        );

        $this->html = $this->encodeNonAscii((string) $html);

        $this->dom = new DOMDocument();
        $this->dom->validateOnParse = true;
        $this->dom->preserveWhiteSpace = false;

        // loadHTML() warns (PHP 7) or throws ValueError (PHP 8) on an empty
        // string, so an empty input simply leaves the document empty.
        if ($this->html !== '') {
            // Collect parser complaints about sloppy markup internally, then
            // hand the error-handling mode back exactly as the caller had it.
            $previousMode = libxml_use_internal_errors(true);
            $this->dom->loadHTML($this->html);
            libxml_clear_errors();
            libxml_use_internal_errors($previousMode);
        }

        $this->tagArray = $this->getTagNames();
    }

    //--------------------------------------------
    //------ REMOVE HTML TAGS
    //--------------------------------------------

    /**
     * Removes every element (together with its children) whose tag name is
     * not whitelisted.
     *
     * @param array|string|false $tags Tag names to keep. Any falsy value
     *                                 (false, null, empty array) selects the
     *                                 built-in default whitelist.
     * @return void
     */
    public function removeTags($tags = false)
    {
        $whitelist = $tags ? $tags : $this->goodTags;

        // Build a name => true lookup. Unlike an in-place key/value flip this
        // cannot drop entries when the caller's array is keyed by tag name.
        $keep = array();
        foreach ((array) $whitelist as $name) {
            $keep[strtolower((string) $name)] = true;
        }

        foreach ($this->tagArray as $tagName) {
            if (isset($keep[$tagName])) {
                continue;
            }

            // The node list is live: removing a node that contains further
            // matches shrinks the list by more than one, so re-check the
            // length on every pass instead of counting up to a stale value.
            $matches = $this->dom->getElementsByTagName($tagName);
            while ($matches->length > 0) {
                $node = $matches->item(0);
                if ($node === null || $node->parentNode === null) {
                    break;
                }
                $node->parentNode->removeChild($node);
            }
        }
    }

    //--------------------------------------------
    //------ REMOVE TAG ATTRIBUTES
    //--------------------------------------------

    /**
     * Removes all attributes from all elements except the whitelisted ones.
     *
     * @param array|string|false $attr Attribute names to keep; a falsy value
     *                                 removes every attribute.
     * @return bool Always false (kept for backward compatibility).
     */
    public function removeAttributes($attr = false)
    {
        $keep = array();
        if ($attr) {
            foreach ((array) $attr as $name) {
                $keep[strtolower((string) $name)] = true;
            }
        }

        foreach ($this->dom->getElementsByTagName('*') as $element) {
            // Snapshot the attribute map first; it changes while we remove.
            foreach (iterator_to_array($element->attributes, false) as $attribute) {
                if (!isset($keep[strtolower($attribute->nodeName)])) {
                    $element->removeAttributeNode($attribute);
                }
            }
        }

        return false;
    }

    //--------------------------------------------
    //------ RETURN CLEAN HTML
    //--------------------------------------------

    /**
     * Serialises the current document.
     *
     * @param bool $mini When truthy, line breaks and leading/trailing
     *                   whitespace of each line are collapsed to single
     *                   spaces and "> <" sequences are joined.
     * @return string
     */
    public function returnHTML($mini = false)
    {
        $html = $this->dom->saveHTML();

        // Swap the HTML 4.0 Transitional doctype for the short HTML5 form.
        $html = str_replace('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">', '<!DOCTYPE html>', $html);

        if ($mini) {
            $html = preg_replace('/^\s+|\n|\r|\s+$/m', ' ', $html);
            $html = str_replace('> <', '><', $html);
        }

        return $html;
    }

    //--------------------------------------------
    //------ HELPER
    //--------------------------------------------

    /**
     * Collects the distinct, lower-cased tag names present in the document.
     *
     * @return array Keys are not re-indexed after de-duplication.
     */
    private function getTagNames()
    {
        $names = array();
        foreach ($this->dom->getElementsByTagName('*') as $node) {
            $names[] = strtolower($node->nodeName);
        }

        return array_unique($names);
    }

    /**
     * Makes UTF-8 input safe for DOMDocument::loadHTML(), which assumes
     * ISO-8859-1 when the markup declares no charset.
     *
     * Every non-ASCII character is rewritten as a numeric character
     * reference, so characters outside Latin-1 survive instead of being
     * replaced by "?" as utf8_decode() would do.
     *
     * @param string $html
     * @return string
     */
    private function encodeNonAscii($html)
    {
        if (function_exists('mb_encode_numericentity')) {
            return mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');
        }

        // mbstring unavailable: fall back to the previous, lossy conversion.
        return function_exists('utf8_decode') ? utf8_decode($html) : $html;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment