Created
October 13, 2011 20:17
-
-
Save nerdsrescueme/1285380 to your computer and use it in GitHub Desktop.
HTML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
namespace Atom\Parser;
/**
 * Regex-driven HTML-to-array parser.
 *
 * The parser works from the innermost elements outwards. Each pass finds the
 * elements whose content holds no further "<" after its first character,
 * records them in $level_array and replaces their markup in the working
 * buffer with a placeholder of the form
 * "<separator><level>-<index><separator>". Once the buffer consists of
 * placeholders only, get_array() expands them recursively into a nested array.
 *
 * Tags listed in $single_tags are removed from the buffer and therefore never
 * appear in the resulting tree.
 */
class Html {

    /** Delimiter placed on both sides of every placeholder written into the buffer. */
    private $separator = '-{}-';

    /** Regex alternation of tags without a closing tag; matches are dropped before parsing. */
    private $single_tags = 'meta|img|hr|br|link|!--|!DOCTYPE|input';

    //-- Don't edit below this --

    /** Element whose content has no "<" after its first character (1 = tag, 2 = attributes, 3 = content). */
    private $simple_tag_pattern = '/<(.[^\s]*)(.[^><]*)?>(.[^<]*)?<\/\1>/is';

    /** Same capture groups as above, but the content may be anything; used as a final catch-all. */
    private $remaining_tag_pattern = '/<(.[^\s]*)(.[^><]*)?>(.*)?<\/\1>/is';

    /** Working buffer; markup is progressively replaced by placeholders. */
    private $html;

    /** Index of the most recently recorded level; -1 until the first pass records something. */
    private $level = -1;

    /**
     * Recorded elements, indexed as $level_array[level][index]. Each entry has
     * the keys str, rep, tag, level, text, attr and htmlText; get_array() adds
     * 'parent' to every entry it visits.
     */
    public $level_array = array();

    /**
     * @param string $html Markup to parse; it is normalised with clean() first.
     */
    public function __construct($html)
    {
        $this->html = $this->clean($html);
    }

    /**
     * Splits a placeholder into its array(level, index) coordinates.
     *
     * @param string $placeholder e.g. "-{}-2-0-{}-"
     * @return array
     */
    private function locate($placeholder)
    {
        $parts = explode($this->separator, $placeholder);

        return explode('-', $parts[1]);
    }

    /**
     * Returns the recorded element a placeholder points at.
     *
     * @param string $data Placeholder string.
     * @return array
     */
    private function get_element($data)
    {
        $position = $this->locate($data);

        return $this->level_array[$position[0]][$position[1]];
    }

    /**
     * Expands every placeholder in $string back into markup.
     *
     * Elements recorded at level 0 by replace_simple_tags() already carry
     * their complete markup in 'htmlText'; every other element stores only its
     * inner markup and has to be wrapped in its tag again. The decision is
     * made per element (from its own 'level'), because one parent can hold
     * children that were recorded in different passes.
     *
     * @param string $string Text that may contain placeholders.
     * @return string
     */
    private function to_html($string)
    {
        foreach ($this->get_replacements($string) as $item) {
            $elem = $this->get_element($item);

            if ($elem['level'] === 0) {
                $markup = $elem['htmlText'];
            } else {
                $markup = '<'.$elem['tag'].$elem['attr'].'>'.$elem['htmlText'].'</'.$elem['tag'].'>';
            }

            $string = str_replace($item, $markup, $string);
        }

        return $string;
    }

    /**
     * Builds the regex that matches any tag listed in $single_tags.
     *
     * @return string
     */
    private function single_tag_pattern()
    {
        return '/<('.$this->single_tags.')(.[^><]*)?>/is';
    }

    /**
     * Deletes every tag listed in $single_tags from the working buffer.
     */
    private function replace_single_tags()
    {
        if (preg_match_all($this->single_tag_pattern(), $this->html, $matches) > 0) {
            foreach ($matches[0] as $value) {
                $this->html = str_replace($value, '', $this->html);
            }
        }
    }

    /**
     * Opens a new level, records every match in it and swaps the matched
     * markup in the working buffer for the corresponding placeholder.
     *
     * @param array $matches               Result of preg_match_all() in pattern order.
     * @param bool  $outer_at_level_zero   When the new level is 0: true stores the whole
     *                                     match as 'htmlText', false stores only the content.
     */
    private function record_matches(array $matches, $outer_at_level_zero)
    {
        $this->level++;
        $level = array();

        foreach ($matches[0] as $id => $value) {
            $placeholder = $this->separator.$this->level.'-'.$id.$this->separator;

            if ($this->level === 0) {
                $html_text = $outer_at_level_zero ? $value : $matches[3][$id];
            } else {
                // Children were recorded in earlier passes, so they can be expanded now.
                $html_text = $this->to_html($matches[3][$id]);
            }

            $level[$id] = array(
                'str'      => $value,
                'rep'      => $placeholder,
                'tag'      => $matches[1][$id],
                'level'    => $this->level,
                'text'     => $matches[3][$id],
                'attr'     => $matches[2][$id],
                'htmlText' => $html_text,
            );

            // str_replace() swaps every identical occurrence, so repeated
            // identical markup ends up sharing the first placeholder.
            $this->html = str_replace($value, $placeholder, $this->html);
        }

        $this->level_array[$this->level] = $level;
    }

    /**
     * Records one pass of "simple" elements (see $simple_tag_pattern).
     */
    private function replace_simple_tags()
    {
        if (preg_match_all($this->simple_tag_pattern, $this->html, $matches) > 0) {
            $this->record_matches($matches, true);
        }
    }

    /**
     * Records whatever paired tags are still left once no simple or single
     * tags remain. The level is stored in $level_array like every other level
     * (the original wrote it into the integer $level property instead).
     */
    private function replace_remaining_tags()
    {
        if (preg_match_all($this->remaining_tag_pattern, $this->html, $matches) > 0) {
            $this->record_matches($matches, false);
        }
    }

    /**
     * @return bool Whether the buffer still contains a simple element.
     */
    private function simple_tags_exist()
    {
        return preg_match($this->simple_tag_pattern, $this->html) > 0;
    }

    /**
     * @return bool Whether the buffer still contains a tag from $single_tags.
     */
    private function single_tags_exist()
    {
        return preg_match($this->single_tag_pattern(), $this->html) > 0;
    }

    /**
     * Normalises markup before parsing: removes line breaks and tabs, then
     * collapses runs of spaces into a single space.
     *
     * NOTE(review): the original also listed ' ' among the characters to
     * delete, which removed every space (gluing attributes to each other and
     * stripping spaces from text) and made the collapse step below a no-op.
     * That literal was presumably something else (e.g. a non-breaking space)
     * before the file was copied; plain spaces are kept here. Confirm against
     * the original source.
     *
     * @param string $html
     * @return string
     */
    private function clean($html)
    {
        // Strip characters unnecessary to parsing.
        $html = str_replace(array("\n", "\r", "\t"), '', $html);

        // Strip extra whitespace
        $html = preg_replace('| +|', ' ', $html);

        return $html;
    }

    /**
     * Parses the markup and returns it as a nested array.
     *
     * Every node has the keys tag, innerHTML, repl (its placeholder), stratr
     * (raw attribute string), level, parent (placeholder of the parent, ''
     * for top-level nodes) and children (array of nodes of the same shape).
     *
     * Previously recorded levels are not discarded when new markup is passed
     * in; level numbering continues from the earlier parse.
     *
     * @param string $html Optional markup replacing the one given to the constructor.
     * @return array
     */
    public function to_array($html = '')
    {
        if ($html != '') {
            $this->html = $this->clean($html);
        }

        while ($this->simple_tags_exist() || $this->single_tags_exist()) {
            $this->replace_single_tags();
            $this->replace_simple_tags();
        }

        $this->replace_remaining_tags();

        return $this->get_array($this->html);
    }

    /**
     * Lists the placeholders contained in $data, in order of appearance.
     *
     * @param string $data
     * @return array
     */
    private function get_replacements($data)
    {
        $final = array();
        $parts = explode($this->separator, $data);
        $count = count($parts);

        // After splitting on the separator, placeholder bodies sit at the odd indexes.
        for ($i = 1; $i < $count; $i += 2) {
            $final[] = $this->separator.$parts[$i].$this->separator;
        }

        return $final;
    }

    /**
     * @param string $data
     * @return bool True when the first non-blank character is neither "<" nor ">".
     */
    private function starts_with_text($data)
    {
        $first = substr(trim(str_replace(array("\n", "\r"), '', $data)), 0, 1);

        return $first != '<' and $first != '>';
    }

    /**
     * Tells whether $string contains at least one of the given substrings.
     *
     * @param array  $array  Substrings to look for.
     * @param string $string Text to search in.
     * @return bool
     */
    private function in_array(array $array, $string)
    {
        foreach ($array as $item) {
            // The original searched the undefined variable $str here.
            if (strpos($string, $item) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Recursively turns the placeholders found in $html into nodes and stores
     * the parent placeholder on each visited entry of $level_array.
     *
     * @param string $html   Text containing placeholders.
     * @param string $parent Placeholder of the enclosing node ('' at the top).
     * @return array
     */
    private function get_array($html, $parent = '')
    {
        $final = array();

        if (strpos($html, $this->separator) === false) {
            return $final;
        }

        foreach ($this->get_replacements($html) as $placeholder) {
            $position = $this->locate($placeholder);
            $element  = $this->level_array[$position[0]][$position[1]];

            $this->level_array[$position[0]][$position[1]]['parent'] = $parent;

            $final[] = array(
                'tag'       => $element['tag'],
                'innerHTML' => $element['htmlText'],
                'repl'      => $element['rep'],
                'stratr'    => $element['attr'],
                'level'     => $element['level'],
                'parent'    => $parent,
                'children'  => $this->get_array($element['text'], $placeholder),
            );
        }

        return $final;
    }

    /**
     * Returns a single node (without its children) for a placeholder.
     *
     * @param string $rep Placeholder, as found in a node's 'repl' or 'parent' key.
     * @return array
     */
    public function loadNode($rep)
    {
        $element = $this->get_element($rep);

        return array(
            'tag'       => $element['tag'],
            'innerHTML' => $element['htmlText'],
            'repl'      => $element['rep'],
            'stratr'    => $element['attr'],
            'level'     => $element['level'],
            // 'parent' only exists once get_array() has visited the element.
            'parent'    => isset($element['parent']) ? $element['parent'] : '',
        );
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
What does this code do?