Skip to content

Instantly share code, notes, and snippets.

@nerdsrescueme
Created October 13, 2011 20:17
Show Gist options
  • Save nerdsrescueme/1285380 to your computer and use it in GitHub Desktop.
Save nerdsrescueme/1285380 to your computer and use it in GitHub Desktop.
HTML
<?php
namespace Atom\Parser;
/**
 * Regex-based HTML-to-array converter.
 *
 * The markup is reduced from the inside out: every element that can be matched
 * is cut out of the working string and replaced by a placeholder of the form
 * "<separator><level>-<index><separator>", while its details are recorded in
 * $level_array[<level>][<index>]. Once nothing but placeholders is left, the
 * placeholders are expanded recursively into a nested array.
 *
 * Void tags (see $single_tags) are removed from the markup and do not appear
 * in the result.
 */
class Html
{
    private $separator = '-{}-';
    private $single_tags = 'meta|img|hr|br|link|!--|!DOCTYPE|input';

    //-- Don't edit below this --

    /** Pattern for an element whose content has no "<" after its first character. */
    private $simple_tag_pattern = '/<(.[^\s]*)(.[^><]*)?>(.[^<]*)?<\/\1>/is';

    /** Pattern for any remaining element, whatever its content is. */
    private $remaining_tag_pattern = '/<(.[^\s]*)(.[^><]*)?>(.*)?<\/\1>/is';

    /** Working copy of the markup; elements are progressively swapped for placeholders. */
    private $html;

    /** Index of the most recently recorded level; -1 until the first level is stored. */
    private $level = -1;

    /** Recorded elements, indexed as [level][index] (the two numbers of a placeholder). */
    public $level_array = array();

    /**
     * @param string $html Markup to parse; it is normalised with clean() first.
     */
    public function __construct($html)
    {
        $this->html = $this->clean($html);
    }

    /**
     * Resolve a placeholder to the element record it stands for.
     *
     * @param string $data Placeholder, e.g. "-{}-1-0-{}-".
     * @return array|null The record, or null when the placeholder is malformed
     *                    or does not refer to a recorded element.
     */
    private function get_element($data)
    {
        $parts = explode($this->separator, $data);
        if (!isset($parts[1])) {
            return null;
        }

        $coords = explode('-', $parts[1]);
        if (!isset($coords[1])) {
            return null;
        }

        list($level, $index) = $coords;

        return isset($this->level_array[$level][$index])
            ? $this->level_array[$level][$index]
            : null;
    }

    /**
     * Expand the placeholders found in $string back into markup.
     *
     * @param string $string Text that may contain placeholders.
     * @param int    $level  When 0 the element's stored htmlText is inserted as
     *                       is; otherwise it is wrapped in the element's tag.
     * @return string
     */
    private function to_html($string, $level)
    {
        foreach ($this->get_replacements($string) as $item) {
            $elem = $this->get_element($item);
            if ($elem === null) {
                // Not one of our placeholders (e.g. the separator occurred in the
                // source text itself); leave it untouched.
                continue;
            }

            $markup = $level === 0
                ? $elem['htmlText']
                : '<'.$elem['tag'].$elem['attr'].'>'.$elem['htmlText'].'</'.$elem['tag'].'>';

            $string = str_replace($item, $markup, $string);
        }

        return $string;
    }

    /**
     * Build the regex that matches any tag listed in $single_tags.
     *
     * @return string
     */
    private function single_tag_pattern()
    {
        return '/<('.$this->single_tags.')(.[^><]*)?>/is';
    }

    /**
     * Remove every void tag from the working markup.
     */
    private function replace_single_tags()
    {
        if (preg_match_all($this->single_tag_pattern(), $this->html, $matches) > 0) {
            $this->html = str_replace($matches[0], '', $this->html);
        }
    }

    /**
     * Record all currently matchable "simple" elements as a new level.
     */
    private function replace_simple_tags()
    {
        if (preg_match_all($this->simple_tag_pattern, $this->html, $matches) > 0) {
            // At level 0 the stored htmlText is the complete match, tags included.
            $this->store_level($matches, 0);
        }
    }

    /**
     * Record whatever elements are left once no simple element matches any more.
     */
    private function replace_remaining_tags()
    {
        if (preg_match_all($this->remaining_tag_pattern, $this->html, $matches) > 0) {
            // At level 0 the stored htmlText is only the element's content.
            $this->store_level($matches, 3);
        }
    }

    /**
     * Open a new level, store one record per match and swap each match for its
     * placeholder in the working markup.
     *
     * @param array $matches    Result of preg_match_all(): 0 = full match,
     *                          1 = tag name, 2 = attributes, 3 = content.
     * @param int   $root_group Capture group used as htmlText when the new
     *                          level is level 0.
     */
    private function store_level(array $matches, $root_group)
    {
        // Always start from a fresh list. The previous
        // "$this->level++ and $level = array()" skipped the initialisation
        // whenever the old counter value was 0.
        $this->level++;
        $records = array();

        foreach ($matches[0] as $id => $value) {
            $placeholder = $this->separator.$this->level.'-'.$id.$this->separator;

            if ($this->level === 0) {
                $html_text = $matches[$root_group][$id];
            } else {
                $html_text = $this->to_html($matches[3][$id], $this->level - 1);
            }

            $records[] = array(
                'str'      => $value,
                'rep'      => $placeholder,
                'tag'      => $matches[1][$id],
                'level'    => $this->level,
                'text'     => $matches[3][$id],
                'attr'     => $matches[2][$id],
                'htmlText' => $html_text,
            );

            $this->html = str_replace($value, $placeholder, $this->html);
        }

        // Must go into $level_array: writing to $this->level (the integer
        // counter) loses the records and is a fatal error on PHP 8.
        $this->level_array[$this->level] = $records;
    }

    /**
     * @return bool Whether the working markup still contains a simple element.
     */
    private function simple_tags_exist()
    {
        return preg_match($this->simple_tag_pattern, $this->html) > 0;
    }

    /**
     * @return bool Whether the working markup still contains a void tag.
     */
    private function single_tags_exist()
    {
        return preg_match($this->single_tag_pattern(), $this->html) > 0;
    }

    /**
     * Normalise markup before parsing: drop line breaks, tabs and "&nbsp;",
     * then collapse runs of spaces into a single space.
     *
     * @param string $html
     * @return string
     */
    private function clean($html)
    {
        $html = str_replace(array("\n", "\r", '&nbsp;', "\t"), '', $html);

        return preg_replace('| +|', ' ', $html);
    }

    /**
     * Parse the markup into a nested array.
     *
     * Each entry has the keys tag, innerHTML, repl (its placeholder), stratr
     * (raw attribute string), level, parent (placeholder of the parent, '' for
     * top-level entries) and children (entries of the same shape).
     *
     * @param string $html Optional markup that replaces the one given to the
     *                     constructor; ignored when it compares equal to ''.
     * @return array
     */
    public function to_array($html = '')
    {
        if ($html != '') {
            $this->html = $this->clean($html);
        }

        while ($this->simple_tags_exist() || $this->single_tags_exist()) {
            $this->replace_single_tags();
            $this->replace_simple_tags();
        }

        $this->replace_remaining_tags();

        return $this->get_array($this->html);
    }

    /**
     * List the placeholders contained in $data.
     *
     * After splitting on the separator, the odd-numbered pieces are the
     * "<level>-<index>" parts; each is re-wrapped in separators.
     *
     * @param string $data
     * @return array
     */
    private function get_replacements($data)
    {
        $pieces = explode($this->separator, $data);
        $count = count($pieces);
        $final = array();

        for ($i = 1; $i < $count; $i += 2) {
            $final[] = $this->separator.$pieces[$i].$this->separator;
        }

        return $final;
    }

    /**
     * @param string $data
     * @return bool True when the first non-blank character is neither "<" nor ">".
     */
    private function starts_with_text($data)
    {
        $first = substr(trim(str_replace(array("\n", "\r"), '', $data)), 0, 1);

        return $first != '<' && $first != '>';
    }

    /**
     * Check whether $string contains at least one of the given substrings.
     *
     * @param array  $array  Substrings to look for.
     * @param string $string Text to search in.
     * @return bool
     */
    private function in_array(array $array, $string)
    {
        foreach ($array as $item) {
            // Search the $string parameter; the previous code referenced an
            // undefined $str variable.
            if (strpos($string, $item) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Expand the placeholders in $html into result entries, recursing into each
     * element's content, and remember every element's parent in $level_array.
     *
     * @param string $html   Text containing placeholders.
     * @param string $parent Placeholder of the enclosing element, '' at the top.
     * @return array
     */
    private function get_array($html, $parent = '')
    {
        $final = array();

        if (strpos($html, $this->separator) === false) {
            return $final;
        }

        foreach ($this->get_replacements($html) as $placeholder) {
            $element = $this->get_element($placeholder);
            if ($element === null) {
                // Separator-looking text that is not a recorded element.
                continue;
            }

            // Stored so that loadNode() can report the parent later on.
            $this->level_array[$element['level']][$this->index_of($placeholder)]['parent'] = $parent;

            $final[] = array(
                'tag'       => $element['tag'],
                'innerHTML' => $element['htmlText'],
                'repl'      => $element['rep'],
                'stratr'    => $element['attr'],
                'level'     => $element['level'],
                'parent'    => $parent,
                'children'  => $this->get_array($element['text'], $placeholder),
            );
        }

        return $final;
    }

    /**
     * Extract the "<index>" part of a placeholder that get_element() resolved.
     *
     * @param string $placeholder
     * @return string
     */
    private function index_of($placeholder)
    {
        $parts = explode($this->separator, $placeholder);
        $coords = explode('-', $parts[1]);

        return $coords[1];
    }

    /**
     * Look up a single element by its placeholder.
     *
     * @param string $rep Placeholder as found in the "repl" key of a result entry.
     * @return array Keys tag, innerHTML, repl, stratr, level and parent; parent
     *               is null when to_array() has not linked the element yet.
     * @throws \InvalidArgumentException When $rep does not refer to a recorded element.
     */
    public function loadNode($rep)
    {
        $element = $this->get_element($rep);
        if ($element === null) {
            throw new \InvalidArgumentException('Unknown element placeholder: '.$rep);
        }

        return array(
            'tag'       => $element['tag'],
            'innerHTML' => $element['htmlText'],
            'repl'      => $element['rep'],
            'stratr'    => $element['attr'],
            'level'     => $element['level'],
            'parent'    => isset($element['parent']) ? $element['parent'] : null,
        );
    }
}
@akhlaqs
Copy link

akhlaqs commented Apr 14, 2016

What does this code do?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment