Last active
May 14, 2020 17:14
-
-
Save peterchaula/a7ae057dd649cc8567ed1cae6e3178ff to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Website: http://sourceforge.net/projects/simplehtmldom/ | |
* Additional projects: http://sourceforge.net/projects/debugobject/ | |
* Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/) | |
* | |
* Licensed under The MIT License | |
* See the LICENSE file in the project root for more information. | |
* | |
* Authors: | |
* S.C. Chen | |
* John Schlick | |
* Rus Carroll | |
* logmanoriginal | |
* | |
* Contributors: | |
* Yousuke Kumakura | |
* Vadim Voituk | |
* Antcs | |
* | |
* Version Rev. 1.9.1 (291) | |
*/ | |
define('HDOM_TYPE_ELEMENT', 1); | |
define('HDOM_TYPE_COMMENT', 2); | |
define('HDOM_TYPE_TEXT', 3); | |
define('HDOM_TYPE_ENDTAG', 4); | |
define('HDOM_TYPE_ROOT', 5); | |
define('HDOM_TYPE_UNKNOWN', 6); | |
define('HDOM_QUOTE_DOUBLE', 0); | |
define('HDOM_QUOTE_SINGLE', 1); | |
define('HDOM_QUOTE_NO', 3); | |
define('HDOM_INFO_BEGIN', 0); | |
define('HDOM_INFO_END', 1); | |
define('HDOM_INFO_QUOTE', 2); | |
define('HDOM_INFO_SPACE', 3); | |
define('HDOM_INFO_TEXT', 4); | |
define('HDOM_INFO_INNER', 5); | |
define('HDOM_INFO_OUTER', 6); | |
define('HDOM_INFO_ENDSPACE', 7); | |
defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8'); | |
defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n"); | |
defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' '); | |
defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000); | |
define('HDOM_SMARTY_AS_TEXT', 1); | |
function file_get_html( | |
$url, | |
$use_include_path = false, | |
$context = null, | |
$offset = 0, | |
$maxLen = -1, | |
$lowercase = true, | |
$forceTagsClosed = true, | |
$target_charset = DEFAULT_TARGET_CHARSET, | |
$stripRN = true, | |
$defaultBRText = DEFAULT_BR_TEXT, | |
$defaultSpanText = DEFAULT_SPAN_TEXT) | |
{ | |
if($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; } | |
$dom = new simple_html_dom( | |
null, | |
$lowercase, | |
$forceTagsClosed, | |
$target_charset, | |
$stripRN, | |
$defaultBRText, | |
$defaultSpanText | |
); | |
/** | |
* For sourceforge users: uncomment the next line and comment the | |
* retrieve_url_contents line 2 lines down if it is not already done. | |
*/ | |
$contents = file_get_contents( | |
$url, | |
$use_include_path, | |
$context, | |
$offset, | |
$maxLen | |
); | |
// $contents = retrieve_url_contents($url); | |
if (empty($contents) || strlen($contents) > $maxLen) { | |
$dom->clear(); | |
return false; | |
} | |
return $dom->load($contents, $lowercase, $stripRN); | |
} | |
function str_get_html( | |
$str, | |
$lowercase = true, | |
$forceTagsClosed = true, | |
$target_charset = DEFAULT_TARGET_CHARSET, | |
$stripRN = true, | |
$defaultBRText = DEFAULT_BR_TEXT, | |
$defaultSpanText = DEFAULT_SPAN_TEXT) | |
{ | |
$dom = new simple_html_dom( | |
null, | |
$lowercase, | |
$forceTagsClosed, | |
$target_charset, | |
$stripRN, | |
$defaultBRText, | |
$defaultSpanText | |
); | |
if (empty($str) || strlen($str) > MAX_FILE_SIZE) { | |
$dom->clear(); | |
return false; | |
} | |
return $dom->load($str, $lowercase, $stripRN); | |
} | |
function dump_html_tree($node, $show_attr = true, $deep = 0) | |
{ | |
$node->dump($node); | |
} | |
class simple_html_dom_node | |
{ | |
public $nodetype = HDOM_TYPE_TEXT; | |
public $tag = 'text'; | |
public $attr = array(); | |
public $children = array(); | |
public $nodes = array(); | |
public $parent = null; | |
public $_ = array(); | |
public $tag_start = 0; | |
private $dom = null; | |
function __construct($dom) | |
{ | |
$this->dom = $dom; | |
$dom->nodes[] = $this; | |
} | |
function __destruct() | |
{ | |
$this->clear(); | |
} | |
function __toString() | |
{ | |
return $this->outertext(); | |
} | |
function clear() | |
{ | |
$this->dom = null; | |
$this->nodes = null; | |
$this->parent = null; | |
$this->children = null; | |
} | |
function dump($show_attr = true, $depth = 0) | |
{ | |
echo str_repeat("\t", $depth) . $this->tag; | |
if ($show_attr && count($this->attr) > 0) { | |
echo '('; | |
foreach ($this->attr as $k => $v) { | |
echo "[$k]=>\"$v\", "; | |
} | |
echo ')'; | |
} | |
echo "\n"; | |
if ($this->nodes) { | |
foreach ($this->nodes as $node) { | |
$node->dump($show_attr, $depth + 1); | |
} | |
} | |
} | |
function dump_node($echo = true) | |
{ | |
$string = $this->tag; | |
if (count($this->attr) > 0) { | |
$string .= '('; | |
foreach ($this->attr as $k => $v) { | |
$string .= "[$k]=>\"$v\", "; | |
} | |
$string .= ')'; | |
} | |
if (count($this->_) > 0) { | |
$string .= ' $_ ('; | |
foreach ($this->_ as $k => $v) { | |
if (is_array($v)) { | |
$string .= "[$k]=>("; | |
foreach ($v as $k2 => $v2) { | |
$string .= "[$k2]=>\"$v2\", "; | |
} | |
$string .= ')'; | |
} else { | |
$string .= "[$k]=>\"$v\", "; | |
} | |
} | |
$string .= ')'; | |
} | |
if (isset($this->text)) { | |
$string .= " text: ({$this->text})"; | |
} | |
$string .= ' HDOM_INNER_INFO: '; | |
if (isset($node->_[HDOM_INFO_INNER])) { | |
$string .= "'" . $node->_[HDOM_INFO_INNER] . "'"; | |
} else { | |
$string .= ' NULL '; | |
} | |
$string .= ' children: ' . count($this->children); | |
$string .= ' nodes: ' . count($this->nodes); | |
$string .= ' tag_start: ' . $this->tag_start; | |
$string .= "\n"; | |
if ($echo) { | |
echo $string; | |
return; | |
} else { | |
return $string; | |
} | |
} | |
function parent($parent = null) | |
{ | |
// I am SURE that this doesn't work properly. | |
// It fails to unset the current node from it's current parents nodes or | |
// children list first. | |
if ($parent !== null) { | |
$this->parent = $parent; | |
$this->parent->nodes[] = $this; | |
$this->parent->children[] = $this; | |
} | |
return $this->parent; | |
} | |
function has_child() | |
{ | |
return !empty($this->children); | |
} | |
function children($idx = -1) | |
{ | |
if ($idx === -1) { | |
return $this->children; | |
} | |
if (isset($this->children[$idx])) { | |
return $this->children[$idx]; | |
} | |
return null; | |
} | |
function first_child() | |
{ | |
if (count($this->children) > 0) { | |
return $this->children[0]; | |
} | |
return null; | |
} | |
function last_child() | |
{ | |
if (count($this->children) > 0) { | |
return end($this->children); | |
} | |
return null; | |
} | |
function next_sibling() | |
{ | |
if ($this->parent === null) { | |
return null; | |
} | |
$idx = array_search($this, $this->parent->children, true); | |
if ($idx !== false && isset($this->parent->children[$idx + 1])) { | |
return $this->parent->children[$idx + 1]; | |
} | |
return null; | |
} | |
function prev_sibling() | |
{ | |
if ($this->parent === null) { | |
return null; | |
} | |
$idx = array_search($this, $this->parent->children, true); | |
if ($idx !== false && $idx > 0) { | |
return $this->parent->children[$idx - 1]; | |
} | |
return null; | |
} | |
function find_ancestor_tag($tag) | |
{ | |
global $debug_object; | |
if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } | |
if ($this->parent === null) { | |
return null; | |
} | |
$ancestor = $this->parent; | |
while (!is_null($ancestor)) { | |
if (is_object($debug_object)) { | |
$debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag); | |
} | |
if ($ancestor->tag === $tag) { | |
break; | |
} | |
$ancestor = $ancestor->parent; | |
} | |
return $ancestor; | |
} | |
function innertext() | |
{ | |
if (isset($this->_[HDOM_INFO_INNER])) { | |
return $this->_[HDOM_INFO_INNER]; | |
} | |
if (isset($this->_[HDOM_INFO_TEXT])) { | |
return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); | |
} | |
$ret = ''; | |
foreach ($this->nodes as $n) { | |
$ret .= $n->outertext(); | |
} | |
return $ret; | |
} | |
function outertext() | |
{ | |
global $debug_object; | |
if (is_object($debug_object)) { | |
$text = ''; | |
if ($this->tag === 'text') { | |
if (!empty($this->text)) { | |
$text = ' with text: ' . $this->text; | |
} | |
} | |
$debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text); | |
} | |
if ($this->tag === 'root') { | |
return $this->innertext(); | |
} | |
// todo: What is the use of this callback? Remove? | |
if ($this->dom && $this->dom->callback !== null) { | |
call_user_func_array($this->dom->callback, array($this)); | |
} | |
if (isset($this->_[HDOM_INFO_OUTER])) { | |
return $this->_[HDOM_INFO_OUTER]; | |
} | |
if (isset($this->_[HDOM_INFO_TEXT])) { | |
return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); | |
} | |
$ret = ''; | |
if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) { | |
$ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup(); | |
} | |
if (isset($this->_[HDOM_INFO_INNER])) { | |
// todo: <br> should either never have HDOM_INFO_INNER or always | |
if ($this->tag !== 'br') { | |
$ret .= $this->_[HDOM_INFO_INNER]; | |
} | |
} elseif ($this->nodes) { | |
foreach ($this->nodes as $n) { | |
$ret .= $this->convert_text($n->outertext()); | |
} | |
} | |
if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) { | |
$ret .= '</' . $this->tag . '>'; | |
} | |
return $ret; | |
} | |
function text() | |
{ | |
if (isset($this->_[HDOM_INFO_INNER])) { | |
return $this->_[HDOM_INFO_INNER]; | |
} | |
switch ($this->nodetype) { | |
case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); | |
case HDOM_TYPE_COMMENT: return ''; | |
case HDOM_TYPE_UNKNOWN: return ''; | |
} | |
if (strcasecmp($this->tag, 'script') === 0) { return ''; } | |
if (strcasecmp($this->tag, 'style') === 0) { return ''; } | |
$ret = ''; | |
// In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed | |
// for some span tags, and some p tags) $this->nodes is set to NULL. | |
// NOTE: This indicates that there is a problem where it's set to NULL | |
// without a clear happening. | |
// WHY is this happening? | |
if (!is_null($this->nodes)) { | |
foreach ($this->nodes as $n) { | |
// Start paragraph after a blank line | |
if ($n->tag === 'p') { | |
$ret = trim($ret) . "\n\n"; | |
} | |
$ret .= $this->convert_text($n->text()); | |
// If this node is a span... add a space at the end of it so | |
// multiple spans don't run into each other. This is plaintext | |
// after all. | |
if ($n->tag === 'span') { | |
$ret .= $this->dom->default_span_text; | |
} | |
} | |
} | |
return $ret; | |
} | |
function xmltext() | |
{ | |
$ret = $this->innertext(); | |
$ret = str_ireplace('<![CDATA[', '', $ret); | |
$ret = str_replace(']]>', '', $ret); | |
return $ret; | |
} | |
function makeup() | |
{ | |
// text, comment, unknown | |
if (isset($this->_[HDOM_INFO_TEXT])) { | |
return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); | |
} | |
$ret = '<' . $this->tag; | |
$i = -1; | |
foreach ($this->attr as $key => $val) { | |
++$i; | |
// skip removed attribute | |
if ($val === null || $val === false) { continue; } | |
$ret .= $this->_[HDOM_INFO_SPACE][$i][0]; | |
//no value attr: nowrap, checked selected... | |
if ($val === true) { | |
$ret .= $key; | |
} else { | |
switch ($this->_[HDOM_INFO_QUOTE][$i]) | |
{ | |
case HDOM_QUOTE_DOUBLE: $quote = '"'; break; | |
case HDOM_QUOTE_SINGLE: $quote = '\''; break; | |
default: $quote = ''; | |
} | |
$ret .= $key | |
. $this->_[HDOM_INFO_SPACE][$i][1] | |
. '=' | |
. $this->_[HDOM_INFO_SPACE][$i][2] | |
. $quote | |
. $val | |
. $quote; | |
} | |
} | |
$ret = $this->dom->restore_noise($ret); | |
return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>'; | |
} | |
function find($selector, $idx = null, $lowercase = false) | |
{ | |
$selectors = $this->parse_selector($selector); | |
if (($count = count($selectors)) === 0) { return array(); } | |
$found_keys = array(); | |
// find each selector | |
for ($c = 0; $c < $count; ++$c) { | |
// The change on the below line was documented on the sourceforge | |
// code tracker id 2788009 | |
// used to be: if (($levle=count($selectors[0]))===0) return array(); | |
if (($levle = count($selectors[$c])) === 0) { return array(); } | |
if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); } | |
$head = array($this->_[HDOM_INFO_BEGIN] => 1); | |
$cmd = ' '; // Combinator | |
// handle descendant selectors, no recursive! | |
for ($l = 0; $l < $levle; ++$l) { | |
$ret = array(); | |
foreach ($head as $k => $v) { | |
$n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k]; | |
//PaperG - Pass this optional parameter on to the seek function. | |
$n->seek($selectors[$c][$l], $ret, $cmd, $lowercase); | |
} | |
$head = $ret; | |
$cmd = $selectors[$c][$l][4]; // Next Combinator | |
} | |
foreach ($head as $k => $v) { | |
if (!isset($found_keys[$k])) { | |
$found_keys[$k] = 1; | |
} | |
} | |
} | |
// sort keys | |
ksort($found_keys); | |
$found = array(); | |
foreach ($found_keys as $k => $v) { | |
$found[] = $this->dom->nodes[$k]; | |
} | |
// return nth-element or array | |
if (is_null($idx)) { return $found; } | |
elseif ($idx < 0) { $idx = count($found) + $idx; } | |
return (isset($found[$idx])) ? $found[$idx] : null; | |
} | |
protected function seek($selector, &$ret, $parent_cmd, $lowercase = false) | |
{ | |
global $debug_object; | |
if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } | |
list($tag, $id, $class, $attributes, $cmb) = $selector; | |
$nodes = array(); | |
if ($parent_cmd === ' ') { // Descendant Combinator | |
// Find parent closing tag if the current element doesn't have a closing | |
// tag (i.e. void element) | |
$end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0; | |
if ($end == 0) { | |
$parent = $this->parent; | |
while (!isset($parent->_[HDOM_INFO_END]) && $parent !== null) { | |
$end -= 1; | |
$parent = $parent->parent; | |
} | |
$end += $parent->_[HDOM_INFO_END]; | |
} | |
// Get list of target nodes | |
$nodes_start = $this->_[HDOM_INFO_BEGIN] + 1; | |
$nodes_count = $end - $nodes_start; | |
$nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true); | |
} elseif ($parent_cmd === '>') { // Child Combinator | |
$nodes = $this->children; | |
} elseif ($parent_cmd === '+' | |
&& $this->parent | |
&& in_array($this, $this->parent->children)) { // Next-Sibling Combinator | |
$index = array_search($this, $this->parent->children, true) + 1; | |
if ($index < count($this->parent->children)) | |
$nodes[] = $this->parent->children[$index]; | |
} elseif ($parent_cmd === '~' | |
&& $this->parent | |
&& in_array($this, $this->parent->children)) { // Subsequent Sibling Combinator | |
$index = array_search($this, $this->parent->children, true); | |
$nodes = array_slice($this->parent->children, $index); | |
} | |
// Go throgh each element starting at this element until the end tag | |
// Note: If this element is a void tag, any previous void element is | |
// skipped. | |
foreach($nodes as $node) { | |
$pass = true; | |
// Skip root nodes | |
if(!$node->parent) { | |
$pass = false; | |
} | |
// Handle 'text' selector | |
if($pass && $tag === 'text' && $node->tag === 'text') { | |
$ret[array_search($node, $this->dom->nodes, true)] = 1; | |
unset($node); | |
continue; | |
} | |
// Skip if node isn't a child node (i.e. text nodes) | |
if($pass && !in_array($node, $node->parent->children, true)) { | |
$pass = false; | |
} | |
// Skip if tag doesn't match | |
if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') { | |
$pass = false; | |
} | |
// Skip if ID doesn't exist | |
if ($pass && $id !== '' && !isset($node->attr['id'])) { | |
$pass = false; | |
} | |
// Check if ID matches | |
if ($pass && $id !== '' && isset($node->attr['id'])) { | |
// Note: Only consider the first ID (as browsers do) | |
$node_id = explode(' ', trim($node->attr['id']))[0]; | |
if($id !== $node_id) { $pass = false; } | |
} | |
// Check if all class(es) exist | |
if ($pass && $class !== '' && is_array($class) && !empty($class)) { | |
if (isset($node->attr['class'])) { | |
$node_classes = explode(' ', $node->attr['class']); | |
if ($lowercase) { | |
$node_classes = array_map('strtolower', $node_classes); | |
} | |
foreach($class as $c) { | |
if(!in_array($c, $node_classes)) { | |
$pass = false; | |
break; | |
} | |
} | |
} else { | |
$pass = false; | |
} | |
} | |
// Check attributes | |
if ($pass | |
&& $attributes !== '' | |
&& is_array($attributes) | |
&& !empty($attributes)) { | |
foreach($attributes as $a) { | |
list ( | |
$att_name, | |
$att_expr, | |
$att_val, | |
$att_inv, | |
$att_case_sensitivity | |
) = $a; | |
// Handle indexing attributes (i.e. "[2]") | |
/** | |
* Note: This is not supported by the CSS Standard but adds | |
* the ability to select items compatible to XPath (i.e. | |
* the 3rd element within it's parent). | |
* | |
* Note: This doesn't conflict with the CSS Standard which | |
* doesn't work on numeric attributes anyway. | |
*/ | |
if (is_numeric($att_name) | |
&& $att_expr === '' | |
&& $att_val === '') { | |
$count = 0; | |
// Find index of current element in parent | |
foreach ($node->parent->children as $c) { | |
if ($c->tag === $node->tag) ++$count; | |
if ($c === $node) break; | |
} | |
// If this is the correct node, continue with next | |
// attribute | |
if ($count === (int)$att_name) continue; | |
} | |
// Check attribute availability | |
if ($att_inv) { // Attribute should NOT be set | |
if (isset($node->attr[$att_name])) { | |
$pass = false; | |
break; | |
} | |
} else { // Attribute should be set | |
// todo: "plaintext" is not a valid CSS selector! | |
if ($att_name !== 'plaintext' | |
&& !isset($node->attr[$att_name])) { | |
$pass = false; | |
break; | |
} | |
} | |
// Continue with next attribute if expression isn't defined | |
if ($att_expr === '') continue; | |
// If they have told us that this is a "plaintext" | |
// search then we want the plaintext of the node - right? | |
// todo "plaintext" is not a valid CSS selector! | |
if ($att_name === 'plaintext') { | |
$nodeKeyValue = $node->text(); | |
} else { | |
$nodeKeyValue = $node->attr[$att_name]; | |
} | |
if (is_object($debug_object)) { | |
$debug_object->debug_log(2, | |
'testing node: ' | |
. $node->tag | |
. ' for attribute: ' | |
. $att_name | |
. $att_expr | |
. $att_val | |
. ' where nodes value is: ' | |
. $nodeKeyValue | |
); | |
} | |
// If lowercase is set, do a case insensitive test of | |
// the value of the selector. | |
if ($lowercase) { | |
$check = $this->match( | |
$att_expr, | |
strtolower($att_val), | |
strtolower($nodeKeyValue), | |
$att_case_sensitivity | |
); | |
} else { | |
$check = $this->match( | |
$att_expr, | |
$att_val, | |
$nodeKeyValue, | |
$att_case_sensitivity | |
); | |
} | |
if (is_object($debug_object)) { | |
$debug_object->debug_log(2, | |
'after match: ' | |
. ($check ? 'true' : 'false') | |
); | |
} | |
if (!$check) { | |
$pass = false; | |
break; | |
} | |
} | |
} | |
// Found a match. Add to list and clear node | |
if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1; | |
unset($node); | |
} | |
// It's passed by reference so this is actually what this function returns. | |
if (is_object($debug_object)) { | |
$debug_object->debug_log(1, 'EXIT - ret: ', $ret); | |
} | |
} | |
protected function match($exp, $pattern, $value, $case_sensitivity) | |
{ | |
global $debug_object; | |
if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} | |
if ($case_sensitivity === 'i') { | |
$pattern = strtolower($pattern); | |
$value = strtolower($value); | |
} | |
switch ($exp) { | |
case '=': | |
return ($value === $pattern); | |
case '!=': | |
return ($value !== $pattern); | |
case '^=': | |
return preg_match('/^' . preg_quote($pattern, '/') . '/', $value); | |
case '$=': | |
return preg_match('/' . preg_quote($pattern, '/') . '$/', $value); | |
case '*=': | |
return preg_match('/' . preg_quote($pattern, '/') . '/', $value); | |
case '|=': | |
/** | |
* [att|=val] | |
* | |
* Represents an element with the att attribute, its value | |
* either being exactly "val" or beginning with "val" | |
* immediately followed by "-" (U+002D). | |
*/ | |
return strpos($value, $pattern) === 0; | |
case '~=': | |
/** | |
* [att~=val] | |
* | |
* Represents an element with the att attribute whose value is a | |
* whitespace-separated list of words, one of which is exactly | |
* "val". If "val" contains whitespace, it will never represent | |
* anything (since the words are separated by spaces). Also if | |
* "val" is the empty string, it will never represent anything. | |
*/ | |
return in_array($pattern, explode(' ', trim($value)), true); | |
} | |
return false; | |
} | |
protected function parse_selector($selector_string) | |
{ | |
global $debug_object; | |
if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } | |
/** | |
* Pattern of CSS selectors, modified from mootools (https://mootools.net/) | |
* | |
* Paperg: Add the colon to the attribute, so that it properly finds | |
* <tag attr:ibute="something" > like google does. | |
* | |
* Note: if you try to look at this attribute, you MUST use getAttribute | |
* since $dom->x:y will fail the php syntax check. | |
* | |
* Notice the \[ starting the attribute? and the @? following? This | |
* implies that an attribute can begin with an @ sign that is not | |
* captured. This implies that an html attribute specifier may start | |
* with an @ sign that is NOT captured by the expression. Farther study | |
* is required to determine of this should be documented or removed. | |
* | |
* Matches selectors in this order: | |
* | |
* [0] - full match | |
* | |
* [1] - tag name | |
* ([\w:\*-]*) | |
* Matches the tag name consisting of zero or more words, colons, | |
* asterisks and hyphens. | |
* | |
* [2] - id name | |
* (?:\#([\w-]+)) | |
* Optionally matches a id name, consisting of an "#" followed by | |
* the id name (one or more words and hyphens). | |
* | |
* [3] - class names (including dots) | |
* (?:\.([\w\.-]+))? | |
* Optionally matches a list of classs, consisting of an "." | |
* followed by the class name (one or more words and hyphens) | |
* where multiple classes can be chained (i.e. ".foo.bar.baz") | |
* | |
* [4] - attributes | |
* ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)? | |
* Optionally matches the attributes list | |
* | |
* [5] - separator | |
* ([\/, >+~]+) | |
* Matches the selector list separator | |
*/ | |
// phpcs:ignore Generic.Files.LineLength | |
$pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is"; | |
preg_match_all( | |
$pattern, | |
trim($selector_string) . ' ', // Add final ' ' as pseudo separator | |
$matches, | |
PREG_SET_ORDER | |
); | |
if (is_object($debug_object)) { | |
$debug_object->debug_log(2, 'Matches Array: ', $matches); | |
} | |
$selectors = array(); | |
$result = array(); | |
foreach ($matches as $m) { | |
$m[0] = trim($m[0]); | |
// Skip NoOps | |
if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') { continue; } | |
// Convert to lowercase | |
if ($this->dom->lowercase) { | |
$m[1] = strtolower($m[1]); | |
} | |
// Extract classes | |
if ($m[3] !== '') { $m[3] = explode('.', $m[3]); } | |
/* Extract attributes (pattern based on the pattern above!) | |
* [0] - full match | |
* [1] - attribute name | |
* [2] - attribute expression | |
* [3] - attribute value | |
* [4] - case sensitivity | |
* | |
* Note: Attributes can be negated with a "!" prefix to their name | |
*/ | |
if($m[4] !== '') { | |
preg_match_all( | |
"/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is", | |
trim($m[4]), | |
$attributes, | |
PREG_SET_ORDER | |
); | |
// Replace element by array | |
$m[4] = array(); | |
foreach($attributes as $att) { | |
// Skip empty matches | |
if(trim($att[0]) === '') { continue; } | |
$inverted = (isset($att[1][0]) && $att[1][0] === '!'); | |
$m[4][] = array( | |
$inverted ? substr($att[1], 1) : $att[1], // Name | |
(isset($att[2])) ? $att[2] : '', // Expression | |
(isset($att[3])) ? $att[3] : '', // Value | |
$inverted, // Inverted Flag | |
(isset($att[4])) ? strtolower($att[4]) : '', // Case-Sensitivity | |
); | |
} | |
} | |
// Sanitize Separator | |
if ($m[5] !== '' && trim($m[5]) === '') { // Descendant Separator | |
$m[5] = ' '; | |
} else { // Other Separator | |
$m[5] = trim($m[5]); | |
} | |
// Clear Separator if it's a Selector List | |
if ($is_list = ($m[5] === ',')) { $m[5] = ''; } | |
// Remove full match before adding to results | |
array_shift($m); | |
$result[] = $m; | |
if ($is_list) { // Selector List | |
$selectors[] = $result; | |
$result = array(); | |
} | |
} | |
if (count($result) > 0) { $selectors[] = $result; } | |
return $selectors; | |
} | |
function __get($name) | |
{ | |
if (isset($this->attr[$name])) { | |
return $this->convert_text($this->attr[$name]); | |
} | |
switch ($name) { | |
case 'outertext': return $this->outertext(); | |
case 'innertext': return $this->innertext(); | |
case 'plaintext': return $this->text(); | |
case 'xmltext': return $this->xmltext(); | |
default: return array_key_exists($name, $this->attr); | |
} | |
} | |
function __set($name, $value) | |
{ | |
global $debug_object; | |
if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } | |
switch ($name) { | |
case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value; | |
case 'innertext': | |
if (isset($this->_[HDOM_INFO_TEXT])) { | |
return $this->_[HDOM_INFO_TEXT] = $value; | |
} | |
return $this->_[HDOM_INFO_INNER] = $value; | |
} | |
if (!isset($this->attr[$name])) { | |
$this->_[HDOM_INFO_SPACE][] = array(' ', '', ''); | |
$this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; | |
} | |
$this->attr[$name] = $value; | |
} | |
function __isset($name) | |
{ | |
switch ($name) { | |
case 'outertext': return true; | |
case 'innertext': return true; | |
case 'plaintext': return true; | |
} | |
//no value attr: nowrap, checked selected... | |
return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]); | |
} | |
function __unset($name) | |
{ | |
if (isset($this->attr[$name])) { unset($this->attr[$name]); } | |
} | |
function convert_text($text) | |
{ | |
global $debug_object; | |
if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } | |
$converted_text = $text; | |
$sourceCharset = ''; | |
$targetCharset = ''; | |
if ($this->dom) { | |
$sourceCharset = strtoupper($this->dom->_charset); | |
$targetCharset = strtoupper($this->dom->_target_charset); | |
} | |
if (is_object($debug_object)) { | |
$debug_object->debug_log(3, | |
'source charset: ' | |
. $sourceCharset | |
. ' target charaset: ' | |
. $targetCharset | |
); | |
} | |
if (!empty($sourceCharset) | |
&& !empty($targetCharset) | |
&& (strcasecmp($sourceCharset, $targetCharset) != 0)) { | |
// Check if the reported encoding could have been incorrect and the text is actually already UTF-8 | |
if ((strcasecmp($targetCharset, 'UTF-8') == 0) | |
&& ($this->is_utf8($text))) { | |
$converted_text = $text; | |
} else { | |
$converted_text = iconv($sourceCharset, $targetCharset, $text); | |
} | |
} | |
// Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output. | |
if ($targetCharset === 'UTF-8') { | |
if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") { | |
$converted_text = substr($converted_text, 3); | |
} | |
if (substr($converted_text, -3) === "\xef\xbb\xbf") { | |
$converted_text = substr($converted_text, 0, -3); | |
} | |
} | |
return $converted_text; | |
} | |
static function is_utf8($str) | |
{ | |
$c = 0; $b = 0; | |
$bits = 0; | |
$len = strlen($str); | |
for($i = 0; $i < $len; $i++) { | |
$c = ord($str[$i]); | |
if($c > 128) { | |
if(($c >= 254)) { return false; } | |
elseif($c >= 252) { $bits = 6; } | |
elseif($c >= 248) { $bits = 5; } | |
elseif($c >= 240) { $bits = 4; } | |
elseif($c >= 224) { $bits = 3; } | |
elseif($c >= 192) { $bits = 2; } | |
else { return false; } | |
if(($i + $bits) > $len) { return false; } | |
while($bits > 1) { | |
$i++; | |
$b = ord($str[$i]); | |
if($b < 128 || $b > 191) { return false; } | |
$bits--; | |
} | |
} | |
} | |
return true; | |
} | |
function get_display_size() | |
{ | |
global $debug_object; | |
$width = -1; | |
$height = -1; | |
if ($this->tag !== 'img') { | |
return false; | |
} | |
// See if there is aheight or width attribute in the tag itself. | |
if (isset($this->attr['width'])) { | |
$width = $this->attr['width']; | |
} | |
if (isset($this->attr['height'])) { | |
$height = $this->attr['height']; | |
} | |
// Now look for an inline style. | |
if (isset($this->attr['style'])) { | |
// Thanks to user gnarf from stackoverflow for this regular expression. | |
$attributes = array(); | |
preg_match_all( | |
'/([\w-]+)\s*:\s*([^;]+)\s*;?/', | |
$this->attr['style'], | |
$matches, | |
PREG_SET_ORDER | |
); | |
foreach ($matches as $match) { | |
$attributes[$match[1]] = $match[2]; | |
} | |
// If there is a width in the style attributes: | |
if (isset($attributes['width']) && $width == -1) { | |
// check that the last two characters are px (pixels) | |
if (strtolower(substr($attributes['width'], -2)) === 'px') { | |
$proposed_width = substr($attributes['width'], 0, -2); | |
// Now make sure that it's an integer and not something stupid. | |
if (filter_var($proposed_width, FILTER_VALIDATE_INT)) { | |
$width = $proposed_width; | |
} | |
} | |
} | |
// If there is a width in the style attributes: | |
if (isset($attributes['height']) && $height == -1) { | |
// check that the last two characters are px (pixels) | |
if (strtolower(substr($attributes['height'], -2)) == 'px') { | |
$proposed_height = substr($attributes['height'], 0, -2); | |
// Now make sure that it's an integer and not something stupid. | |
if (filter_var($proposed_height, FILTER_VALIDATE_INT)) { | |
$height = $proposed_height; | |
} | |
} | |
} | |
} | |
// Future enhancement: | |
// Look in the tag to see if there is a class or id specified that has | |
// a height or width attribute to it. | |
// Far future enhancement | |
// Look at all the parent tags of this image to see if they specify a | |
// class or id that has an img selector that specifies a height or width | |
// Note that in this case, the class or id will have the img subselector | |
// for it to apply to the image. | |
// ridiculously far future development | |
// If the class or id is specified in a SEPARATE css file thats not on | |
// the page, go get it and do what we were just doing for the ones on | |
// the page. | |
$result = array( | |
'height' => $height, | |
'width' => $width | |
); | |
return $result; | |
} | |
function save($filepath = '') | |
{ | |
$ret = $this->outertext(); | |
if ($filepath !== '') { | |
file_put_contents($filepath, $ret, LOCK_EX); | |
} | |
return $ret; | |
} | |
function addClass($class) | |
{ | |
if (is_string($class)) { | |
$class = explode(' ', $class); | |
} | |
if (is_array($class)) { | |
foreach($class as $c) { | |
if (isset($this->class)) { | |
if ($this->hasClass($c)) { | |
continue; | |
} else { | |
$this->class .= ' ' . $c; | |
} | |
} else { | |
$this->class = $c; | |
} | |
} | |
} else { | |
if (is_object($debug_object)) { | |
$debug_object->debug_log(2, 'Invalid type: ', gettype($class)); | |
} | |
} | |
} | |
function hasClass($class) | |
{ | |
if (is_string($class)) { | |
if (isset($this->class)) { | |
return in_array($class, explode(' ', $this->class), true); | |
} | |
} else { | |
if (is_object($debug_object)) { | |
$debug_object->debug_log(2, 'Invalid type: ', gettype($class)); | |
} | |
} | |
return false; | |
} | |
function removeClass($class = null) | |
{ | |
if (!isset($this->class)) { | |
return; | |
} | |
if (is_null($class)) { | |
$this->removeAttribute('class'); | |
return; | |
} | |
if (is_string($class)) { | |
$class = explode(' ', $class); | |
} | |
if (is_array($class)) { | |
$class = array_diff(explode(' ', $this->class), $class); | |
if (empty($class)) { | |
$this->removeAttribute('class'); | |
} else { | |
$this->class = implode(' ', $class); | |
} | |
} | |
} | |
function getAllAttributes() | |
{ | |
return $this->attr; | |
} | |
function getAttribute($name) | |
{ | |
return $this->__get($name); | |
} | |
function setAttribute($name, $value) | |
{ | |
$this->__set($name, $value); | |
} | |
function hasAttribute($name) | |
{ | |
return $this->__isset($name); | |
} | |
function removeAttribute($name) | |
{ | |
$this->__set($name, null); | |
} | |
function remove() | |
{ | |
if ($this->parent) { | |
$this->parent->removeChild($this); | |
} | |
} | |
function removeChild($node) | |
{ | |
$nidx = array_search($node, $this->nodes, true); | |
$cidx = array_search($node, $this->children, true); | |
$didx = array_search($node, $this->dom->nodes, true); | |
if ($nidx !== false && $cidx !== false && $didx !== false) { | |
foreach($node->children as $child) { | |
$node->removeChild($child); | |
} | |
foreach($node->nodes as $entity) { | |
$enidx = array_search($entity, $node->nodes, true); | |
$edidx = array_search($entity, $node->dom->nodes, true); | |
if ($enidx !== false && $edidx !== false) { | |
unset($node->nodes[$enidx]); | |
unset($node->dom->nodes[$edidx]); | |
} | |
} | |
unset($this->nodes[$nidx]); | |
unset($this->children[$cidx]); | |
unset($this->dom->nodes[$didx]); | |
$node->clear(); | |
} | |
} | |
function getElementById($id) | |
{ | |
return $this->find("#$id", 0); | |
} | |
function getElementsById($id, $idx = null) | |
{ | |
return $this->find("#$id", $idx); | |
} | |
function getElementByTagName($name) | |
{ | |
return $this->find($name, 0); | |
} | |
function getElementsByTagName($name, $idx = null) | |
{ | |
return $this->find($name, $idx); | |
} | |
function parentNode() | |
{ | |
return $this->parent(); | |
} | |
function childNodes($idx = -1) | |
{ | |
return $this->children($idx); | |
} | |
function firstChild() | |
{ | |
return $this->first_child(); | |
} | |
function lastChild() | |
{ | |
return $this->last_child(); | |
} | |
function nextSibling() | |
{ | |
return $this->next_sibling(); | |
} | |
function previousSibling() | |
{ | |
return $this->prev_sibling(); | |
} | |
function hasChildNodes() | |
{ | |
return $this->has_child(); | |
} | |
function nodeName() | |
{ | |
return $this->tag; | |
} | |
function appendChild($node) | |
{ | |
$node->parent($this); | |
return $node; | |
} | |
} | |
class simple_html_dom | |
{ | |
public $root = null; | |
public $nodes = array(); | |
public $callback = null; | |
public $lowercase = false; | |
public $original_size; | |
public $size; | |
protected $pos; | |
protected $doc; | |
protected $char; | |
protected $cursor; | |
protected $parent; | |
protected $noise = array(); | |
protected $token_blank = " \t\r\n"; | |
protected $token_equal = ' =/>'; | |
protected $token_slash = " />\r\n\t"; | |
protected $token_attr = ' >'; | |
public $_charset = ''; | |
public $_target_charset = ''; | |
protected $default_br_text = ''; | |
public $default_span_text = ''; | |
protected $self_closing_tags = array( | |
'area' => 1, | |
'base' => 1, | |
'br' => 1, | |
'col' => 1, | |
'embed' => 1, | |
'hr' => 1, | |
'img' => 1, | |
'input' => 1, | |
'link' => 1, | |
'meta' => 1, | |
'param' => 1, | |
'source' => 1, | |
'track' => 1, | |
'wbr' => 1 | |
); | |
protected $block_tags = array( | |
'body' => 1, | |
'div' => 1, | |
'form' => 1, | |
'root' => 1, | |
'span' => 1, | |
'table' => 1 | |
); | |
protected $optional_closing_tags = array( | |
// Not optional, see | |
// https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element | |
'b' => array('b' => 1), | |
'dd' => array('dd' => 1, 'dt' => 1), | |
// Not optional, see | |
// https://www.w3.org/TR/html/grouping-content.html#the-dl-element | |
'dl' => array('dd' => 1, 'dt' => 1), | |
'dt' => array('dd' => 1, 'dt' => 1), | |
'li' => array('li' => 1), | |
'optgroup' => array('optgroup' => 1, 'option' => 1), | |
'option' => array('optgroup' => 1, 'option' => 1), | |
'p' => array('p' => 1), | |
'rp' => array('rp' => 1, 'rt' => 1), | |
'rt' => array('rp' => 1, 'rt' => 1), | |
'td' => array('td' => 1, 'th' => 1), | |
'th' => array('td' => 1, 'th' => 1), | |
'tr' => array('td' => 1, 'th' => 1, 'tr' => 1), | |
); | |
function __construct( | |
$str = null, | |
$lowercase = true, | |
$forceTagsClosed = true, | |
$target_charset = DEFAULT_TARGET_CHARSET, | |
$stripRN = true, | |
$defaultBRText = DEFAULT_BR_TEXT, | |
$defaultSpanText = DEFAULT_SPAN_TEXT, | |
$options = 0) | |
{ | |
if ($str) { | |
if (preg_match('/^http:\/\//i', $str) || is_file($str)) { | |
$this->load_file($str); | |
} else { | |
$this->load( | |
$str, | |
$lowercase, | |
$stripRN, | |
$defaultBRText, | |
$defaultSpanText, | |
$options | |
); | |
} | |
} | |
// Forcing tags to be closed implies that we don't trust the html, but | |
// it can lead to parsing errors if we SHOULD trust the html. | |
if (!$forceTagsClosed) { | |
$this->optional_closing_array = array(); | |
} | |
$this->_target_charset = $target_charset; | |
} | |
function __destruct() | |
{ | |
$this->clear(); | |
} | |
function load( | |
$str, | |
$lowercase = true, | |
$stripRN = true, | |
$defaultBRText = DEFAULT_BR_TEXT, | |
$defaultSpanText = DEFAULT_SPAN_TEXT, | |
$options = 0) | |
{ | |
global $debug_object; | |
// prepare | |
$this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText); | |
// Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037 | |
// Script tags removal now preceeds style tag removal. | |
// strip out <script> tags | |
$this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is"); | |
$this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is"); | |
// strip out the \r \n's if we are told to. | |
if ($stripRN) { | |
$this->doc = str_replace("\r", ' ', $this->doc); | |
$this->doc = str_replace("\n", ' ', $this->doc); | |
// set the length of content since we have changed it. | |
$this->size = strlen($this->doc); | |
} | |
// strip out cdata | |
$this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true); | |
// strip out comments | |
$this->remove_noise("'<!--(.*?)-->'is"); | |
// strip out <style> tags | |
$this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is"); | |
$this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is"); | |
// strip out preformatted tags | |
$this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is"); | |
// strip out server side scripts | |
$this->remove_noise("'(<\?)(.*?)(\?>)'s", true); | |
if($options & HDOM_SMARTY_AS_TEXT) { // Strip Smarty scripts | |
$this->remove_noise("'(\{\w)(.*?)(\})'s", true); | |
} | |
// parsing | |
$this->parse(); | |
// end | |
$this->root->_[HDOM_INFO_END] = $this->cursor; | |
$this->parse_charset(); | |
// make load function chainable | |
return $this; | |
} | |
function load_file() | |
{ | |
$args = func_get_args(); | |
if(($doc = call_user_func_array('file_get_contents', $args)) !== false) { | |
$this->load($doc, true); | |
} else { | |
return false; | |
} | |
} | |
function set_callback($function_name) | |
{ | |
$this->callback = $function_name; | |
} | |
function remove_callback() | |
{ | |
$this->callback = null; | |
} | |
function save($filepath = '') | |
{ | |
$ret = $this->root->innertext(); | |
if ($filepath !== '') { file_put_contents($filepath, $ret, LOCK_EX); } | |
return $ret; | |
} | |
function find($selector, $idx = null, $lowercase = false) | |
{ | |
return $this->root->find($selector, $idx, $lowercase); | |
} | |
function clear() | |
{ | |
if (isset($this->nodes)) { | |
foreach ($this->nodes as $n) { | |
$n->clear(); | |
$n = null; | |
} | |
} | |
// This add next line is documented in the sourceforge repository. | |
// 2977248 as a fix for ongoing memory leaks that occur even with the | |
// use of clear. | |
if (isset($this->children)) { | |
foreach ($this->children as $n) { | |
$n->clear(); | |
$n = null; | |
} | |
} | |
if (isset($this->parent)) { | |
$this->parent->clear(); | |
unset($this->parent); | |
} | |
if (isset($this->root)) { | |
$this->root->clear(); | |
unset($this->root); | |
} | |
unset($this->doc); | |
unset($this->noise); | |
} | |
function dump($show_attr = true) | |
{ | |
$this->root->dump($show_attr); | |
} | |
protected function prepare( | |
$str, $lowercase = true, | |
$defaultBRText = DEFAULT_BR_TEXT, | |
$defaultSpanText = DEFAULT_SPAN_TEXT) | |
{ | |
$this->clear(); | |
$this->doc = trim($str); | |
$this->size = strlen($this->doc); | |
$this->original_size = $this->size; // original size of the html | |
$this->pos = 0; | |
$this->cursor = 1; | |
$this->noise = array(); | |
$this->nodes = array(); | |
$this->lowercase = $lowercase; | |
$this->default_br_text = $defaultBRText; | |
$this->default_span_text = $defaultSpanText; | |
$this->root = new simple_html_dom_node($this); | |
$this->root->tag = 'root'; | |
$this->root->_[HDOM_INFO_BEGIN] = -1; | |
$this->root->nodetype = HDOM_TYPE_ROOT; | |
$this->parent = $this->root; | |
if ($this->size > 0) { $this->char = $this->doc[0]; } | |
} | |
protected function parse() | |
{ | |
while (true) { | |
// Read next tag if there is no text between current position and the | |
// next opening tag. | |
if (($s = $this->copy_until_char('<')) === '') { | |
if($this->read_tag()) { | |
continue; | |
} else { | |
return true; | |
} | |
} | |
// Add a text node for text between tags | |
$node = new simple_html_dom_node($this); | |
++$this->cursor; | |
$node->_[HDOM_INFO_TEXT] = $s; | |
$this->link_nodes($node, false); | |
} | |
} | |
protected function parse_charset() | |
{ | |
global $debug_object; | |
$charset = null; | |
if (function_exists('get_last_retrieve_url_contents_content_type')) { | |
$contentTypeHeader = get_last_retrieve_url_contents_content_type(); | |
$success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches); | |
if ($success) { | |
$charset = $matches[1]; | |
if (is_object($debug_object)) { | |
$debug_object->debug_log(2, | |
'header content-type found charset of: ' | |
. $charset | |
); | |
} | |
} | |
} | |
if (empty($charset)) { | |
// https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type | |
$el = $this->root->find('meta[http-equiv=Content-Type]', 0, true); | |
if (!empty($el)) { | |
$fullvalue = $el->content; | |
if (is_object($debug_object)) { | |
$debug_object->debug_log(2, | |
'meta content-type tag found' | |
. $fullvalue | |
); | |
} | |
if (!empty($fullvalue)) { | |
$success = preg_match( | |
'/charset=(.+)/i', | |
$fullvalue, | |
$matches | |
); | |
if ($success) { | |
$charset = $matches[1]; | |
} else { | |
// If there is a meta tag, and they don't specify the | |
// character set, research says that it's typically | |
// ISO-8859-1 | |
if (is_object($debug_object)) { | |
$debug_object->debug_log(2, | |
'meta content-type tag couldn\'t be parsed. using iso-8859 default.' | |
); | |
} | |
$charset = 'ISO-8859-1'; | |
} | |
} | |
} | |
} | |
if (empty($charset)) { | |
// https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration | |
if ($meta = $this->root->find('meta[charset]', 0)) { | |
$charset = $meta->charset; | |
if (is_object($debug_object)) { | |
$debug_object->debug_log(2, 'meta charset: ' . $charset); | |
} | |
} | |
} | |
if (empty($charset)) { | |
// Try to guess the charset based on the content | |
// Requires Multibyte String (mbstring) support (optional) | |
if (function_exists('mb_detect_encoding')) { | |
/** | |
* mb_detect_encoding() is not intended to distinguish between | |
* charsets, especially single-byte charsets. Its primary | |
* purpose is to detect which multibyte encoding is in use, | |
* i.e. UTF-8, UTF-16, shift-JIS, etc. | |
* | |
* -- https://bugs.php.net/bug.php?id=38138 | |
* | |
* Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will | |
* always result in CP1251/ISO-8859-5 and vice versa. | |
* | |
* Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1 | |
* to stay compatible. | |
*/ | |
$encoding = mb_detect_encoding( | |
$this->doc, | |
array( 'UTF-8', 'CP1252', 'ISO-8859-1' ) | |
); | |
if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') { | |
// Due to a limitation of mb_detect_encoding | |
// 'CP1251'/'ISO-8859-5' will be detected as | |
// 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in | |
// which case we can simply assume it is the other charset. | |
if (!@iconv('CP1252', 'UTF-8', $this->doc)) { | |
$encoding = 'CP1251'; | |
} | |
} | |
if ($encoding !== false) { | |
$charset = $encoding; | |
if (is_object($debug_object)) { | |
$debug_object->debug_log(2, 'mb_detect: ' . $charset); | |
} | |
} | |
} | |
} | |
if (empty($charset)) { | |
// Assume it's UTF-8 as it is the most likely charset to be used | |
$charset = 'UTF-8'; | |
if (is_object($debug_object)) { | |
$debug_object->debug_log(2, 'No match found, assume ' . $charset); | |
} | |
} | |
// Since CP1252 is a superset, if we get one of it's subsets, we want | |
// it instead. | |
if ((strtolower($charset) == 'iso-8859-1') | |
|| (strtolower($charset) == 'latin1') | |
|| (strtolower($charset) == 'latin-1')) { | |
$charset = 'CP1252'; | |
if (is_object($debug_object)) { | |
$debug_object->debug_log(2, | |
'replacing ' . $charset . ' with CP1252 as its a superset' | |
); | |
} | |
} | |
if (is_object($debug_object)) { | |
$debug_object->debug_log(1, 'EXIT - ' . $charset); | |
} | |
return $this->_charset = $charset; | |
} | |
protected function read_tag() | |
{ | |
// Set end position if no further tags found | |
if ($this->char !== '<') { | |
$this->root->_[HDOM_INFO_END] = $this->cursor; | |
return false; | |
} | |
$begin_tag_pos = $this->pos; | |
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next | |
// end tag | |
if ($this->char === '/') { | |
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next | |
// Skip whitespace in end tags (i.e. in "</ html>") | |
$this->skip($this->token_blank); | |
$tag = $this->copy_until_char('>'); | |
// Skip attributes in end tags | |
if (($pos = strpos($tag, ' ')) !== false) { | |
$tag = substr($tag, 0, $pos); | |
} | |
$parent_lower = strtolower($this->parent->tag); | |
$tag_lower = strtolower($tag); | |
// The end tag is supposed to close the parent tag. Handle situations | |
// when it doesn't | |
if ($parent_lower !== $tag_lower) { | |
// Parent tag does not have to be closed necessarily (optional closing tag) | |
// Current tag is a block tag, so it may close an ancestor | |
if (isset($this->optional_closing_tags[$parent_lower]) | |
&& isset($this->block_tags[$tag_lower])) { | |
$this->parent->_[HDOM_INFO_END] = 0; | |
$org_parent = $this->parent; | |
// Traverse ancestors to find a matching opening tag | |
// Stop at root node | |
while (($this->parent->parent) | |
&& strtolower($this->parent->tag) !== $tag_lower | |
){ | |
$this->parent = $this->parent->parent; | |
} | |
// If we don't have a match add current tag as text node | |
if (strtolower($this->parent->tag) !== $tag_lower) { | |
$this->parent = $org_parent; // restore origonal parent | |
if ($this->parent->parent) { | |
$this->parent = $this->parent->parent; | |
} | |
$this->parent->_[HDOM_INFO_END] = $this->cursor; | |
return $this->as_text_node($tag); | |
} | |
} elseif (($this->parent->parent) | |
&& isset($this->block_tags[$tag_lower]) | |
) { | |
// Grandparent exists and current tag is a block tag, so our | |
// parent doesn't have an end tag | |
$this->parent->_[HDOM_INFO_END] = 0; // No end tag | |
$org_parent = $this->parent; | |
// Traverse ancestors to find a matching opening tag | |
// Stop at root node | |
while (($this->parent->parent) | |
&& strtolower($this->parent->tag) !== $tag_lower | |
) { | |
$this->parent = $this->parent->parent; | |
} | |
// If we don't have a match add current tag as text node | |
if (strtolower($this->parent->tag) !== $tag_lower) { | |
$this->parent = $org_parent; // restore origonal parent | |
$this->parent->_[HDOM_INFO_END] = $this->cursor; | |
return $this->as_text_node($tag); | |
} | |
} elseif (($this->parent->parent) | |
&& strtolower($this->parent->parent->tag) === $tag_lower | |
) { // Grandparent exists and current tag closes it | |
$this->parent->_[HDOM_INFO_END] = 0; | |
$this->parent = $this->parent->parent; | |
} else { // Random tag, add as text node | |
return $this->as_text_node($tag); | |
} | |
} | |
// Set end position of parent tag to current cursor position | |
$this->parent->_[HDOM_INFO_END] = $this->cursor; | |
if ($this->parent->parent) { | |
$this->parent = $this->parent->parent; | |
} | |
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next | |
return true; | |
} | |
// start tag | |
$node = new simple_html_dom_node($this); | |
$node->_[HDOM_INFO_BEGIN] = $this->cursor; | |
++$this->cursor; | |
$tag = $this->copy_until($this->token_slash); // Get tag name | |
$node->tag_start = $begin_tag_pos; | |
// doctype, cdata & comments... | |
// <!DOCTYPE html> | |
// <![CDATA[ ... ]]> | |
// <!-- Comment --> | |
if (isset($tag[0]) && $tag[0] === '!') { | |
$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>'); | |
if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--") | |
$node->nodetype = HDOM_TYPE_COMMENT; | |
$node->tag = 'comment'; | |
} else { // Could be doctype or CDATA but we don't care | |
$node->nodetype = HDOM_TYPE_UNKNOWN; | |
$node->tag = 'unknown'; | |
} | |
if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; } | |
$this->link_nodes($node, true); | |
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next | |
return true; | |
} | |
// The start tag cannot contain another start tag, if so add as text | |
// i.e. "<<html>" | |
if ($pos = strpos($tag, '<') !== false) { | |
$tag = '<' . substr($tag, 0, -1); | |
$node->_[HDOM_INFO_TEXT] = $tag; | |
$this->link_nodes($node, false); | |
$this->char = $this->doc[--$this->pos]; // prev | |
return true; | |
} | |
// Handle invalid tag names (i.e. "<html#doc>") | |
if (!preg_match('/^\w[\w:-]*$/', $tag)) { | |
$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>'); | |
// Next char is the beginning of a new tag, don't touch it. | |
if ($this->char === '<') { | |
$this->link_nodes($node, false); | |
return true; | |
} | |
// Next char closes current tag, add and be done with it. | |
if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; } | |
$this->link_nodes($node, false); | |
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next | |
return true; | |
} | |
// begin tag, add new node | |
$node->nodetype = HDOM_TYPE_ELEMENT; | |
$tag_lower = strtolower($tag); | |
$node->tag = ($this->lowercase) ? $tag_lower : $tag; | |
// handle optional closing tags | |
if (isset($this->optional_closing_tags[$tag_lower])) { | |
// Traverse ancestors to close all optional closing tags | |
while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) { | |
$this->parent->_[HDOM_INFO_END] = 0; | |
$this->parent = $this->parent->parent; | |
} | |
$node->parent = $this->parent; | |
} | |
$guard = 0; // prevent infinity loop | |
// [0] Space between tag and first attribute | |
$space = array($this->copy_skip($this->token_blank), '', ''); | |
// attributes | |
do { | |
// Everything until the first equal sign should be the attribute name | |
$name = $this->copy_until($this->token_equal); | |
if ($name === '' && $this->char !== null && $space[0] === '') { | |
break; | |
} | |
if ($guard === $this->pos) { // Escape infinite loop | |
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next | |
continue; | |
} | |
$guard = $this->pos; | |
// handle endless '<' | |
// Out of bounds before the tag ended | |
if ($this->pos >= $this->size - 1 && $this->char !== '>') { | |
$node->nodetype = HDOM_TYPE_TEXT; | |
$node->_[HDOM_INFO_END] = 0; | |
$node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name; | |
$node->tag = 'text'; | |
$this->link_nodes($node, false); | |
return true; | |
} | |
// handle mismatch '<' | |
// Attributes cannot start after opening tag | |
if ($this->doc[$this->pos - 1] == '<') { | |
$node->nodetype = HDOM_TYPE_TEXT; | |
$node->tag = 'text'; | |
$node->attr = array(); | |
$node->_[HDOM_INFO_END] = 0; | |
$node->_[HDOM_INFO_TEXT] = substr( | |
$this->doc, | |
$begin_tag_pos, | |
$this->pos - $begin_tag_pos - 1 | |
); | |
$this->pos -= 2; | |
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next | |
$this->link_nodes($node, false); | |
return true; | |
} | |
if ($name !== '/' && $name !== '') { // this is a attribute name | |
// [1] Whitespace after attribute name | |
$space[1] = $this->copy_skip($this->token_blank); | |
$name = $this->restore_noise($name); // might be a noisy name | |
if ($this->lowercase) { $name = strtolower($name); } | |
if ($this->char === '=') { // attribute with value | |
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next | |
$this->parse_attr($node, $name, $space); // get attribute value | |
} else { | |
//no value attr: nowrap, checked selected... | |
$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; | |
$node->attr[$name] = true; | |
if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev | |
} | |
$node->_[HDOM_INFO_SPACE][] = $space; | |
// prepare for next attribute | |
$space = array( | |
$this->copy_skip($this->token_blank), | |
'', | |
'' | |
); | |
} else { // no more attributes | |
break; | |
} | |
} while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended | |
$this->link_nodes($node, true); | |
$node->_[HDOM_INFO_ENDSPACE] = $space[0]; | |
// handle empty tags (i.e. "<div/>") | |
if ($this->copy_until_char('>') === '/') { | |
$node->_[HDOM_INFO_ENDSPACE] .= '/'; | |
$node->_[HDOM_INFO_END] = 0; | |
} else { | |
// reset parent | |
if (!isset($this->self_closing_tags[strtolower($node->tag)])) { | |
$this->parent = $node; | |
} | |
} | |
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next | |
// If it's a BR tag, we need to set it's text to the default text. | |
// This way when we see it in plaintext, we can generate formatting that the user wants. | |
// since a br tag never has sub nodes, this works well. | |
if ($node->tag === 'br') { | |
$node->_[HDOM_INFO_INNER] = $this->default_br_text; | |
} | |
return true; | |
} | |
protected function parse_attr($node, $name, &$space) | |
{ | |
$is_duplicate = isset($node->attr[$name]); | |
if (!$is_duplicate) // Copy whitespace between "=" and value | |
$space[2] = $this->copy_skip($this->token_blank); | |
switch ($this->char) { | |
case '"': | |
$quote_type = HDOM_QUOTE_DOUBLE; | |
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next | |
$value = $this->copy_until_char('"'); | |
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next | |
break; | |
case '\'': | |
$quote_type = HDOM_QUOTE_SINGLE; | |
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next | |
$value = $this->copy_until_char('\''); | |
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next | |
break; | |
default: | |
$quote_type = HDOM_QUOTE_NO; | |
$value = $this->copy_until($this->token_attr); | |
} | |
$value = $this->restore_noise($value); | |
// PaperG: Attributes should not have \r or \n in them, that counts as | |
// html whitespace. | |
$value = str_replace("\r", '', $value); | |
$value = str_replace("\n", '', $value); | |
// PaperG: If this is a "class" selector, lets get rid of the preceeding | |
// and trailing space since some people leave it in the multi class case. | |
if ($name === 'class') { | |
$value = trim($value); | |
} | |
if (!$is_duplicate) { | |
$node->_[HDOM_INFO_QUOTE][] = $quote_type; | |
$node->attr[$name] = $value; | |
} | |
} | |
protected function link_nodes(&$node, $is_child) | |
{ | |
$node->parent = $this->parent; | |
$this->parent->nodes[] = $node; | |
if ($is_child) { | |
$this->parent->children[] = $node; | |
} | |
} | |
protected function as_text_node($tag) | |
{ | |
$node = new simple_html_dom_node($this); | |
++$this->cursor; | |
$node->_[HDOM_INFO_TEXT] = '</' . $tag . '>'; | |
$this->link_nodes($node, false); | |
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next | |
return true; | |
} | |
protected function skip($chars) | |
{ | |
$this->pos += strspn($this->doc, $chars, $this->pos); | |
$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next | |
} | |
protected function copy_skip($chars) | |
{ | |
$pos = $this->pos; | |
$len = strspn($this->doc, $chars, $pos); | |
$this->pos += $len; | |
$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next | |
if ($len === 0) { return ''; } | |
return substr($this->doc, $pos, $len); | |
} | |
protected function copy_until($chars) | |
{ | |
$pos = $this->pos; | |
$len = strcspn($this->doc, $chars, $pos); | |
$this->pos += $len; | |
$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next | |
return substr($this->doc, $pos, $len); | |
} | |
protected function copy_until_char($char) | |
{ | |
if ($this->char === null) { return ''; } | |
if (($pos = strpos($this->doc, $char, $this->pos)) === false) { | |
$ret = substr($this->doc, $this->pos, $this->size - $this->pos); | |
$this->char = null; | |
$this->pos = $this->size; | |
return $ret; | |
} | |
if ($pos === $this->pos) { return ''; } | |
$pos_old = $this->pos; | |
$this->char = $this->doc[$pos]; | |
$this->pos = $pos; | |
return substr($this->doc, $pos_old, $pos - $pos_old); | |
} | |
protected function remove_noise($pattern, $remove_tag = false) | |
{ | |
global $debug_object; | |
if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } | |
$count = preg_match_all( | |
$pattern, | |
$this->doc, | |
$matches, | |
PREG_SET_ORDER | PREG_OFFSET_CAPTURE | |
); | |
for ($i = $count - 1; $i > -1; --$i) { | |
$key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000); | |
if (is_object($debug_object)) { | |
$debug_object->debug_log(2, 'key is: ' . $key); | |
} | |
$idx = ($remove_tag) ? 0 : 1; // 0 = entire match, 1 = submatch | |
$this->noise[$key] = $matches[$i][$idx][0]; | |
$this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0])); | |
} | |
// reset the length of content | |
$this->size = strlen($this->doc); | |
if ($this->size > 0) { | |
$this->char = $this->doc[0]; | |
} | |
} | |
function restore_noise($text) | |
{ | |
global $debug_object; | |
if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } | |
while (($pos = strpos($text, '___noise___')) !== false) { | |
// Sometimes there is a broken piece of markup, and we don't GET the | |
// pos+11 etc... token which indicates a problem outside of us... | |
// todo: "___noise___1000" (or any number with four or more digits) | |
// in the DOM causes an infinite loop which could be utilized by | |
// malicious software | |
if (strlen($text) > $pos + 15) { | |
$key = '___noise___' | |
. $text[$pos + 11] | |
. $text[$pos + 12] | |
. $text[$pos + 13] | |
. $text[$pos + 14] | |
. $text[$pos + 15]; | |
if (is_object($debug_object)) { | |
$debug_object->debug_log(2, 'located key of: ' . $key); | |
} | |
if (isset($this->noise[$key])) { | |
$text = substr($text, 0, $pos) | |
. $this->noise[$key] | |
. substr($text, $pos + 16); | |
} else { | |
// do this to prevent an infinite loop. | |
$text = substr($text, 0, $pos) | |
. 'UNDEFINED NOISE FOR KEY: ' | |
. $key | |
. substr($text, $pos + 16); | |
} | |
} else { | |
// There is no valid key being given back to us... We must get | |
// rid of the ___noise___ or we will have a problem. | |
$text = substr($text, 0, $pos) | |
. 'NO NUMERIC NOISE KEY' | |
. substr($text, $pos + 11); | |
} | |
} | |
return $text; | |
} | |
function search_noise($text) | |
{ | |
global $debug_object; | |
if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } | |
foreach($this->noise as $noiseElement) { | |
if (strpos($noiseElement, $text) !== false) { | |
return $noiseElement; | |
} | |
} | |
} | |
function __toString() | |
{ | |
return $this->root->innertext(); | |
} | |
function __get($name) | |
{ | |
switch ($name) { | |
case 'outertext': | |
return $this->root->innertext(); | |
case 'innertext': | |
return $this->root->innertext(); | |
case 'plaintext': | |
return $this->root->text(); | |
case 'charset': | |
return $this->_charset; | |
case 'target_charset': | |
return $this->_target_charset; | |
} | |
} | |
function childNodes($idx = -1) | |
{ | |
return $this->root->childNodes($idx); | |
} | |
function firstChild() | |
{ | |
return $this->root->first_child(); | |
} | |
function lastChild() | |
{ | |
return $this->root->last_child(); | |
} | |
function createElement($name, $value = null) | |
{ | |
return @str_get_html("<$name>$value</$name>")->firstChild(); | |
} | |
function createTextNode($value) | |
{ | |
return @end(str_get_html($value)->nodes); | |
} | |
function getElementById($id) | |
{ | |
return $this->find("#$id", 0); | |
} | |
function getElementsById($id, $idx = null) | |
{ | |
return $this->find("#$id", $idx); | |
} | |
function getElementByTagName($name) | |
{ | |
return $this->find($name, 0); | |
} | |
function getElementsByTagName($name, $idx = -1) | |
{ | |
return $this->find($name, $idx); | |
} | |
function loadFile() | |
{ | |
$args = func_get_args(); | |
$this->load_file($args); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment