You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
// Load an HTML template into DOMDocument without losing its UTF-8 characters.
//
// if the html source doesn't contain a valid utf8 header, domdocument interprets it as iso
// we circumvent this with mb_convert_encoding
// warning: if you don't add a doctype/html tag, domdocument adds that information for you
// also if only a text node is provided, it is surrounded by a p-tag
// we also add <meta http-equiv="content-type" content="text/html;charset=utf-8" /> to get proper encoding (see below)
$html = file_get_contents('tpl.html');
// NOTE(review): the 'HTML-ENTITIES' target of mb_convert_encoding() is deprecated as of PHP 8.2;
// consider mb_encode_numericentity() when upgrading
$html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');

// no <html> tag present: wrap the fragment ourselves and tag the wrapper with a marker
// attribute (data-please-remove-wrapper) so it can be recognised again later
$has_wrapper = strpos($html, '<html') !== false;
if ($has_wrapper === false) {
    $html = '<!DOCTYPE html><html data-please-remove-wrapper><body>' . $html . '</body></html>';
}

// inject the charset meta tag, enclosed in <!--remove--> ... <!--/remove--> markers,
// at the first position that fits: before </head>, before <body, or at the very start
if (mb_strpos($html, '</head>') !== false) {
    $html = str_replace('</head>', '<!--remove--><meta http-equiv="Content-type" content="text/html; charset=utf-8" /><!--/remove--></head>', $html);
} elseif (mb_strpos($html, '<body') !== false) {
    $html = str_replace('<body', '<!--remove--><head><meta http-equiv="content-type" content="text/html;charset=utf-8" /></head><!--/remove--><body', $html);
} else {
    $html = '<!--remove--><head><meta http-equiv="content-type" content="text/html;charset=utf-8" /></head><!--/remove-->' . $html;
}

// @ silences any warnings loadHTML() raises
// assumes $DOMDocument was created beforehand (new \DOMDocument()) - TODO confirm
@$DOMDocument->loadHTML($html);
get back html from domdocument
// Serialise the document back to an HTML string and undo the helper markup
// (<!--remove--> markers, data-please-remove-wrapper wrapper).
// assumes $DOMDocument and $DOMXPath already exist for the loaded document - TODO confirm

// domdocument does not close empty li tags (because they're valid html)
// to circumvent that, use:
$nodes = $DOMXPath->query('/html/body//*[not(node())]');
foreach ($nodes as $nodes__value) {
    $nodes__value->nodeValue = '';
}
$html = $DOMDocument->saveHTML();

// domdocument converts all umlauts to html entities, revert that
// $html = html_entity_decode($html); // this method is bad when we use intentionally encoded code e.g. in <pre> tags
// another option to prevent html entities (and leave everything intact)
// is to add <meta http-equiv="content-type" content="text/html;charset=utf-8" /> (see above)
// warning: this still encodes < to &lt; because < is invalid html!

// undo above changes: cut out everything from <!--remove--> up to and including <!--/remove-->
$remove_start = mb_strpos($html, '<!--remove-->');
$remove_end = mb_strpos($html, '<!--/remove-->');
if ($remove_start !== false && $remove_end !== false) {
    $html = mb_substr($html, 0, $remove_start) . mb_substr($html, $remove_end + mb_strlen('<!--/remove-->'));
}

// if a default wrapper was added previously (marked with data-please-remove-wrapper), we squish that
// and keep only what lies between <body> and </body>
if (mb_stripos($html, 'data-please-remove-wrapper') !== false) {
    $pos1 = mb_strpos($html, '<body>') + mb_strlen('<body>');
    $pos2 = mb_strpos($html, '</body>');
    $html = mb_substr($html, $pos1, $pos2 - $pos1);
}
// Reading and writing the text of a node ($node is assumed to be an existing DOM node).

// difference: https://stackoverflow.com/questions/12380919/php-dom-textcontent-vs-nodevalue
$node->nodeValue;
$node->textContent;

// reading (this is important): if you fetch the value of a text node with nodeValue (or even textContent) and also getAttribute
// the content is automatically encoded (what we usually don't want)
// NOTE(review): since htmlentities()/htmlspecialchars() are used to revert it, "decoded" is presumably meant - confirm
// we use htmlentities (or the even weaker htmlspecialchars) to revert that
htmlentities($node->nodeValue);
htmlspecialchars($node->nodeValue);

// writing (this is important): domdocument sets strings with encoded html chars for text nodes as plain text (and not html)
// we therefore use the parent node and set the node value accordingly (so that the encoded strings are properly set)
// NOTE(review): the apostrophe entity was restored here; the unescaped form "That's" broke the string literal
$node->parentNode->nodeValue = 'That&#039;s cool';

// if you really want to use the text node, you can do:
$node->nodeValue = html_entity_decode('That&#039;s cool', ENT_QUOTES | ENT_XML1, 'UTF-8');
get children of node (recursively)
// '.' starts at $node (passed as the context node); '//node()' then selects every descendant node of any type
$DOMXPath->query('.//node()', $node);
get children count of node (recursively)
// count every descendant of $node; './/node()' is recursive, whereas './node()' would only count direct children
$DOMXPath->evaluate('count(.//node())', $node);
get text siblings (including oneself if text node) node
// Turn an HTML string into a node that belongs to the master document.

// master dom document (needed for reference)
$DOMDocument = new \DOMDocument();
// assumes $str already holds the UTF-8 HTML fragment - TODO confirm
$str = mb_convert_encoding($str, 'HTML-ENTITIES', 'UTF-8');
// parse the fragment in a throwaway document; LIBXML_HTML_NOIMPLIED keeps libxml from adding
// implied <html>/<body> elements, so documentElement is the fragment's own root element
$tmp = new \DOMDocument();
$tmp->loadHTML($str, LIBXML_HTML_NOIMPLIED);
// deep-copy (second argument true) the parsed root into the master document
$node = $DOMDocument->importNode($tmp->documentElement, true);
replace node with string
// Swap $node for the markup in $str.
// Expects $DOMDocument (the document that owns $node) and $node to exist already.

// entity-encode the replacement markup before it is parsed
$str = mb_convert_encoding(
    '<strong>String that replaces the node</strong>',
    'HTML-ENTITIES',
    'UTF-8'
);

// parse it in a scratch document
$tmp = new \DOMDocument();
$tmp->loadHTML($str, LIBXML_HTML_NOIMPLIED);

// bring a deep copy of the parsed root into the target document, then replace the old node with it
$repl = $DOMDocument->importNode($tmp->documentElement, true);
$node->parentNode->replaceChild($repl, $node);
load xml
// Parse the XML string held in $html and prepare an XPath helper for it.
$DOMDocument = new \DOMDocument();
// @ silences any warnings loadXML() raises
@$DOMDocument->loadXML($html);
// xpath object bound to the freshly loaded document
$DOMXPath = new \DOMXPath($DOMDocument);
write xml
// serialise the document as XML (counterpart to saveHTML()) and store the string in $html
$html = $DOMDocument->saveXML();
if domdocument is from xml
// a non-empty xmlVersion is used as the sign that the document ($dom) came from xml;
// the loose != comparison also treats null as empty
if($dom->xmlVersion != '') {}
search in all namespaces
// this does not work, if the <loc> nodes are inside a so-called namespace (<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">)
$DOMXPath->query('//loc');
// this works in all namespaces: local-name() ignores both the namespace and any prefix
// (name() would return the prefixed name, e.g. "sm:loc", and miss prefixed elements)
$DOMXPath->query('//*[local-name()="loc"]');