domdocument domxpath xpath #php

setup

// create the document, parse an html string into it and attach an xpath helper
// (all following snippets operate on these two variables)
$DOMDocument = new \DOMDocument();
$DOMDocument->loadHTML('<div>foo</div>');
$DOMXPath = new \DOMXPath($DOMDocument);

load html file

// read the whole file into a string and parse it
// note: loadHTML rejects an empty string, so the file has to exist and must not be empty
$DOMDocument->loadHTML(file_get_contents('tpl.html'));

load html file (with or without header)

// if the html source doesn't contain a valid charset declaration, domdocument interprets it as iso-8859-1
// we circumvent this in two steps:
//   1. every non-ascii character is converted to a numeric entity before parsing
//   2. a temporary <meta http-equiv="content-type" content="text/html;charset=utf-8" /> is injected to get
//      proper utf-8 output from saveHTML(); it is enclosed in <!--remove--> markers so it can be stripped afterwards
// warning: if you don't add a doctype/html tag, domdocument adds that information for you
// also if only a text node is provided, it is surrounded by a p-tag
// therefore fragments get a marked wrapper (data-please-remove-wrapper) that can be removed after saving
$html = file_get_contents('tpl.html');
if ($html === false) { throw new \RuntimeException('could not read tpl.html'); }
// mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8') is deprecated since php 8.2;
// mb_encode_numericentity encodes the same range of characters (everything above ascii)
$html = mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
// html tag names are case-insensitive, so are the checks below
$has_wrapper = stripos($html, '<html') !== false;
if ($has_wrapper === false) { $html = '<!DOCTYPE html><html data-please-remove-wrapper><body>' . $html . '</body></html>'; }
$meta = '<meta http-equiv="content-type" content="text/html;charset=utf-8" />';
if (stripos($html, '</head>') !== false) { $html = str_ireplace('</head>', '<!--remove-->' . $meta . '<!--/remove--></head>', $html); }
elseif (stripos($html, '<body') !== false) { $html = str_ireplace('<body', '<!--remove--><head>' . $meta . '</head><!--/remove--><body', $html); }
else { $html = '<!--remove--><head>' . $meta . '</head><!--/remove-->' . $html; }
// collect parser warnings (e.g. unknown html5 tags) internally instead of silencing every error with @
$libxml_previous = libxml_use_internal_errors(true);
$DOMDocument->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($libxml_previous);

get back html from domdocument

// domdocument does not close empty li tags (because they're valid html)
// to circumvent that, give every element without children an (empty) value:
$nodes = $DOMXPath->query('/html/body//*[not(node())]');
foreach($nodes as $nodes__value) { $nodes__value->nodeValue = ''; }
$html = $DOMDocument->saveHTML();
// domdocument converts all umlauts to html entities; html_entity_decode($html) would revert that,
// but this method is bad when we use intentionally encoded code e.g. in <pre> tags
// another option to prevent html entities (and leave everything intact)
// is to add <meta http-equiv="content-type" content="text/html;charset=utf-8" /> before loading
// warning: this still encodes < to &lt; because a bare < is invalid html!
// undo the changes made before loading: cut out everything between the two marker comments
$pos_open = mb_strpos($html, '<!--remove-->');
$pos_close = mb_strpos($html, '<!--/remove-->');
if ($pos_open !== false && $pos_close !== false) {
    $html = mb_substr($html, 0, $pos_open) . mb_substr($html, $pos_close + mb_strlen('<!--/remove-->'));
}
// if a default wrapper was added before loading, we squish that and keep only the body content
if (mb_stripos($html, 'data-please-remove-wrapper') !== false) {
    $pos1 = mb_strpos($html, '<body>');
    $pos2 = mb_strpos($html, '</body>');
    // only cut when both tags were found (mb_strpos returns false otherwise)
    if ($pos1 !== false && $pos2 !== false) {
        $pos1 += mb_strlen('<body>');
        $html = mb_substr($html, $pos1, $pos2 - $pos1);
    }
}

query nodes

// query() returns a DOMNodeList (or false if the expression is malformed) that can be iterated directly
$nodes = $DOMXPath->query('/html/body//*[@id="foo"]');
foreach($nodes as $nodes__value) {
    /* ... */
}

check length of query

$nodes = $DOMXPath->query('/html/body//*[@id="foo"]');
// variant 1: the length property of the DOMNodeList
if( $nodes->length > 0 ) {}
// variant 2: count() (DOMNodeList is Countable as of php 7.2)
if( count($nodes) > 0 ) {}

get first item

$nodes = $DOMXPath->query('/html/body//*[@id="foo"]');
// item() is the documented accessor of DOMNodeList (array access is not part of the documented api)
if( $nodes->length > 0 ) { $node = $nodes->item(0); }

types of selectors

node(): any node (including text nodes)
text(): text nodes
comment(): comment nodes
*: dom nodes
node()[normalize-space()]: any node (including text nodes) excluding whitespace-only text nodes (this also excludes elements without text content such as <br>)
text()[normalize-space()]: any text node excluding whitespace
/html/body//*|/html/body//text()[normalize-space()]: dom nodes and text nodes (without whitespace)

get all nodes (including text nodes)

// node() matches every node type below <body> (elements, text nodes, comments, ...)
$DOMXPath->query('/html/body//node()');

get all nodes (without text nodes)

// * matches element nodes only
$DOMXPath->query('/html/body//*');

get text nodes only

// text() matches text nodes, including those that consist of whitespace only
$DOMXPath->query('/html/body//text()');

class selector

// the normalized class list is padded with spaces so that " foo " only matches the whole class name
// (a plain contains(@class, "foo") would also match "foobar")
$DOMXPath->query('/html/body//*[contains(concat(" ", normalize-space(@class), " "), " foo ")]');

id selector

// compares the value of the id attribute exactly (case-sensitive)
$DOMXPath->query('/html/body//*[@id="root"]');

tag selector

// a bare name matches all elements with that tag name at any depth below <body>
$DOMXPath->query('/html/body//input');

multiple tag selector

// | builds the union of both node sets (each side has to be a complete path)
$DOMXPath->query('/html/body//input|/html/body//select');

tag selector

// same as the tag selector, but without xpath; searches the whole document and returns a DOMNodeList
$DOMDocument->getElementsByTagName('input');

attribute selector

// [@placeholder] matches when the attribute exists, whatever its value is
$DOMXPath->query('/html/body//input[@placeholder]');

attribute value selector

// matches links whose href attribute is exactly "#"
$DOMXPath->query('/html/body//a[@href="#"]');

attribute value selector

// matches links whose href attribute begins with "tel:" (inner quotes are escaped for the php string)
$DOMXPath->query('/html/body//a[starts-with(@href, \'tel:\')]');

attribute selector (key wildcard)

// selects every attribute node whose name begins with "data-" and steps up to the element that owns it
$DOMXPath->query('/html/body//@*[starts-with(name(), \'data-\')]/parent::*');

next sibling selector ".foo + .bar"

// ".foo + .bar" means: the element DIRECTLY after .foo on the same level, if it has the class bar
// following-sibling::*[1] picks exactly that element (following::* would match any later element in the document)
$DOMXPath->query('//*[contains(concat(" ", normalize-space(@class), " "), " foo ")]/following-sibling::*[1][contains(concat(" ", normalize-space(@class), " "), " bar ")]');

check if is text node

// variant 1: text nodes carry the fixed node name "#text"
if ('#text' === $node->nodeName) {}
// variant 2: compare the node type (XML_TEXT_NODE has the value 3)
if (XML_TEXT_NODE === $node->nodeType) {}

check if is dom/element node

// XML_ELEMENT_NODE has the value 1
if (XML_ELEMENT_NODE === $node->nodeType) {}

get tag name of node

// tagName only exists on element nodes (DOMElement); nodeName is available on every node type
$node->tagName

get/set content of text node

// difference: https://stackoverflow.com/questions/12380919/php-dom-textcontent-vs-nodevalue
$node->nodeValue
$node->textContent
// reading (this is important): if you fetch the content of a text node with nodeValue (or textContent, or an
// attribute with getAttribute), html entities are already resolved, e.g. &amp; comes back as &
// (what we usually don't want when the value is put back into markup)
// we use htmlentities (or the even weaker htmlspecialchars) to encode the value again
htmlentities($node->nodeValue)
htmlspecialchars($node->nodeValue)
// writing (this is important): domdocument stores a string assigned to a text node as plain text (and not html),
// so encoded html chars like &#39; are kept literally
// we therefore use the parent node and set the node value there (so that the encoded strings are properly set)
// note: this replaces the whole content of the parent node
$node->parentNode->nodeValue = 'That&#39;s cool';
// if you really want to use the text node, decode the string yourself first:
$node->nodeValue = html_entity_decode('That&#39;s cool', ENT_QUOTES | ENT_XML1, 'UTF-8');

get children of node (recursively)

// the second argument is the context node; .// selects all descendants of it (not only direct children)
$DOMXPath->query('.//node()', $node);

get children count of node (recursively)

// .//node() counts all descendants (./node() would only count the direct children)
// note: evaluate() returns the xpath number as float
$DOMXPath->evaluate('count(.//node())', $node);

get text siblings (including oneself if text node) node

// ./../text() = all text nodes that are children of $node's parent; [normalize-space()] drops whitespace-only ones
$DOMXPath->query('./../text()[normalize-space()]', $node);

get text siblings that are longer than 3 chars

// string-length() without argument measures the text node itself (surrounding whitespace is counted)
$DOMXPath->query('./../text()[normalize-space()][string-length() > 3]', $node);

get text siblings that are longer than 1 char (excluding whitespace)

// normalize-space(.) trims the text and collapses inner whitespace runs before the length is measured
$DOMXPath->query('./../text()[normalize-space()][string-length(normalize-space(.)) > 1]', $node);

get dom elements without content inside (empty tags)

// not(node()) = the element has no child nodes at all (which already implies that it has no text nodes)
$DOMXPath->query('/html/body//*[not(node())]');

get direct sibling of node

// the union collects all following element siblings and all following non-whitespace text siblings;
// (...)[1] then picks the first node of that union
// variant 1: relative to a context node
$DOMXPath->query('(./following-sibling::*|./following-sibling::text()[normalize-space()])[1]', $node);
// variant 2: as absolute path
$DOMXPath->query('(/html/body//*[@id="foo"]/following-sibling::*|/html/body//*[@id="foo"]/following-sibling::text()[normalize-space()])[1]');

get attributes of node beginning with "data-"

// ./@* selects the attribute nodes of $node; the predicate keeps those whose name begins with "data-"
$attrs = $DOMXPath->query('./@*[starts-with(name(),"data-")]', $node);
// query() returns false on a malformed expression; an empty DOMNodeList simply skips the loop
if ($attrs !== false) {
    foreach ($attrs as $attrs__value) {
        echo $attrs__value->nodeName;
        echo $attrs__value->nodeValue;
    }
}

get dom attribute

// returns the attribute value, or an empty string if the attribute does not exist
$node->getAttribute('foo');

set dom attribute

// creates the attribute or overwrites its current value
$node->setAttribute('foo','bar');

check if dom attribute exists

// true if the attribute exists (even with an empty value); useful because getAttribute returns '' in both cases
$node->hasAttribute('foo');

get unique id of node (this is very neat for comparing nodes etc)

$nodes = $DOMXPath->query('/html/body//node()');
foreach ($nodes as $nodes__value) {
    // the number of nodes counted by this expression (plus one) serves as the id of the node
    $id = intval($DOMXPath->evaluate('count(.//following::node()|.//child::node())',$nodes__value))+1;
}

get unique id of node (way faster)

$nodes = $DOMXPath->query('/html/body//node()');
foreach ($nodes as $nodes__value) {
    // getNodePath() returns the xpath location of the node, e.g. /html/body/div[2]/p[1]
    $id = $nodes__value->getNodePath();
}

add text node

// the text is added as plain text (special characters are escaped on output, markup is not parsed)
$parent->appendChild($DOMDocument->createTextNode('test'));

add / append child

// build <a href="https://tld.com"></a> (the element value defaults to an empty string) ...
$child = $DOMDocument->createElement('a');
$child->setAttribute('href', 'https://tld.com');
// ... and attach it as the last child of $parent
$parent->appendChild($child);

prepend child

// insertBefore appends when the reference node is null,
// so this also covers a parent without children (firstChild is null then)
$parent->insertBefore($child, $parent->firstChild);

insert before

// places $newNode directly in front of $node (a node that is already in the tree gets moved)
$node->parentNode->insertBefore($newNode, $node);

insert after

// there is no insertAfter: insert before the next sibling instead
// if $node is the last child, nextSibling is null and insertBefore appends
$node->parentNode->insertBefore($newNode, $node->nextSibling);

copy clone node

// true = deep copy (including all descendants); the copy is detached and has to be inserted somewhere
$node->cloneNode(true)

remove node

// detaches $node from the tree; the object itself stays usable (e.g. to insert it somewhere else)
$node->parentNode->removeChild($node);

get outer html of node

// copy the node (deep) into a fresh, otherwise empty document and serialize that document
$outer = new \DOMDocument();
$copy = $outer->importNode($node, true);
$outer->appendChild($copy);
echo $outer->saveHTML();

get inner html of node

// serialize every direct child with the owner document and glue the pieces together
$pieces = [];
foreach ($node->childNodes as $childNode) {
    $pieces[] = $node->ownerDocument->saveHTML($childNode);
}
$inner = implode('', $pieces);
return $inner;

set inner html of node

// step 1: drop the current content (removing the first child repeatedly avoids shifting indexes)
while ($node->firstChild !== null) {
    $node->removeChild($node->firstChild);
}
// step 2: parse the new content and append it
if ((string) $value !== '') {
    // collect libxml warnings internally instead of silencing every error with @
    $libxml_previous = libxml_use_internal_errors(true);
    // first attempt: well-formed xml can be appended directly as a document fragment
    $f = $node->ownerDocument->createDocumentFragment();
    if ($f->appendXML($value)) {
        if ($f->hasChildNodes()) {
            $node->appendChild($f);
        }
    } else {
        // fallback for markup that is valid html but not well-formed xml (e.g. <br> or &nbsp;):
        // parse it with the html parser inside a helper tag and import its children
        $f = new \DOMDocument();
        // mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8') is deprecated since php 8.2;
        // mb_encode_numericentity encodes the same range of characters (everything above ascii)
        $value = mb_encode_numericentity($value, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
        if ($f->loadHTML('<htmlfragment>' . $value . '</htmlfragment>')) {
            $import = $f->getElementsByTagName('htmlfragment')->item(0);
            if ($import !== null) {
                foreach ($import->childNodes as $child) {
                    // nodes of another document have to be imported before they can be appended
                    $importedNode = $node->ownerDocument->importNode($child, true);
                    $node->appendChild($importedNode);
                }
            }
        }
    }
    libxml_clear_errors();
    libxml_use_internal_errors($libxml_previous);
}

string to single node

$DOMDocument = new \DOMDocument(); // master dom document (needed for reference)
// mb_convert_encoding($str, 'HTML-ENTITIES', 'UTF-8') is deprecated since php 8.2;
// mb_encode_numericentity encodes the same range of characters (everything above ascii)
$str = mb_encode_numericentity($str, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
$tmp = new \DOMDocument();
// LIBXML_HTML_NOIMPLIED prevents the automatic <html><body> wrapper,
// so documentElement is the root element of the string (the string should have a single root element)
$tmp->loadHTML($str, LIBXML_HTML_NOIMPLIED);
// import the parsed element (deep) so that it belongs to the master document
$node = $DOMDocument->importNode($tmp->documentElement,true);

replace node with string

$str = '<strong>String that replaces the node</strong>';
$tmp = new \DOMDocument();
// mb_convert_encoding($str, 'HTML-ENTITIES', 'UTF-8') is deprecated since php 8.2;
// mb_encode_numericentity encodes the same range of characters (everything above ascii)
$str = mb_encode_numericentity($str, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
// LIBXML_HTML_NOIMPLIED prevents the automatic <html><body> wrapper, so documentElement is the <strong> element
$tmp->loadHTML($str, LIBXML_HTML_NOIMPLIED);
// the parsed element has to be imported into the target document before it can replace $node
$repl = $DOMDocument->importNode($tmp->documentElement, true);
$node->parentNode->replaceChild($repl, $node);

load xml

$DOMDocument = new \DOMDocument();
// collect parser errors internally instead of silencing every error with @
$libxml_previous = libxml_use_internal_errors(true);
// $html holds the xml source here
$DOMDocument->loadXML($html);
libxml_clear_errors();
libxml_use_internal_errors($libxml_previous);
$DOMXPath = new \DOMXPath($DOMDocument);

write xml

// serializes the whole document as xml string (including the xml declaration)
$html = $DOMDocument->saveXML();

if domdocument is from xml

// NOTE(review): presumably xmlVersion is empty for documents loaded via loadHTML() - confirm
if($DOMDocument->xmlVersion != '') {}

search in all namespaces

$DOMXPath->query('//loc'); // this does not work, if the <loc> nodes are inside a so-called namespace (<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">)
// local-name() ignores namespace and prefix (name() would include the prefix, e.g. "sm:loc", and fail for prefixed nodes)
$DOMXPath->query('//*[local-name()=\'loc\']'); // this works in all namespaces

vielhuber/README.MD

setup

load html file

load html file (with or without header)

get back html from domdocument

query nodes

check length of query

get first item

types of selectors

get all nodes (including text nodes)

get all nodes (without text nodes)

get text nodes only

class selector

id selector

tag selector

multiple tag selector

tag selector

attribute selector

attribute value selector

attribute value selector

attribute selector (key wildcard)

next sibling selector ".foo + .bar"

check if is text node

check if is dom/element node

get tag name of node

get/set content of text node

get children of node (recursively)

get children count of node (recursively)

get text siblings (including oneself if text node) node

get text siblings that are longer than 3 chars

get text siblings that are longer than 1 char (excluding whitespace)

get dom elements without content inside (empty tags)

get direct sibling of node

get attributes of node beginning with "data-"

get dom attribute

set dom attribute

check if dom attribute exists

get unique id of node (this is very neat for comparing nodes etc)

get unique id of node (way faster)

add text node

add / append child

prepend child

insert before

insert after

copy clone node

remove node

get outer html of node

get inner html of node

set inner html of node

string to single node

replace node with string

load xml

write xml

if domdocument is from xml

search in all namespaces