Skip to content

Instantly share code, notes, and snippets.

@akkunchoi
Created October 6, 2012 00:55
Show Gist options
  • Select an option

  • Save akkunchoi/3843298 to your computer and use it in GitHub Desktop.

Select an option

Save akkunchoi/3843298 to your computer and use it in GitHub Desktop.
create DOM through Tidy
<?php
// Command-line entry point: read the HTML file named by the first argument,
// build a DOM from it via dom(), and dump the text content of every <a> element.
if (!isset($argv[1])) {
    // Without a path there is nothing to parse; fail loudly instead of
    // feeding an undefined value to file_get_contents().
    fwrite(STDERR, "Usage: php {$argv[0]} <html-file>\n");
    exit(1);
}

$html = file_get_contents($argv[1]);
if ($html === false) {
    // file_get_contents() signals failure with false; do not pass that on to Tidy.
    fwrite(STDERR, "Could not read file: {$argv[1]}\n");
    exit(1);
}

$dom = dom($html);
$xpath = new DOMXPath($dom);
foreach ($xpath->query('//a') as $e) {
    var_dump($e->nodeValue);
}
/**
 * Build a DOMDocument from HTML by cleaning it with Tidy and loading the
 * result as XML.
 *
 * Tidy is configured to emit XML with numeric entities, so the cleaned markup
 * can be handed to DOMDocument::loadXML() rather than loadHTML().
 *
 * @param string $html Raw HTML markup (a full page or a fragment).
 * @return DOMDocument The parsed document. If libxml rejects the cleaned
 *                     markup, loadXML() emits its warnings and the document
 *                     is returned as-is (best effort).
 * @throws RuntimeException If Tidy fails to parse the input.
 */
function dom($html){
    $config = array(
        'output-xml' => true,
        'numeric-entities' => true,
    );
    $tidy = tidy_parse_string($html, $config, 'UTF8');
    if ($tidy === false) {
        // Calling methods on false would be a fatal error; report it clearly.
        throw new RuntimeException('Tidy could not parse the given HTML.');
    }
    $tidy->cleanRepair();
    $xml = tidy_get_output($tidy);

    // Rewrite "<![ ... ]>" sections as XML comments; left as-is they trigger
    // "DOMDocument::loadXML(): StartTag: invalid element name in Entity".
    $rewritten = preg_replace('/\<\!\[(.*?)\]\>/um', '<!--[\1]-->', $xml);
    if ($rewritten !== null) {
        // preg_replace() returns null on a PCRE error (e.g. invalid UTF-8 with
        // the /u flag); in that case keep the un-rewritten markup.
        $xml = $rewritten;
    }

    // Drop any XML declaration already present in the Tidy output so that the
    // one added below is the only one; a second declaration is a parse error.
    $withoutDecl = preg_replace('/\A\s*<\?xml\s.*?\?>\s*/is', '', $xml);
    if ($withoutDecl !== null) {
        $xml = $withoutDecl;
    }

    // An explicit UTF-8 declaration makes loadXML() keep multibyte strings
    // intact even when only partial HTML was given.
    $xml = '<?xml version="1.0" encoding="UTF-8"?>' . "\n" . $xml;

    $doc = new DOMDocument();
    $doc->loadXML($xml);
    return $doc;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment