Created
November 15, 2018 14:46
-
-
Save 1naveengiri/28f8bb96cdd441e7f98f913dd0ac0589 to your computer and use it in GitHub Desktop.
Extract content from an HTML document using XPath
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Extract the parts of an HTML document that match an XPath expression.
 *
 * The markup is parsed with DOMDocument, the expression is evaluated with
 * DOMXPath, and every matched node is copied (deeply) into a fresh document
 * that is then serialized back to an HTML string.
 *
 * @param string $html       HTML markup to search.
 * @param string $expression XPath expression, e.g. //div[@id='breadcrumbs']/span
 *
 * @return string Serialized HTML of all matched nodes, or the
 *                "No match found." message when the markup is empty, the
 *                expression is invalid, or nothing matches.
 */
function extract_content_from_html( $html, $expression ){
    // Use the translated message when a global __() helper is defined;
    // otherwise fall back to the plain string so the function still works
    // in a context where that helper has not been loaded.
    $no_match = function_exists( '__' )
        ? __( 'No match found.', 'wpscraper' )
        : 'No match found.';

    // DOMDocument::loadHTML() refuses an empty source (ValueError on PHP 8,
    // warning on older versions), so there is nothing to search here.
    if ( '' === trim( (string) $html ) ) {
        return $no_match;
    }

    $dom = new DOMDocument();

    // Collect parser complaints about sloppy real-world markup internally
    // instead of emitting warnings, and remember the previous setting so it
    // can be restored for the rest of the request.
    $previous_setting = libxml_use_internal_errors( true );
    $loaded           = $dom->loadHTML( $html );
    $matches          = false;
    if ( $loaded ) {
        $xpath   = new DOMXPath( $dom );
        $matches = $xpath->query( $expression );
    }
    libxml_clear_errors();
    libxml_use_internal_errors( $previous_setting );

    // query() returns false for an invalid expression and an (always truthy)
    // DOMNodeList object otherwise, so the list length must be checked
    // explicitly to detect "no matches".
    if ( false === $matches || 0 === $matches->length ) {
        return $no_match;
    }

    // Nodes belong to the document that created them; import a deep copy of
    // each match into a new document so only the matches get serialized.
    $fragment = new DOMDocument();
    foreach ( $matches as $node ) {
        $fragment->appendChild( $fragment->importNode( $node, true ) );
    }

    return $fragment->saveHTML();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment