Skip to content

Instantly share code, notes, and snippets.

@1naveengiri
Created November 15, 2018 14:46
Show Gist options
  • Save 1naveengiri/28f8bb96cdd441e7f98f913dd0ac0589 to your computer and use it in GitHub Desktop.
Save 1naveengiri/28f8bb96cdd441e7f98f913dd0ac0589 to your computer and use it in GitHub Desktop.
Extract content from an HTML document using XPath
<?php
/**
 * Extract the nodes matching an XPath expression from an HTML string.
 *
 * The markup is parsed with DOMDocument, the expression is evaluated with
 * DOMXPath, and every matching node is deep-copied into a fresh document
 * which is then serialized with saveHTML().
 *
 * @param string $html       HTML markup to search.
 * @param string $expression XPath expression, e.g. //div[@id='breadcrumbs']/span
 * @return string Serialized HTML of the matching nodes, or the translated
 *                'No match found.' message when the markup is empty, the
 *                expression is invalid, or nothing matches.
 */
function extract_content_from_html( $html, $expression ) {
    $output  = '';
    $matched = false;

    // Empty markup cannot match anything, and DOMDocument::loadHTML() rejects
    // an empty string, so only parse when there is real content.
    if ( is_scalar( $html ) && '' !== trim( (string) $html ) ) {
        // Real-world HTML is rarely well formed: collect parser errors
        // internally instead of emitting warnings, then restore the caller's
        // libxml setting so this function has no lasting global side effect.
        $previous_setting = libxml_use_internal_errors( true );
        $dom              = new DOMDocument();
        $dom->loadHTML( (string) $html );
        libxml_clear_errors();
        libxml_use_internal_errors( $previous_setting );

        $xpath = new DOMXPath( $dom );
        $nodes = $xpath->query( $expression );

        // query() returns false for an invalid expression. A DOMNodeList is
        // an object and therefore never "empty", so test its length instead.
        if ( false !== $nodes && $nodes->length > 0 ) {
            $fragment = new DOMDocument();
            foreach ( $nodes as $node ) {
                // Nodes belong to their owner document; import a deep copy
                // before attaching it to the output document.
                $fragment->appendChild( $fragment->importNode( $node, true ) );
            }
            $output  = $fragment->saveHTML();
            $matched = true;
        }
    }

    if ( ! $matched ) {
        $output = __( 'No match found.', 'wpscraper' );
    }

    return $output;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment