Skip to content

Instantly share code, notes, and snippets.

@dmsnell
Last active August 9, 2024 17:55
Show Gist options
  • Save dmsnell/8e36bd2957c3834a62d39c2f306afaa6 to your computer and use it in GitHub Desktop.
Save dmsnell/8e36bd2957c3834a62d39c2f306afaa6 to your computer and use it in GitHub Desktop.
Extract the HTML within P tags using the HTML API.
<?php
/**
* Place this file into a WordPress directory with a proper config and run.
* It doesn't need the config, but it needs to load all of the HTML API and
* Token Map modules. This can be done by replacing the `require_once` with
* a `require_once` for each of the appropriate files.
*
* - With no arguments it extracts and prints the paragraph contents of the built-in sample HTML.
* - If the last argument is a dash (-), it reads the HTML from stdin.
*
* Example:
*
* curl 'https://wordpress.org' -L | php extract-paragraphs.php -
*/
require_once __DIR__ . '/src/wp-load.php';
// Built-in sample document; used unless HTML is supplied on stdin.
$html = <<<'HTML'
<div>
<div>
<p>Hello, world!</p>
</div>
<div>
<p>Look, <a href="https://example.com">a link!</a></p>
</div>
</div>
HTML;

// A trailing dash (-) argument switches the input over to stdin.
if ( '-' === $argv[ $argc - 1 ] ) {
	$stdin_html = file_get_contents( 'php://stdin' );

	// file_get_contents() yields false when the stream cannot be read;
	// stop here instead of passing a non-string on as the document.
	if ( false === $stdin_html ) {
		fwrite( STDERR, "Failed to read HTML from stdin.\n" );
		exit( 1 );
	}

	$html = $stdin_html;
}
/*
 * Walk every token in the document and rebuild the markup found inside
 * HTML P elements. Each paragraph's inner HTML is appended to $extracted
 * and followed by a blank line; everything outside a P is skipped.
 */
$processor  = WP_HTML_Processor::create_full_parser( $html, 'UTF-8' );
$extracted  = '';
$extracting = false;

// Explicit flags keep escaping identical across PHP versions: quotes are
// always encoded and invalid UTF-8 is substituted rather than blanking the string.
$escape_flags = ENT_QUOTES | ENT_SUBSTITUTE;

// Elements whose inner text arrives with the opening tag, so their closing
// tag has to be written out here as part of the same token.
$self_contained_tags = array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' );

// Of the self-contained elements, these hold text in which character
// references are decoded when read, so it must be re-encoded when written.
// The others hold raw text that has to be copied verbatim.
$decoded_text_tags = array( 'TEXTAREA', 'TITLE' );

while ( $processor->next_token() ) {
	// P tags themselves are never copied; they only toggle extraction.
	if ( 'P' === $processor->get_token_name() && 'html' === $processor->get_namespace() ) {
		$extracting = ! $processor->is_tag_closer();
		if ( ! $extracting ) {
			// Separate consecutive paragraphs with a blank line.
			$extracted .= "\n\n";
		}
		continue;
	}

	if ( ! $extracting ) {
		continue;
	}

	// Copy text and tags but ignore comments and other syntax.
	switch ( $processor->get_token_type() ) {
		case '#text':
			$extracted .= htmlspecialchars( $processor->get_modifiable_text(), $escape_flags, 'UTF-8' );
			break;

		case '#tag':
			$tag_name = $processor->get_qualified_tag_name();

			if ( $processor->is_tag_closer() ) {
				$extracted .= "</{$tag_name}>";
				break;
			}

			$extracted .= "<{$tag_name}";
			foreach ( $processor->get_attribute_names_with_prefix( '' ) ?? array() as $attribute_name ) {
				$value      = $processor->get_attribute( $attribute_name );
				$extracted .= " {$processor->get_qualified_attribute_name( $attribute_name )}";

				// Non-string values are written as bare attribute names.
				// String values are fully escaped: encoding only the double
				// quote would let a literal "&" be re-read as a character reference.
				if ( is_string( $value ) ) {
					$extracted .= '="' . htmlspecialchars( $value, $escape_flags, 'UTF-8' ) . '"';
				}
			}

			$text              = $processor->get_modifiable_text();
			$is_self_contained = 'html' === $processor->get_namespace() && in_array( $tag_name, $self_contained_tags, true );

			if ( $is_self_contained && in_array( $tag_name, $decoded_text_tags, true ) ) {
				$text = htmlspecialchars( $text, $escape_flags, 'UTF-8' );
			}

			$extracted .= ">{$text}";

			// Self-contained elements need a closing. Append it; assigning here
			// would throw away everything extracted up to this point.
			if ( $is_self_contained ) {
				$extracted .= "</{$tag_name}>";
			}
			break;
	}
}
// Print whatever was extracted, even if parsing stopped early.
echo $extracted;

// Report why the parser stopped, when it did not reach the end of the document.
$last_error = $processor->get_last_error();
if ( null !== $last_error ) {
	// An exception object is only available for some kinds of failure;
	// fall back to the error code so that reporting can never itself crash.
	$unsupported = $processor->get_unsupported_exception();
	$reason      = null !== $unsupported ? $unsupported->getMessage() : $last_error;

	echo "\n\nFailed to fully parse HTML: {$reason}\n";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment