Last active
August 9, 2024 17:55
-
-
Save dmsnell/8e36bd2957c3834a62d39c2f306afaa6 to your computer and use it in GitHub Desktop.
Extract the HTML within P tags using the HTML API.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Extracts the HTML found inside P elements using the WordPress HTML API.
 *
 * Place this file into a WordPress directory with a proper config and run.
 * It doesn't need the config, but it needs to load all of the HTML API and
 * Token Map modules. This can be done by replacing the `require_once` with
 * a `require_once` for each of the appropriate files.
 *
 *  - With no arguments it processes the built-in sample HTML.
 *  - If the last argument is a dash (-), it reads the HTML from stdin.
 *
 * The contents of each paragraph are re-serialized token by token and the
 * paragraphs are separated by a blank line in the output.
 *
 * Example:
 *
 *     curl 'https://wordpress.org' -L | php extract-paragraphs.php -
 */

require_once __DIR__ . '/src/wp-load.php';

$html = <<<'HTML'
<div>
<div>
<p>Hello, world!</p>
</div>
<div>
<p>Look, <a href="https://example.com">a link!</a></p>
</div>
</div>
HTML;

if ( '-' === $argv[ $argc - 1 ] ) {
	$html = file_get_contents( 'php://stdin' );

	// Reading can fail; don't hand `false` to the parser.
	if ( false === $html ) {
		fwrite( STDERR, "Failed to read HTML from stdin.\n" );
		exit( 1 );
	}
}

$processor  = WP_HTML_Processor::create_full_parser( $html, 'UTF-8' );
$extracted  = '';
$extracting = false;

while ( $processor->next_token() ) {
	// P tags toggle extraction on (opener) and off (closer) but are not copied themselves.
	if ( 'P' === $processor->get_token_name() && 'html' === $processor->get_namespace() ) {
		$is_closer  = $processor->is_tag_closer();
		$extracting = ! $is_closer;

		// Separate consecutive paragraphs with a blank line.
		if ( $is_closer ) {
			$extracted .= "\n\n";
		}

		continue;
	}

	if ( ! $extracting ) {
		continue;
	}

	// Copy text and tags but ignore comments and other syntax.
	switch ( $processor->get_token_type() ) {
		case '#text':
			// The text is escaped on output so it cannot be read back as markup.
			$extracted .= htmlspecialchars( $processor->get_modifiable_text() );
			break;

		case '#tag':
			if ( $processor->is_tag_closer() ) {
				$extracted .= "</{$processor->get_qualified_tag_name()}>";
				break;
			}

			$tag_name   = $processor->get_qualified_tag_name();
			$attributes = $processor->get_attribute_names_with_prefix( '' ) ?? array();
			$text       = $processor->get_modifiable_text();

			$extracted .= "<{$tag_name}";
			foreach ( $attributes as $attribute_name ) {
				$value      = $processor->get_attribute( $attribute_name );
				$extracted .= " {$processor->get_qualified_attribute_name( $attribute_name )}";

				// Non-string values are emitted as bare attribute names; string
				// values are escaped so `"`, `&`, and `<` cannot terminate or
				// alter the double-quoted attribute.
				if ( is_string( $value ) ) {
					$extracted .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8' ) . '"';
				}
			}
			$extracted .= ">{$text}";

			// Self-contained elements need a closing: their contents were
			// appended above as `$text`, so the closing tag is appended here.
			if (
				'html' === $processor->get_namespace() &&
				in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true )
			) {
				$extracted .= "</{$tag_name}>";
			}
			break;
	}
}

echo $extracted;

$last_error = $processor->get_last_error();
if ( null !== $last_error ) {
	// NOTE(review): the unsupported-exception is presumably only set when the
	// parser bailed on unsupported markup; fall back to the error code otherwise.
	$exception = $processor->get_unsupported_exception();
	$reason    = null !== $exception ? $exception->getMessage() : $last_error;

	echo "\n\nFailed to fully parse HTML: {$reason}\n";
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment