dmsnell · August 9, 2024 17:55
diff --git a/extract-paragraphs.php b/extract-paragraphs.php
 <?php

 /**
 * Place this file into a WordPress directory with a proper config and run.
 * It doesn't need the config, but it needs to load all of the HTML API and
 * Token Map modules. This can be done by replacing the `require_once` with
 * a `require_once` for each of the appropriate files.
 *
 *  - With no arguments it extracts the paragraphs from the built-in sample HTML.
 *  - If the last argument is a dash (-), it reads the HTML from stdin.
 *
 * Example:
 *
 *     curl 'https://wordpress.org' -L | php extract-paragraphs.php -
 */

 require_once __DIR__ . '/src/wp-load.php';

 // Sample document that is processed when no input is piped in.
 $html = <<<'HTML'
 <div>
    <div>
        <p>Hello, world!</p>
    </div>
    <div>
        <p>Look, <a href="https://example.com">a link!</a></p>
    </div>
 </div>
 HTML;

 /*
  * A trailing dash (-) argument replaces the sample with whatever arrives on
  * stdin. `$argv` may be missing when the script is not started from the
  * command line, so fall back to an empty list instead of raising a warning.
  */
 $cli_arguments = $argv ?? array();
 if ( '-' === end( $cli_arguments ) ) {
    $stdin_html = file_get_contents( 'php://stdin' );

    // `file_get_contents()` returns false on failure; do not parse that as HTML.
    if ( false === $stdin_html ) {
        fwrite( STDERR, "Failed to read HTML from stdin.\n" );
        exit( 1 );
    }

    $html = $stdin_html;
 }

 $processor = WP_HTML_Processor::create_full_parser( $html, 'UTF-8' );
 $extracted = '';

 /*
  * Elements treated as "self-contained": their text is read from the opening
  * tag token through `get_modifiable_text()`, so the matching closing tag has
  * to be written out together with the opener.
  */
 $self_contained_tags = array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' );

 // Set while the parser is positioned between an opening and a closing P tag.
 $extracting = false;
 while ( $processor->next_token() ) {
    // HTML `P` tags toggle extraction; the tags themselves are not copied.
    if ( 'P' === $processor->get_token_name() && 'html' === $processor->get_namespace() ) {
        $is_closer  = $processor->is_tag_closer();
        $extracting = ! $is_closer;
        if ( $is_closer ) {
            // Separate extracted paragraphs with a blank line.
            $extracted .= "\n\n";
        }
        continue;
    }

    if ( ! $extracting ) {
        continue;
    }

    // Copy text and tags but ignore comments and other syntax.
    switch ( $processor->get_token_type() ) {
        case '#text':
            $extracted .= htmlspecialchars( $processor->get_modifiable_text() );
            break;

        case '#tag':
            $tag_name = $processor->get_qualified_tag_name();

            if ( $processor->is_tag_closer() ) {
                $extracted .= "</{$tag_name}>";
                break;
            }

            $attributes = $processor->get_attribute_names_with_prefix( '' ) ?? array();
            $text       = $processor->get_modifiable_text();

            $extracted .= "<{$tag_name}";
            foreach ( $attributes as $attribute_name ) {
                $value      = $processor->get_attribute( $attribute_name );
                $extracted .= " {$processor->get_qualified_attribute_name( $attribute_name )}";

                // Only string values are serialized; anything else leaves the bare attribute name.
                if ( is_string( $value ) ) {
                    /*
                     * `get_attribute()` is documented to return the decoded value, so
                     * `&` has to be escaped along with `"`; otherwise character
                     * references in the value change meaning when the output is parsed
                     * again. `strtr()` replaces in a single pass, which keeps the `&`
                     * of an inserted `&quot;` from being escaped a second time.
                     */
                    $escaped_value = strtr(
                        $value,
                        array(
                            '&' => '&amp;',
                            '"' => '&quot;',
                        )
                    );

                    $extracted .= '="' . $escaped_value . '"';
                }
            }

            // Self-contained elements need a closing.
            $extracted .= ">{$text}";
            if ( 'html' === $processor->get_namespace() && in_array( $tag_name, $self_contained_tags, true ) ) {
                // Append here: a plain assignment would discard everything extracted so far.
                $extracted .= "</{$tag_name}>";
            }
            break;
    }
 }

 // Print whatever was extracted, even when parsing stopped early.
 echo $extracted;

 $last_error = $processor->get_last_error();
 if ( null !== $last_error ) {
    /*
     * Prefer the message of the unsupported-markup exception. When no exception
     * object is available, fall back to the error value itself instead of
     * calling a method on null.
     */
    $unsupported = $processor->get_unsupported_exception();
    $reason      = null !== $unsupported ? $unsupported->getMessage() : $last_error;

    echo "\n\nFailed to fully parse HTML: {$reason}\n";
 }
	<?php

	/**
	* Place this file into a WordPress directory with a proper config and run.
	* It doesn't need the config, but it needs to load all of the HTML API and
	* Token Map modules. This can be done by replacing the `require_once` with
	* a `require_once` for each of the appropriate files.
	*
	* - With no arguments it extracts the paragraphs from the built-in sample HTML.
	* - If the last argument is a dash (-), it reads the HTML from stdin.
	*
	* Example:
	*
	* curl 'https://wordpress.org' -L | php extract-paragraphs.php -
	*/

	require_once __DIR__ . '/src/wp-load.php';

	// Sample document that is processed when no input is piped in.
	$html = <<<'HTML'
	<div>
	<div>
	<p>Hello, world!</p>
	</div>
	<div>
	<p>Look, <a href="https://example.com">a link!</a></p>
	</div>
	</div>
	HTML;

	/*
	 * A trailing dash (-) argument replaces the sample with whatever arrives on
	 * stdin. `$argv` may be missing when the script is not started from the
	 * command line, so fall back to an empty list instead of raising a warning.
	 */
	$cli_arguments = $argv ?? array();
	if ( '-' === end( $cli_arguments ) ) {
		$stdin_html = file_get_contents( 'php://stdin' );

		// `file_get_contents()` returns false on failure; do not parse that as HTML.
		if ( false === $stdin_html ) {
			fwrite( STDERR, "Failed to read HTML from stdin.\n" );
			exit( 1 );
		}

		$html = $stdin_html;
	}

	$processor = WP_HTML_Processor::create_full_parser( $html, 'UTF-8' );
	$extracted = '';

	/*
	 * Elements treated as "self-contained": their text is read from the opening
	 * tag token through `get_modifiable_text()`, so the matching closing tag has
	 * to be written out together with the opener.
	 */
	$self_contained_tags = array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' );

	// Set while the parser is positioned between an opening and a closing P tag.
	$extracting = false;
	while ( $processor->next_token() ) {
		// HTML `P` tags toggle extraction; the tags themselves are not copied.
		if ( 'P' === $processor->get_token_name() && 'html' === $processor->get_namespace() ) {
			$is_closer  = $processor->is_tag_closer();
			$extracting = ! $is_closer;
			if ( $is_closer ) {
				// Separate extracted paragraphs with a blank line.
				$extracted .= "\n\n";
			}
			continue;
		}

		if ( ! $extracting ) {
			continue;
		}

		// Copy text and tags but ignore comments and other syntax.
		switch ( $processor->get_token_type() ) {
			case '#text':
				$extracted .= htmlspecialchars( $processor->get_modifiable_text() );
				break;

			case '#tag':
				$tag_name = $processor->get_qualified_tag_name();

				if ( $processor->is_tag_closer() ) {
					$extracted .= "</{$tag_name}>";
					break;
				}

				$attributes = $processor->get_attribute_names_with_prefix( '' ) ?? array();
				$text       = $processor->get_modifiable_text();

				$extracted .= "<{$tag_name}";
				foreach ( $attributes as $attribute_name ) {
					$value      = $processor->get_attribute( $attribute_name );
					$extracted .= " {$processor->get_qualified_attribute_name( $attribute_name )}";

					// Only string values are serialized; anything else leaves the bare attribute name.
					if ( is_string( $value ) ) {
						/*
						 * `get_attribute()` is documented to return the decoded value, so
						 * `&` has to be escaped along with `"`; otherwise character
						 * references in the value change meaning when the output is parsed
						 * again. `strtr()` replaces in a single pass, which keeps the `&`
						 * of an inserted `&quot;` from being escaped a second time.
						 */
						$escaped_value = strtr(
							$value,
							array(
								'&' => '&amp;',
								'"' => '&quot;',
							)
						);

						$extracted .= '="' . $escaped_value . '"';
					}
				}

				// Self-contained elements need a closing.
				$extracted .= ">{$text}";
				if ( 'html' === $processor->get_namespace() && in_array( $tag_name, $self_contained_tags, true ) ) {
					// Append here: a plain assignment would discard everything extracted so far.
					$extracted .= "</{$tag_name}>";
				}
				break;
		}
	}

	// Print whatever was extracted, even when parsing stopped early.
	echo $extracted;

	$last_error = $processor->get_last_error();
	if ( null !== $last_error ) {
		/*
		 * Prefer the message of the unsupported-markup exception. When no exception
		 * object is available, fall back to the error value itself instead of
		 * calling a method on null.
		 */
		$unsupported = $processor->get_unsupported_exception();
		$reason      = null !== $unsupported ? $unsupported->getMessage() : $last_error;

		echo "\n\nFailed to fully parse HTML: {$reason}\n";
	}