Skip to content

Instantly share code, notes, and snippets.

@gmazzap
Created February 18, 2018 20:08
Show Gist options
  • Save gmazzap/754d7aea0b01c362b737c1788ab2e947 to your computer and use it in GitHub Desktop.
Save gmazzap/754d7aea0b01c362b737c1788ab2e947 to your computer and use it in GitHub Desktop.
A basic, PHP 5.3+ compatible (it relies on `new static`), HTML parser.
<?php
/**
* Brain_HtmlTokenizer class file.
*
* (c) Giuseppe Mazzapica
*
* @license http://opensource.org/licenses/MIT MIT
* @author Giuseppe Mazzapica <[email protected]>
*/
/**
 * A basic, PHP 5.3+ compatible, HTML parser.
 *
 * (PHP 5.3 is required because the named constructor uses late static binding.)
 *
 * The markup is split on "<" and ">" into a flat list of tokens, each of which remembers the
 * stack of open tags it was found in. A few chainable methods then whitelist or blacklist tags
 * and attributes, and `__toString()` serializes whatever is left back to HTML.
 *
 * Known limitations of this simple approach:
 *  - tag and attribute names are compared case-sensitively, exactly as written in the markup;
 *  - "<" or ">" inside attribute values are not supported;
 *  - a "<" in text that does not start a tag is dropped from the output;
 *  - runs of whitespace inside a tag (attribute values included) are collapsed to one space.
 */
class Brain_HtmlTokenizer {

	/**
	 * Parsed HTML tokens.
	 *
	 * Every token is an array with the keys:
	 *  - 'tag':        array( name, type ), type being "open", "close" or "self-close";
	 *                  array( null, null ) for text tokens.
	 *  - 'attributes': name => value map (`true` for flags), null for text tokens.
	 *  - 'content':    the text of a text token, '' for tag tokens.
	 *  - 'parents':    names of the tags that were open when the token was found.
	 *
	 * @var array[]
	 */
	private $tokens = array();

	/**
	 * HTML void elements: they never have a closing tag, so they must not be tracked as a
	 * parent of the tokens that follow them.
	 *
	 * @var string[]
	 */
	private static $void_elements = array(
		'area',
		'base',
		'br',
		'col',
		'embed',
		'hr',
		'img',
		'input',
		'link',
		'meta',
		'param',
		'source',
		'track',
		'wbr',
	);

	/**
	 * Named constructor, creates an instance from given HTML string.
	 *
	 * Anything that is not a non-empty string produces an instance without tokens. The string
	 * "0" is valid content and is tokenized like any other text.
	 *
	 * @param string $html HTML string to create the instance from.
	 *
	 * @return Brain_HtmlTokenizer New instance.
	 */
	public static function load_html( $html ) {
		$instance = new static();
		if ( is_string( $html ) && '' !== $html ) {
			$instance->tokens = $instance->tokenize( $html );
		}

		return $instance;
	}

	/**
	 * Private on purpose, use the named constructor.
	 *
	 * @param array[] $tokens HTML tokens.
	 */
	private function __construct( array $tokens = array() ) {
		$this->tokens = $tokens;
	}

	/**
	 * Return the (maybe) processed HTML string.
	 *
	 * @return string Processed HTML string.
	 */
	public function __toString() {
		return implode( '', array_map( array( $this, 'serialize_token' ), $this->tokens ) );
	}

	/**
	 * Strips given HTML tags, together with everything nested inside them.
	 *
	 * @param string[] $tags HTML tags to remove.
	 *
	 * @return Brain_HtmlTokenizer Instance without given tags.
	 */
	public function strip_tags( array $tags ) {
		if ( ! $tags ) {
			return $this;
		}

		$tokens = array();
		foreach ( $this->tokens as $i => $token ) {
			list( $tag ) = $token['tag'];

			// The token is one of the tags to strip.
			if ( $tag && in_array( $tag, $tags, true ) ) {
				continue;
			}

			// The token (tag or text) lives inside one of the tags to strip.
			if ( array_intersect( $token['parents'], $tags ) ) {
				continue;
			}

			$tokens[ $i ] = $token;
		}

		$this->tokens = $tokens;

		return $this;
	}

	/**
	 * Keep only given HTML tags and remove all the others, together with their content.
	 *
	 * @param string[] $tags HTML tags to keep.
	 *
	 * @return Brain_HtmlTokenizer Instance with only given tags.
	 */
	public function keep_tags( array $tags ) {
		$tokens = array();
		foreach ( $this->tokens as $i => $token ) {
			// At least one ancestor is not allowed: the whole subtree goes away.
			if ( array_diff( $token['parents'], $tags ) ) {
				continue;
			}

			// Text is kept whenever all its ancestors are; tags must be allowed themselves.
			list( $tag ) = $token['tag'];
			if ( ! $tag || in_array( $tag, $tags, true ) ) {
				$tokens[ $i ] = $token;
			}
		}

		$this->tokens = $tokens;

		return $this;
	}

	/**
	 * Strips given attributes from given HTML tags (or all of them).
	 *
	 * @param array      $attributes Attributes to remove.
	 * @param array|null $tags       HTML tags to remove attributes from. When null (default),
	 *                               means "all of them".
	 *
	 * @return Brain_HtmlTokenizer Instance without given attributes in given tags.
	 */
	public function strip_attributes( array $attributes, array $tags = null ) {
		$attributes = $this->clean_names( $attributes );
		if ( ! $attributes || array() === $tags ) {
			return $this;
		}

		$blacklist = array_fill_keys( $attributes, '' );
		foreach ( $this->tokens as $i => $token ) {
			if ( $this->has_target_attributes( $token, $tags ) ) {
				$this->tokens[ $i ]['attributes'] = array_diff_key( $token['attributes'], $blacklist );
			}
		}

		return $this;
	}

	/**
	 * Keep only given attributes in given HTML tags and remove all the others.
	 *
	 * When no valid attribute name is given, every attribute of the targeted tags is removed.
	 *
	 * @param array      $attributes Attributes to keep.
	 * @param array|null $tags       HTML tags to keep attributes for. When null (default),
	 *                               means "all of them".
	 *
	 * @return Brain_HtmlTokenizer Instance with only given attributes in given tags.
	 */
	public function keep_attributes( array $attributes, array $tags = null ) {
		if ( array() === $tags ) {
			return $this;
		}

		// An empty whitelist makes array_intersect_key() return an empty array.
		$whitelist = array_fill_keys( $this->clean_names( $attributes ), '' );
		foreach ( $this->tokens as $i => $token ) {
			if ( $this->has_target_attributes( $token, $tags ) ) {
				$this->tokens[ $i ]['attributes'] = array_intersect_key(
					$token['attributes'],
					$whitelist
				);
			}
		}

		return $this;
	}

	/**
	 * Reduce a list of names to its non-empty string items.
	 *
	 * @param array $names Raw list of names.
	 *
	 * @return string[]
	 */
	private function clean_names( array $names ) {
		return array_filter( array_filter( array_values( $names ), 'is_string' ) );
	}

	/**
	 * Tell whether a token is a tag that has attributes and is targeted by given tag list.
	 *
	 * @param array      $token HTML token.
	 * @param array|null $tags  Targeted tag names, or null to target every tag.
	 *
	 * @return bool
	 */
	private function has_target_attributes( array $token, $tags ) {
		list( $tag ) = $token['tag'];
		if ( ! $tag || ! $token['attributes'] ) {
			return false;
		}

		return ! is_array( $tags ) || in_array( $tag, $tags, true );
	}

	/**
	 * Process the given HTML string and build a stack of tokens.
	 *
	 * @param string $html HTML string to process.
	 *
	 * @return array[] Array of tokens, each is an array with keys: 'tag', 'attributes',
	 *                 'content' and 'parents'.
	 */
	private function tokenize( $html ) {
		$tokens  = array();
		$parents = array();
		$parts   = explode( '<', $html );

		// Whatever precedes the first "<" is plain text, even when it contains a ">".
		$leading = array_shift( $parts );
		if ( '' !== $leading ) {
			$tokens[] = $this->no_tag_token( $leading, $parents );
		}

		foreach ( $parts as $part ) {
			// Two consecutive "<": nothing to tokenize in between.
			if ( '' === $part ) {
				continue;
			}

			$token_parts = explode( '>', $part, 2 );
			if ( 1 === count( $token_parts ) ) {
				// No ">" means the "<" did not open a tag: treat the chunk as text.
				$tokens[] = $this->no_tag_token( $part, $parents );
				continue;
			}

			list( $tag_name, $tag_type, $attributes ) = $this->tokenize_tag( $token_parts[0] );
			$content = $token_parts[1];

			// A nameless tag ("<>") is dropped and must not pollute the parents stack.
			if ( '' !== $tag_name ) {
				if ( $parents && 'close' === $tag_type && end( $parents ) === $tag_name ) {
					array_pop( $parents );
				}

				$tokens[] = array(
					'tag'        => array( $tag_name, $tag_type ),
					'attributes' => 'close' !== $tag_type
						? $this->tokenize_attributes( $attributes )
						: array(),
					'content'    => '',
					'parents'    => $parents,
				);

				// Void elements are never closed, so they can't be anyone's parent.
				$is_void = in_array( strtolower( $tag_name ), self::$void_elements, true );
				if ( 'open' === $tag_type && ! $is_void ) {
					$parents[] = $tag_name;
				}
			}

			// Strict comparison: the text "0" is valid content.
			if ( '' !== $content ) {
				$tokens[] = $this->no_tag_token( $content, $parents );
			}
		}

		return $tokens;
	}

	/**
	 * Creates a token for a string that does not belong to any tag.
	 *
	 * @param string $content HTML chunk content.
	 * @param array  $parents Stack of the HTML tags the chunk is nested in.
	 *
	 * @return array
	 */
	private function no_tag_token( $content, array $parents ) {
		return array(
			'tag'        => array( null, null ),
			'attributes' => null,
			'content'    => $content,
			'parents'    => $parents,
		);
	}

	/**
	 * Process given HTML tag string and return tag name, tag type (open, close, self-close) and
	 * the "raw" attributes.
	 *
	 * @param string $tag Piece of HTML that contains one HTML tag, without the wrapping "<" and
	 *                    ">".
	 *
	 * @return string[] A 3 items array:
	 *                  - Tag name, e.g. "div"
	 *                  - Tag type, can be "open", "close", and "self-close"
	 *                  - Not parsed attributes string
	 */
	private function tokenize_tag( $tag ) {
		$self_close = '/' === substr( $tag, -1 );
		if ( $self_close ) {
			$tag = (string) substr( $tag, 0, -1 );
		}

		$split      = preg_split( '~\s+~', trim( $tag ), 2 );
		$tag_name   = $split[0];
		$attributes = isset( $split[1] ) ? $split[1] : '';

		if ( $self_close ) {
			$tag_type = 'self-close';
		} else {
			$tag_type = '/' === substr( $tag_name, 0, 1 ) ? 'close' : 'open';
		}

		return array( trim( $tag_name, '/' ), $tag_type, $attributes );
	}

	/**
	 * Process given HTML tag attributes string and return the attributes in array form.
	 *
	 * Supports flags (`disabled`), unquoted values (`id=foo`), single or double quoted values,
	 * empty values (`alt=""`) and whitespace around the equal sign. When an attribute is
	 * repeated the last occurrence wins.
	 *
	 * @param string $attr HTML tag attributes string to process.
	 *
	 * @return array Keys are attribute names, values are attribute values (`true` for flags).
	 */
	private function tokenize_attributes( $attr ) {
		$attr = trim( preg_replace( '~\s+~', ' ', (string) $attr ) );
		if ( '' === $attr ) {
			return array();
		}

		// 1: attribute name; 2: raw value, quotes included (not set at all for flags).
		$pattern = '~([^\s=]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]*))?~';
		if ( ! preg_match_all( $pattern, $attr, $matches, PREG_SET_ORDER ) ) {
			return array();
		}

		$tokens = array();
		foreach ( $matches as $match ) {
			$tokens[ $match[1] ] = isset( $match[2] ) ? $this->strip_quote( $match[2] ) : true;
		}

		return $tokens;
	}

	/**
	 * Remove wrapping quotes from attribute value, if any.
	 *
	 * Also ensures the value is suitable to be wrapped in double quotes when serialized: any
	 * double quote left in the value is encoded as `&quot;`. A backslash is not an escape
	 * character in HTML, so `\"` would still terminate the attribute.
	 *
	 * @param string $attribute_value Attribute value.
	 *
	 * @return string Attribute value without wrapping quotes.
	 */
	private function strip_quote( $attribute_value ) {
		$value = (string) $attribute_value;

		// A lone quote character is not a wrapped value.
		if ( strlen( $value ) >= 2 ) {
			$first = substr( $value, 0, 1 );
			$last  = substr( $value, -1 );
			if ( $first === $last && in_array( $first, array( '"', '\'' ), true ) ) {
				$value = (string) substr( $value, 1, -1 );
			}
		}

		return str_replace( '"', '&quot;', $value );
	}

	/**
	 * Take a processed token in array form and serialize back to string.
	 *
	 * @param array $token HTML token.
	 *
	 * @return string
	 */
	private function serialize_token( array $token ) {
		list( $tag, $type ) = $token['tag'];
		$content            = $token['content'];

		if ( ! $tag ) {
			return $content;
		}

		if ( 'close' === $type ) {
			return "</{$tag}>{$content}";
		}

		$string = "<{$tag}";
		foreach ( $token['attributes'] as $name => $value ) {
			// Flags are printed without a value.
			$string .= true === $value ? " {$name}" : " {$name}=\"{$value}\"";
		}

		$string .= 'self-close' === $type ? '/>' : '>';

		return $string . $content;
	}
}
@gmazzap
Copy link
Author

gmazzap commented Feb 18, 2018

The class takes an HTML string and tokenizes it.

A few methods are available to process the generated tokens and either whitelist or blacklist tags and attributes.
The __toString() method can be called to obtain the (maybe) processed HTML string.

For example, assuming the following HTML:

<div class="wrapper" id="0">
    <p id="1">Paragraph <strong>bold</strong> normal.</p>
    <p id="2"><strong>bold</strong><span id="3" class="foo">A span!</span></p>
    <input id="4" type="hidden" value="x" />
    <em id="5" class="italic">Italic</em>
    <script src="http://example.com/example.js" />
</div>

And the following code:

$processed = Brain_HtmlTokenizer::load_html( $html )  // HTML from above
    ->keep_tags( array( 'div', 'p', 'strong', 'em' ) )
    ->strip_attributes( array( 'class' ), array( 'div' ) )
    ->strip_attributes( array( 'id' ), array( 'p' ) )
    ->keep_attributes( array( 'class' ), array( 'em' ) )
    ->__toString();

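The $processed variable will then be equal to the following (the whitespace-only lines left behind where the input and script elements were removed are omitted here for readability):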

<div id="0">
    <p>Paragraph <strong>bold</strong> normal.</p>
    <p><strong>bold</strong></p>
    <em class="italic">Italic</em>
</div>

@gmazzap
Copy link
Author

gmazzap commented Feb 18, 2018

Another example could be the implementation of genesis_strip_attr function from Genesis framework. It would be:

/**
 * Removes the given attributes from the given tags of an HTML string.
 *
 * @param string       $text       HTML string to process.
 * @param string|array $tags       Tag name(s) to remove attributes from.
 * @param string|array $attributes Attribute name(s) to remove.
 *
 * @return string Processed HTML string.
 */
function genesis_strip_attr( $text, $tags, $attributes ) {

    // strip_attributes() takes the attributes first and the tags second.
    return (string) Brain_HtmlTokenizer::load_html( $text )->strip_attributes( (array) $attributes, (array) $tags );
}

@GaryJones
Copy link

Could this be amended to also include a way to add attributes to tags?

@gmazzap
Copy link
Author

gmazzap commented Feb 19, 2018

@GaryJones yes, sure, it's not hard.

It would be something like this:

/**
 * Adds (or overwrites) attributes on the given HTML tags.
 *
 * Closing tags are left alone, and values that are neither a string nor `true` are ignored.
 *
 * @param  array $attributes Associative array, where keys are attributes keys and values attributes values.
 *                           For flags, use `true` as value.
 * @param  array $tags       HTML tags to add the attributes to. When empty (default) nothing is changed.
 *
 * @return Brain_HtmlTokenizer Instance with given attributes added to given tags.
 */
public function add_attributes( array $attributes, array $tags = array() ) {

    if ( ! $tags || array() === $attributes ) {
        return $this;
    }

    $tokens = array();
    foreach ( $this->tokens as $i => $token ) {
        list( $tag, $type ) = $token['tag'];

        if ( $tag && 'close' !== $type && in_array( $tag, $tags, true ) ) {
            foreach ( $attributes as $name => $value ) {
                // Write to the token: $tag is just the tag name string.
                if ( true === $value ) {
                    $token['attributes'][ $name ] = true;
                } elseif ( is_string( $value ) ) {
                    $token['attributes'][ $name ] = esc_attr( $value );
                }
            }
        }

        $tokens[ $i ] = $token;
    }

    $this->tokens = $tokens;

    return $this;
}

Note that this is completely untested, typed directly in this comment, and, because of esc_attr, coupled to WordPress (unlike the rest of the class).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment