Skip to content

Instantly share code, notes, and snippets.

@gmazzap
Created February 18, 2018 20:08
Show Gist options
  • Save gmazzap/754d7aea0b01c362b737c1788ab2e947 to your computer and use it in GitHub Desktop.
Save gmazzap/754d7aea0b01c362b737c1788ab2e947 to your computer and use it in GitHub Desktop.
A basic, PHP 5.2+ compatible, HTML parser.
<?php
/**
* Brain_HtmlTokenizer class file.
*
* (c) Giuseppe Mazzapica
*
* @license http://opensource.org/licenses/MIT MIT
* @author Giuseppe Mazzapica <[email protected]>
*/
/**
* A basic, PHP 5.2+ compatible, HTML parser.
*/
class Brain_HtmlTokenizer {

    /**
     * Tag names that never have children, so they must not be pushed on the parents stack.
     *
     * Keys are lower-case tag names, used for constant-time lookup with isset().
     *
     * @var array
     */
    private static $void_elements = array(
        'area'   => true,
        'base'   => true,
        'br'     => true,
        'col'    => true,
        'embed'  => true,
        'hr'     => true,
        'img'    => true,
        'input'  => true,
        'link'   => true,
        'meta'   => true,
        'param'  => true,
        'source' => true,
        'track'  => true,
        'wbr'    => true,
    );

    /**
     * Parsed HTML tokens.
     *
     * Each token is an array with keys:
     * - 'tag':        array( name, type ); type is 'open', 'close' or 'self-close'. Both items
     *                 are null for text chunks.
     * - 'attributes': array of attribute name => value (true for flags), null for text chunks.
     * - 'content':    text of the chunk, empty string for tag tokens.
     * - 'parents':    names of the open tags the token is nested in, outermost first.
     *
     * @var array[]
     */
    private $tokens = array();

    /**
     * Named constructor, creates an instance from given HTML string.
     *
     * Anything that is not a non-empty string produces an instance with no tokens.
     *
     * @param string $html HTML string to create instance from.
     *
     * @return Brain_HtmlTokenizer New Instance.
     */
    public static function load_html( $html ) {
        $instance = new static();

        // Compare with '' explicitly: a loose falsy check would also discard the valid input "0".
        if ( is_string( $html ) && '' !== $html ) {
            $instance->tokens = $instance->tokenize( $html );
        }

        return $instance;
    }

    /**
     * Private on purpose, use named constructor.
     *
     * @param array[] $tokens HTML tokens.
     */
    private function __construct( array $tokens = array() ) {
        $this->tokens = $tokens;
    }

    /**
     * Return (maybe) processed HTML string.
     *
     * @return string Processed HTML string.
     */
    public function __toString() {
        return implode( '', array_map( array( $this, 'serialize_token' ), $this->tokens ) );
    }

    /**
     * Strips given HTML tags, together with everything nested inside them.
     *
     * @param string[] $tags HTML tags to remove.
     *
     * @return Brain_HtmlTokenizer Instance without given tags.
     */
    public function strip_tags( array $tags ) {
        if ( ! $tags ) {
            return $this;
        }

        $tokens = array();
        foreach ( $this->tokens as $i => $token ) {
            list( $tag ) = $token['tag'];

            // The token is one of the tags to strip (either its opening or its closing part).
            if ( $tag && in_array( $tag, $tags, true ) ) {
                continue;
            }

            // The token (tag or text) lives inside a tag that is being stripped.
            if ( array_intersect( $token['parents'], $tags ) ) {
                continue;
            }

            $tokens[ $i ] = $token;
        }

        $this->tokens = $tokens;

        return $this;
    }

    /**
     * Keep only given HTML tags and remove all the others.
     *
     * Text that is not nested in any tag is always kept; text or tags nested (at any depth) in a
     * tag that is not in the given list are removed.
     *
     * @param string[] $tags HTML tags to keep.
     *
     * @return Brain_HtmlTokenizer Instance with only given tags.
     */
    public function keep_tags( array $tags ) {
        $tokens = array();
        foreach ( $this->tokens as $i => $token ) {
            // At least one ancestor is not allowed: drop the token.
            if ( array_diff( $token['parents'], $tags ) ) {
                continue;
            }

            list( $tag ) = $token['tag'];

            // Text chunks are judged by their closest parent tag.
            if ( ! $tag && $token['parents'] ) {
                $tag = end( $token['parents'] );
            }

            if ( ! $tag || in_array( $tag, $tags, true ) ) {
                $tokens[ $i ] = $token;
            }
        }

        $this->tokens = $tokens;

        return $this;
    }

    /**
     * Strips given attributes from given HTML tags (or all of them).
     *
     * @param array      $attributes Attributes to remove.
     * @param array|null $tags       HTML tags to remove attributes from. When null (default),
     *                               means "all of them".
     *
     * @return Brain_HtmlTokenizer Instance without given attributes in given tags.
     */
    public function strip_attributes( array $attributes, array $tags = null ) {
        if ( array() === $tags ) {
            return $this;
        }

        $attributes = array_filter( array_filter( array_values( $attributes ), 'is_string' ) );
        if ( ! $attributes ) {
            return $this;
        }

        // Built once: it does not depend on the token being processed.
        $attributes_blacklist = array_fill_keys( $attributes, '' );

        $tokens = array();
        foreach ( $this->tokens as $i => $token ) {
            if ( $this->is_attributes_target( $token, $tags ) ) {
                $token['attributes'] = array_diff_key( $token['attributes'], $attributes_blacklist );
            }
            $tokens[ $i ] = $token;
        }

        $this->tokens = $tokens;

        return $this;
    }

    /**
     * Keep only given attributes in given HTML tags and remove all the others.
     *
     * When no valid attribute name is given, all the attributes of targeted tags are removed.
     *
     * @param array      $attributes Attributes to keep.
     * @param array|null $tags       HTML tags to keep attributes for. When null (default), means
     *                               "all of them".
     *
     * @return Brain_HtmlTokenizer Instance with only given attributes in given tags.
     */
    public function keep_attributes( array $attributes, array $tags = null ) {
        if ( array() === $tags ) {
            return $this;
        }

        $attributes = array_filter( array_filter( array_values( $attributes ), 'is_string' ) );

        // Built once: it does not depend on the token being processed.
        $attributes_whitelist = $attributes ? array_fill_keys( $attributes, '' ) : array();

        $tokens = array();
        foreach ( $this->tokens as $i => $token ) {
            if ( $this->is_attributes_target( $token, $tags ) ) {
                $token['attributes'] = array_intersect_key(
                    $token['attributes'],
                    $attributes_whitelist
                );
            }
            $tokens[ $i ] = $token;
        }

        $this->tokens = $tokens;

        return $this;
    }

    /**
     * Tells whether attributes of given token have to be processed.
     *
     * @param array      $token HTML token.
     * @param array|null $tags  Targeted HTML tags, null means "all of them".
     *
     * @return bool True when the token is a tag with attributes and is among targeted tags.
     */
    private function is_attributes_target( array $token, $tags ) {
        list( $tag ) = $token['tag'];
        if ( ! $tag || ! $token['attributes'] ) {
            return false;
        }

        return ! is_array( $tags ) || in_array( $tag, $tags, true );
    }

    /**
     * Process the given HTML string and build a stack of tokens.
     *
     * @param string $html HTML string to process.
     *
     * @return array[] Array of tokens, each is an array with keys: 'tag', 'attributes',
     *                 'content' and 'parents'.
     */
    private function tokenize( $html ) {
        $tokens  = array();
        $parents = array();

        foreach ( explode( '<', $html ) as $i => $chunk ) {
            // The first chunk precedes any "<", so it is always plain text, even if it has a ">".
            if ( 0 === $i ) {
                if ( '' !== $chunk ) {
                    $tokens[] = $this->no_tag_token( $chunk, $parents );
                }
                continue;
            }

            $token_parts = explode( '>', $chunk, 2 );

            // A "<" that is never closed is not a tag: keep it as text, escaped so that it can
            // not turn into markup when the output is placed next to other HTML.
            if ( 1 === count( $token_parts ) ) {
                $tokens[] = $this->no_tag_token( '&lt;' . $chunk, $parents );
                continue;
            }

            list( $tag_name, $tag_type, $attributes ) = $this->tokenize_tag( $token_parts[0] );

            // A closing tag is a sibling of its content, so leave the tag before storing it.
            if ( $parents && 'close' === $tag_type && end( $parents ) === $tag_name ) {
                array_pop( $parents );
            }

            $tokens[] = array(
                'tag'        => array( $tag_name, $tag_type ),
                'attributes' => 'close' !== $tag_type
                    ? $this->tokenize_attributes( $attributes )
                    : array(),
                'content'    => '',
                'parents'    => $parents,
            );

            // Tags that can not have children are never closed, pushing them on the stack would
            // wrongly mark everything that follows as nested in them.
            if ( 'open' === $tag_type && ! $this->is_childless( $tag_name ) ) {
                $parents[] = $tag_name;
            }

            // Compare with '' explicitly, or the text "0" would be lost.
            if ( '' !== $token_parts[1] ) {
                $tokens[] = $this->no_tag_token( $token_parts[1], $parents );
            }
        }

        return $tokens;
    }

    /**
     * Tells whether given tag can not contain other tokens.
     *
     * That is the case for void elements (e.g. "br", "img"), for declarations, comments and
     * processing instructions (names starting with "!" or "?") and for empty names.
     *
     * @param string $tag_name Tag name.
     *
     * @return bool
     */
    private function is_childless( $tag_name ) {
        if ( '' === $tag_name ) {
            return true;
        }

        $first = substr( $tag_name, 0, 1 );
        if ( '!' === $first || '?' === $first ) {
            return true;
        }

        return isset( self::$void_elements[ strtolower( $tag_name ) ] );
    }

    /**
     * Creates a token for a string that does not belong to any tag.
     *
     * @param string $content HTML chunk content.
     * @param array  $parents HTML tags stack HTML chunk content belongs to.
     *
     * @return array
     */
    private function no_tag_token( $content, array $parents ) {
        return array(
            'tag'        => array( null, null ),
            'attributes' => null,
            'content'    => $content,
            'parents'    => $parents,
        );
    }

    /**
     * Process given HTML tag string and return tag name, tag type (open, close, self-close) and
     * the "raw" attributes.
     *
     * @param string $tag Piece of HTML that contains one HTML tag, without "<" and ">".
     *
     * @return string[] A 3 items array:
     *                  - Tag name, e.g. "div"
     *                  - Tag type, can be "open", "close", and "self-close"
     *                  - Not parsed attributes string
     */
    private function tokenize_tag( $tag ) {
        $self_close = substr( $tag, -1, 1 ) === '/';
        if ( $self_close ) {
            $tag = substr( $tag, 0, -1 );
        }

        // Tag name is everything before the first whitespace, the rest are attributes.
        $split      = preg_split( '~\s+~', trim( $tag ), 2 );
        $tag_name   = $split[0];
        $attributes = isset( $split[1] ) ? $split[1] : '';

        if ( $self_close ) {
            $tag_type = 'self-close';
        } else {
            $tag_type = substr( $tag_name, 0, 1 ) === '/' ? 'close' : 'open';
        }

        return array( trim( $tag_name, '/' ), $tag_type, $attributes );
    }

    /**
     * Process given HTML tag attributes string and return an array of processed attributes in
     * array form.
     *
     * Supports flags (`hidden`), double quoted, single quoted and unquoted values, empty values
     * (`class=""`) and whitespace around the equal sign. Values are stored without wrapping
     * quotes and with double quotes converted to `&quot;`, so that they are always safe to be
     * wrapped in double quotes when serialized.
     *
     * @param string $attr HTML tag attributes string to process.
     *
     * @return array Parsed attributes, keys are attributes names, values are attributes values,
     *               `true` for flags.
     */
    private function tokenize_attributes( $attr ) {
        $tokens = array();
        $attr   = trim( $attr );
        if ( '' === $attr ) {
            return $tokens;
        }

        // Groups: 1 name, 2 double quoted value, 3 single quoted value, 4 unquoted value.
        $pattern = '~([^\s=]+)(?:\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s]+)))?~';
        if ( ! preg_match_all( $pattern, $attr, $matches, PREG_SET_ORDER ) ) {
            return $tokens;
        }

        foreach ( $matches as $match ) {
            $groups = count( $match );

            // Only the name matched: the attribute is a flag.
            if ( 2 === $groups ) {
                $tokens[ $match[1] ] = true;
                continue;
            }

            // Trailing unmatched groups are not returned, so the last item is the matched value.
            $tokens[ $match[1] ] = str_replace( '"', '&quot;', $match[ $groups - 1 ] );
        }

        return $tokens;
    }

    /**
     * Take a processed token in array form and serialize back to string.
     *
     * @param array $token HTML token.
     *
     * @return string
     */
    private function serialize_token( array $token ) {
        list( $tag, $type ) = $token['tag'];
        $content            = $token['content'];

        if ( ! $tag ) {
            return $content;
        }

        if ( 'close' === $type ) {
            return "</{$tag}>{$content}";
        }

        $string = "<{$tag}";
        foreach ( $token['attributes'] as $name => $value ) {
            // Flags have no value.
            $string .= true === $value ? " {$name}" : " {$name}=\"{$value}\"";
        }

        return 'self-close' === $type ? "{$string}/>{$content}" : "{$string}>{$content}";
    }
}
@gmazzap
Copy link
Author

gmazzap commented Feb 18, 2018

Another example could be an implementation of the genesis_strip_attr function from the Genesis framework. It would be:

/**
 * Removes the given attributes from the given tags of an HTML string.
 *
 * @param string       $text       HTML string to process.
 * @param string|array $tags       Tag name(s) to remove the attributes from.
 * @param string|array $attributes Attribute name(s) to remove.
 *
 * @return string Processed HTML string.
 */
function genesis_strip_attr( $text, $tags, $attributes ) {

    // strip_attributes() expects the attributes first and the tags second.
    return (string) Brain_HtmlTokenizer::load_html( $text )->strip_attributes( (array) $attributes, (array) $tags );
}

@GaryJones
Copy link

Could this be amended to also include a way to add attributes to tags?

@gmazzap
Copy link
Author

gmazzap commented Feb 19, 2018

@GaryJones yes, sure, it's not hard.

It would be something like this:

/**
 * Adds (or overwrites) given attributes in given HTML tags.
 *
 * Values that are neither strings nor `true` are ignored. String values are escaped with
 * `esc_attr()`, which makes this method depend on WordPress.
 *
 * @param array $attributes Associative array, where keys are attribute names and values are attribute values.
 *                          For flags, use `true` as value.
 * @param array $tags       HTML tags to add attributes to. When empty (default), nothing is changed.
 *
 * @return Brain_HtmlTokenizer Instance with given attributes added to given tags.
 */
public function add_attributes( array $attributes, array $tags = array() ) {

    if ( ! $tags || array() === $attributes ) {
        return $this;
    }

    $tokens = array();
    foreach ( $this->tokens as $i => $token ) {
        list( $tag, $type ) = $token['tag'];

        // Text chunks, closing tags and non-targeted tags are left untouched.
        if ( ! $tag || 'close' === $type || ! in_array( $tag, $tags, true ) ) {
            $tokens[ $i ] = $token;
            continue;
        }

        foreach ( $attributes as $name => $value ) {
            if ( is_string( $value ) || true === $value ) {
                // Attributes belong to the token, `$tag` is just the tag name string.
                $token['attributes'][ $name ] = true === $value ? $value : esc_attr( $value );
            }
        }

        $tokens[ $i ] = $token;
    }

    $this->tokens = $tokens;

    return $this;
}

Note that this is completely untested, was typed directly in this comment, and, because of esc_attr, is coupled to WordPress (unlike the rest of the class).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment