Created
February 18, 2018 20:08
-
-
Save gmazzap/754d7aea0b01c362b737c1788ab2e947 to your computer and use it in GitHub Desktop.
A basic, PHP 5.2+ compatible, HTML parser.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Brain_HtmlTokenizer class file.
 *
 * (c) Giuseppe Mazzapica
 *
 * @license http://opensource.org/licenses/MIT MIT
 * @author Giuseppe Mazzapica <[email protected]>
 */
/**
 * A basic, PHP 5.2+ compatible, HTML parser.
 */
class Brain_HtmlTokenizer {

    /**
     * Lower-case names of HTML void elements.
     *
     * Void elements never have content nor an end tag, so they must never be pushed on the
     * stack of open parents while tokenizing.
     *
     * @var string[]
     */
    private static $void_tags = array(
        'area',
        'base',
        'br',
        'col',
        'command',
        'embed',
        'hr',
        'img',
        'input',
        'keygen',
        'link',
        'meta',
        'param',
        'source',
        'track',
        'wbr',
    );

    /**
     * Parsed HTML tokens.
     *
     * Each token is an array with keys:
     * - 'tag':        two items array, tag name and tag type ("open", "close", "self-close"),
     *                 both null for text tokens;
     * - 'attributes': map of attribute name to value (true for flags), null for text tokens;
     * - 'content':    text content, always an empty string for tag tokens;
     * - 'parents':    lower-case names of the tags open when the token was found.
     *
     * @var array[]
     */
    private $tokens = array();

    /**
     * Named constructor, creates an instance from given HTML string.
     *
     * Anything that is not a non-empty string produces an instance with no tokens. The string
     * "0" is valid content and is tokenized like any other text.
     *
     * @param string $html HTML string to create the instance from.
     *
     * @return Brain_HtmlTokenizer New Instance.
     */
    public static function load_html( $html ) {
        $instance = new static();

        if ( is_string( $html ) && '' !== $html ) {
            $instance->tokens = $instance->tokenize( $html );
        }

        return $instance;
    }

    /**
     * Private on purpose, use named constructor.
     *
     * @param array[] $tokens HTML tokens.
     */
    private function __construct( array $tokens = array() ) {
        $this->tokens = $tokens;
    }

    /**
     * Return (maybe) processed HTML string.
     *
     * @return string processed HTML string.
     */
    public function __toString() {
        return implode( '', array_map( array( $this, 'serialize_token' ), $this->tokens ) );
    }

    /**
     * Strips given HTML tags, together with everything nested inside them.
     *
     * Tag names are matched case-insensitively.
     *
     * @param string[] $tags HTML tags to remove.
     *
     * @return Brain_HtmlTokenizer Instance without given tags.
     */
    public function strip_tags( array $tags ) {
        $tags = $this->normalize_names( $tags );
        if ( ! $tags ) {
            return $this;
        }

        $tokens = array();
        foreach ( $this->tokens as $i => $token ) {
            list( $tag ) = $token['tag'];

            // The token is one of the tags to strip.
            if ( null !== $tag && in_array( strtolower( $tag ), $tags, true ) ) {
                continue;
            }

            // The token (text or tag) is nested inside one of the tags to strip.
            if ( array_intersect( $token['parents'], $tags ) ) {
                continue;
            }

            $tokens[ $i ] = $token;
        }

        $this->tokens = $tokens;

        return $this;
    }

    /**
     * Keep only given HTML tags and remove all the others.
     *
     * A token is kept only when all its ancestors are tags to keep and, for tag tokens, when
     * the tag itself is a tag to keep. Tag names are matched case-insensitively.
     *
     * @param string[] $tags HTML tags to keep.
     *
     * @return Brain_HtmlTokenizer Instance with only given tags.
     */
    public function keep_tags( array $tags ) {
        $tags   = $this->normalize_names( $tags );
        $tokens = array();

        foreach ( $this->tokens as $i => $token ) {
            // At least one ancestor is not in the list of tags to keep.
            if ( array_diff( $token['parents'], $tags ) ) {
                continue;
            }

            list( $tag ) = $token['tag'];
            if ( null === $tag || in_array( strtolower( $tag ), $tags, true ) ) {
                $tokens[ $i ] = $token;
            }
        }

        $this->tokens = $tokens;

        return $this;
    }

    /**
     * Strips given attributes from given HTML tags (or all of them).
     *
     * Attribute and tag names are matched case-insensitively.
     *
     * @param array      $attributes Attributes to remove.
     * @param array|null $tags       HTML tags to remove attributes from. When null (default),
     *                               means "all of them".
     *
     * @return Brain_HtmlTokenizer Instance without given attributes in given tags.
     */
    public function strip_attributes( array $attributes, array $tags = null ) {
        $attributes = $this->normalize_names( $attributes );
        if ( ! $attributes || array() === $tags ) {
            return $this;
        }

        return $this->filter_attributes( $attributes, $tags, false );
    }

    /**
     * Keep only given attributes in given HTML tags and remove all the others.
     *
     * When no valid attribute name is given, all the attributes of matching tags are removed.
     * Attribute and tag names are matched case-insensitively.
     *
     * @param array      $attributes Attributes to keep.
     * @param array|null $tags       HTML tags to keep attributes for. When null (default),
     *                               means "all of them".
     *
     * @return Brain_HtmlTokenizer Instance with only given attributes in given tags.
     */
    public function keep_attributes( array $attributes, array $tags = null ) {
        if ( array() === $tags ) {
            return $this;
        }

        return $this->filter_attributes( $this->normalize_names( $attributes ), $tags, true );
    }

    /**
     * Shared implementation of `strip_attributes()` and `keep_attributes()`.
     *
     * @param string[]   $names Lower-case attribute names to strip or keep.
     * @param array|null $tags  Tags to act on, null means "all of them".
     * @param bool       $keep  True to keep only listed attributes, false to remove them.
     *
     * @return Brain_HtmlTokenizer
     */
    private function filter_attributes( array $names, $tags, $keep ) {
        $lookup = $names ? array_fill_keys( $names, true ) : array();
        if ( null !== $tags ) {
            $tags = $this->normalize_names( $tags );
        }

        $tokens = array();
        foreach ( $this->tokens as $i => $token ) {
            list( $tag ) = $token['tag'];

            $skip = null === $tag
                || ! $token['attributes']
                || ( null !== $tags && ! in_array( strtolower( $tag ), $tags, true ) );

            if ( ! $skip ) {
                foreach ( array_keys( $token['attributes'] ) as $name ) {
                    $listed = isset( $lookup[ strtolower( (string) $name ) ] );
                    // Strip mode removes listed names, keep mode removes unlisted ones.
                    if ( $listed !== $keep ) {
                        unset( $token['attributes'][ $name ] );
                    }
                }
            }

            $tokens[ $i ] = $token;
        }

        $this->tokens = $tokens;

        return $this;
    }

    /**
     * Reduce a list of tag or attribute names to its non-empty strings, lower-cased.
     *
     * @param array $names Raw names, as received from the caller.
     *
     * @return string[]
     */
    private function normalize_names( array $names ) {
        $normalized = array();
        foreach ( $names as $name ) {
            if ( is_string( $name ) && '' !== $name ) {
                $normalized[] = strtolower( $name );
            }
        }

        return $normalized;
    }

    /**
     * Process the given HTML string and build a stack of token.
     *
     * The string is split on "<": the chunk before the first "<" is always plain text, any
     * following chunk is a tag (up to its first ">") optionally followed by text.
     *
     * Known limitation: a "<" that does not start a tag is dropped from the output. It is not
     * restored on purpose, because doing so would allow removed tags to leave behind pieces
     * that join into a new tag.
     *
     * @param string $html HTML string to process.
     *
     * @return array[] Array of tokens, each is an array with keys: 'tag', 'attributes',
     *                 'content' and 'parents'.
     */
    private function tokenize( $html ) {
        $tokens  = array();
        $parents = array();
        $parts   = explode( '<', $html );

        // Whatever comes before the first "<" can't be a tag, even if it contains ">".
        $leading_text = array_shift( $parts );
        if ( '' !== $leading_text ) {
            $tokens[] = $this->no_tag_token( $leading_text, $parents );
        }

        foreach ( $parts as $part ) {
            if ( '' === $part ) {
                continue;
            }

            $token_parts = explode( '>', $part, 2 );
            if ( 1 === count( $token_parts ) ) {
                $tokens[] = $this->no_tag_token( $part, $parents );
                continue;
            }

            list( $tag_name, $tag_type, $attributes ) = $this->tokenize_tag( $token_parts[0] );

            // Nameless tags, e.g. "<>", produce no tag token and are never used as parents.
            if ( '' !== $tag_name ) {
                $lower_name = strtolower( $tag_name );

                if ( 'close' === $tag_type ) {
                    $parents = $this->close_parent( $parents, $lower_name );
                }

                $tokens[] = array(
                    'tag'        => array( $tag_name, $tag_type ),
                    'attributes' => 'close' !== $tag_type
                        ? $this->tokenize_attributes( $attributes )
                        : array(),
                    'content'    => '',
                    'parents'    => $parents,
                );

                if ( 'open' === $tag_type && ! $this->is_void_tag( $lower_name ) ) {
                    $parents[] = $lower_name;
                }
            }

            $content = $token_parts[1];
            // Strict comparison: the text "0" is content that must not be lost.
            if ( '' !== $content ) {
                $tokens[] = $this->no_tag_token( $content, $parents );
            }
        }

        return $tokens;
    }

    /**
     * Remove a closed tag from the stack of open parents.
     *
     * The stack is unwound up to the nearest open tag with the given name, so that tags left
     * open in between (e.g. "<li>" without "</li>") are closed as well. A close tag that does
     * not match any open tag leaves the stack untouched.
     *
     * @param string[] $parents  Lower-case names of currently open tags.
     * @param string   $tag_name Lower-case name of the tag being closed.
     *
     * @return string[] Updated stack of open tags.
     */
    private function close_parent( array $parents, $tag_name ) {
        $positions = array_keys( $parents, $tag_name, true );
        if ( ! $positions ) {
            return $parents;
        }

        return array_slice( $parents, 0, end( $positions ) );
    }

    /**
     * Tell if a tag can never contain other tokens.
     *
     * That is the case for HTML void elements and for declarations, comments and processing
     * instructions (names starting with "!" or "?").
     *
     * @param string $tag_name Lower-case tag name.
     *
     * @return bool
     */
    private function is_void_tag( $tag_name ) {
        $first_char = substr( $tag_name, 0, 1 );

        return '!' === $first_char
            || '?' === $first_char
            || in_array( $tag_name, self::$void_tags, true );
    }

    /**
     * Creates a token for a string that does not belong to any tag.
     *
     * @param string $content HTML chunk content.
     * @param array  $parents HTML tags stack HTML chunk content belongs to.
     *
     * @return array
     */
    private function no_tag_token( $content, array $parents ) {
        return array(
            'tag'        => array( null, null ),
            'attributes' => null,
            'content'    => $content,
            'parents'    => $parents,
        );
    }

    /**
     * Process given HTML tag string and return tag name, tag type (open, close, self-close) and
     * the "raw" attributes.
     *
     * @param string $tag Piece of HTML that contains one HTML tag, without "<" and ">".
     *
     * @return string[] A 3 items array:
     *                  - Tag name, e.g. "div"
     *                  - Tag type, can be "open", "close", and "self-close"
     *                  - Not parsed attributes string
     */
    private function tokenize_tag( $tag ) {
        $self_close = '/' === substr( $tag, -1 );
        if ( $self_close ) {
            $tag = (string) substr( $tag, 0, -1 );
        }

        $split      = preg_split( '~\s+~', trim( $tag ), 2 );
        $tag_name   = $split[0];
        $attributes = isset( $split[1] ) ? $split[1] : '';

        if ( $self_close ) {
            $tag_type = 'self-close';
        } else {
            $tag_type = '/' === substr( $tag_name, 0, 1 ) ? 'close' : 'open';
        }

        return array( trim( $tag_name, '/' ), $tag_type, $attributes );
    }

    /**
     * Process given HTML tag attributes string and return an array of processed attributes in
     * array form.
     *
     * Whitespace runs are collapsed to a single space before parsing, also inside values.
     * Values can be double-quoted, single-quoted or unquoted, and can be empty. Attributes
     * with no value are flags and are stored as boolean true.
     *
     * @param string     $attr   HTML tag attributes string to process.
     * @param array|null $tokens Already parsed attributes to add to, keys are attributes
     *                           names, values attributes values.
     *
     * @return array
     */
    private function tokenize_attributes( $attr, array $tokens = null ) {
        if ( null === $tokens ) {
            $tokens = array();
        }

        $attr = trim( (string) preg_replace( '~\s+~', ' ', (string) $attr ) );
        if ( '' === $attr ) {
            return $tokens;
        }

        // Group 1 is the name, group 2 (only set when there is a "=") the raw value.
        $pattern = '~([^\s=]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]*))?~';
        if ( ! preg_match_all( $pattern, $attr, $matches, PREG_SET_ORDER ) ) {
            return $tokens;
        }

        foreach ( $matches as $match ) {
            $tokens[ $match[1] ] = isset( $match[2] ) ? $this->strip_quote( $match[2] ) : true;
        }

        return $tokens;
    }

    /**
     * Remove wrapping quotes from attributes value, if any.
     *
     * Also ensure the value is suitable to be wrapped in double quotes, by replacing any
     * double quote it contains with the `&quot;` entity. A backslash is not an escape
     * character in HTML, so it can't be used for that.
     *
     * @param string $attribute_value Attribute value.
     *
     * @return string Attribute value without wrapping quotes.
     */
    private function strip_quote( $attribute_value ) {
        $attribute_value = (string) $attribute_value;
        $length          = strlen( $attribute_value );

        // At least two characters are needed to have both an opening and a closing quote.
        if ( $length >= 2 ) {
            $first = $attribute_value[0];
            $last  = $attribute_value[ $length - 1 ];
            if ( $first === $last && ( '"' === $first || '\'' === $first ) ) {
                $attribute_value = (string) substr( $attribute_value, 1, -1 );
            }
        }

        return str_replace( '"', '&quot;', $attribute_value );
    }

    /**
     * Take a processed token in array form and serialize back to string.
     *
     * @param array $token HTML token.
     *
     * @return string
     */
    private function serialize_token( array $token ) {
        list( $tag, $type ) = $token['tag'];
        $content = $token['content'];

        if ( ! $tag ) {
            return $content;
        }

        if ( 'close' === $type ) {
            return "</{$tag}>{$content}";
        }

        $string = "<{$tag}";
        foreach ( $token['attributes'] as $name => $value ) {
            // Flags have no value.
            $string .= true === $value ? " {$name}" : " {$name}=\"{$value}\"";
        }

        return 'self-close' === $type ? "{$string}/>{$content}" : "{$string}>{$content}";
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@GaryJones yes, sure not hard.
It would be something like this:
Note that this is completely untested, typed directly in this comment, and, because of `esc_attr`, coupled to WordPress (unlike the rest of the class).