Created
October 24, 2012 03:14
-
-
Save dantman/3943466 to your computer and use it in GitHub Desktop.
New templated skin parser attempt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Common base type for everything that can appear in a parsed template tree.
 *
 * Declares no state or behaviour itself; it is the type that
 * TemplateBlock::append() requires of its argument.
 */
class TemplateNode {}
/**
 * Tree node that TemplateParser::emit() appends for a 'text' token.
 *
 * Declares no members of its own.
 */
class TemplateText extends TemplateNode {}
/**
 * Tree node that TemplateParser::emit() appends for a 'cond' token.
 *
 * Declares no members of its own.
 */
class TemplateCondition extends TemplateNode {
}
/**
 * Tree node that TemplateParser::emit() appends for a 'func' token.
 *
 * Declares no members of its own.
 */
class TemplateFunction extends TemplateNode {
}
/**
 * Tree node that TemplateParser::emit() appends for a 'subst' token.
 *
 * Declares no members of its own.
 */
class TemplateSubstitution extends TemplateNode {
}
/**
 * A template node that owns an ordered list of child nodes.
 *
 * The list is private and this class exposes no way to read it back;
 * children can only be added through append().
 */
class TemplateBlock extends TemplateNode {
    /** @var SplDoublyLinkedList Child nodes, in the order they were appended. */
    private $childNodes;

    /**
     * Start with an empty child list.
     */
    public function __construct() {
        $this->childNodes = new SplDoublyLinkedList();
    }

    /**
     * Add a node after all children appended so far.
     *
     * @param TemplateNode $node Node to add.
     */
    public function append( TemplateNode $node ) {
        $this->childNodes->push( $node );
    }
}
/**
 * Block node created by TemplateParser::emit() for an opening tag.
 *
 * The two properties below used to be attached by emit() as undeclared dynamic
 * properties, which is deprecated as of PHP 8.2. They are declared here, and
 * kept public because emit() assigns them from outside the class.
 */
class TemplateElement extends TemplateBlock {
    /**
     * @var string|null Tag name as written in the source, or null for a
     *  "null" element (a tag that starts directly with an mw: attribute).
     */
    public $name;

    /**
     * @var string|null For a null element, the name of its first attribute;
     *  emit() compares it against the hint on the closing tag. Null for
     *  ordinary named elements.
     */
    public $nameHint;
}
/**
 * Root block of a parsed template.
 *
 * TemplateParser::parseDocument() creates one, uses it as the bottom entry of
 * its tag stack and returns it. Declares no members of its own.
 */
class TemplateDocument extends TemplateBlock {}
/**
 * Result of a successful regexp match made by an AParser at a known offset.
 *
 * NOTE: `match` is a reserved keyword as of PHP 8.0, so this class name only
 * parses on earlier PHP versions. Renaming it also requires updating the
 * `new Match( ... )` call in AParser::reMatch().
 */
class Match {
    /** @var AParser Parser that produced this match; used by rollback(). */
    private $owner;

    /** @var int Parser offset at which the match was made. */
    private $offset;

    /** @var array Capture groups as filled in by preg_match(); index 0 is the whole match. */
    private $groups;

    /**
     * @param AParser $parser Parser that produced the match.
     * @param int $start Parser offset at which the match was made.
     * @param array $matches Capture group array from preg_match().
     */
    public function __construct( $parser, $start, $matches ) {
        $this->owner = $parser;
        $this->offset = $start;
        $this->groups = $matches;
    }

    /**
     * Fetch a single capture group.
     *
     * @param int|string $id Group index or name.
     * @return string
     */
    public function group( $id ) {
        return $this->groups[$id];
    }

    /**
     * @return string The complete matched text (capture group 0).
     */
    public function text() {
        return $this->group( 0 );
    }

    /**
     * @return int Length of the matched text in bytes, as reported by strlen().
     */
    public function length() {
        $matched = $this->text();
        return strlen( $matched );
    }

    /**
     * Ask the owning parser to step back over this match.
     *
     * Passes the match length and the offset it started at to the parser's
     * rollback() method.
     */
    public function rollback() {
        $this->owner->rollback( $this->length(), $this->offset );
    }
}
/**
 * Marker wrapper around a delimited PCRE pattern string.
 *
 * AParser::makeRegexp() uses the wrapper to tell a regular expression apart
 * from a plain string, which it would quote and match literally.
 */
class Regexp {
    /** @var string The pattern exactly as passed to the constructor. */
    private $pattern;

    /**
     * @param string $source Delimited pattern, e.g. '/[a-z]+/i'.
     */
    public function __construct( $source ) {
        $this->pattern = $source;
    }

    /**
     * @return string The pattern given to the constructor, unchanged.
     */
    public function source() {
        return $this->pattern;
    }
}
/**
 * Small cursor-based string parser that concrete grammars extend.
 *
 * Holds the input string plus a byte offset into it, and provides helpers that
 * test for, or consume, a literal string or a Regexp anchored at that offset.
 *
 * @todo A generic parseOne( $list ) dispatcher, which would try each
 *  'token' . ucfirst( $name ) method in turn and return the first result that
 *  is not false, was sketched here in commented-out form but never enabled.
 */
class AParser {
    /** @var string The complete input being parsed. */
    protected $string;

    /** @var int Byte offset into $string of the next unparsed character. */
    protected $pos;

    /**
     * @param string $string Input to parse; parsing starts at offset 0.
     */
    public function __construct( $string ) {
        $this->string = $string;
        $this->pos = 0;
    }

    /**
     * Move the cursor back over the most recently consumed match.
     *
     * Public because Match::rollback() calls it from outside the class;
     * grammar code should go through Match::rollback() instead of calling
     * this directly.
     *
     * @param int $chars Number of bytes to step back.
     * @param int $to Offset the cursor must land on after stepping back.
     * @throws Exception If stepping back $chars bytes would not land on $to,
     *  i.e. more input was consumed after the match being rolled back.
     */
    public function rollback( $chars, $to ) {
        if ( $this->pos - $chars !== $to ) {
            throw new Exception( __METHOD__ . ': Parser implementation error. A rollback was attempted after more data had been parsed.' );
        }
        $this->pos -= $chars;
    }

    /**
     * Turn a match test into a delimited PCRE pattern without modifiers added.
     *
     * @param string|Regexp $test A literal string (quoted so that it matches
     *  verbatim) or a Regexp (its source is used as is).
     * @return string
     * @throws Exception If $test is neither a string nor a Regexp.
     */
    protected function makeRegexp( $test ) {
        if ( is_string( $test ) ) {
            return '/' . preg_quote( $test, '/' ) . '/';
        }
        if ( $test instanceof Regexp ) {
            return $test->source();
        }
        throw new Exception( __METHOD__ . ': Parser implementation error. Unknown type of data was passed to a parser match function.' );
    }

    /**
     * Try a pattern at the current offset without moving the cursor.
     *
     * @param string $re Delimited pattern; extra modifiers are appended to it.
     * @return Match|bool A Match on success, false when the pattern does not
     *  match at the current offset.
     * @throws Exception If preg_match() itself fails (for example on a
     *  malformed pattern or input that is not valid UTF-8), so that the
     *  failure is not mistaken for an ordinary non-match.
     */
    protected function reMatch( $re ) {
        // A: anchor at the offset, D: "$" only matches at the very end,
        // s: "." also matches newlines, u: treat pattern and subject as UTF-8.
        $re .= 'ADsu';
        $groups = null;
        $result = preg_match( $re, $this->string, $groups, 0, $this->pos );
        if ( $result === false ) {
            throw new Exception( __METHOD__ . ": Parser implementation error. preg_match() failed for pattern $re with PCRE error code " . preg_last_error() . '.' );
        }
        if ( $result ) {
            return new Match( $this, $this->pos, $groups );
        }
        return false;
    }

    /**
     * Check whether the input at the current offset matches, without consuming it.
     *
     * @param string|Regexp $test
     * @return bool
     */
    protected function nextIs( $test ) {
        return (bool)$this->reMatch( $this->makeRegexp( $test ) );
    }

    /**
     * Match at the current offset and, on success, advance past the match.
     *
     * @param string|Regexp $test
     * @return Match|bool The Match, or false if nothing matched (cursor unchanged).
     */
    protected function consume( $test ) {
        $m = $this->reMatch( $this->makeRegexp( $test ) );
        if ( !$m ) {
            return false;
        }
        $this->pos += $m->length();
        return $m;
    }

    /**
     * Like consume(), but return the matched text instead of the Match.
     *
     * @param string|Regexp $test
     * @return string|bool Matched text, or false if nothing matched.
     */
    protected function capture( $test ) {
        $m = $this->consume( $test );
        return $m ? $m->text() : false;
    }

    /**
     * Like consume(), but a failed match is a parse error.
     *
     * @param string|Regexp $test
     * @param string $msg Message for the parse error raised when $test does not match.
     * @return Match
     * @throws Exception Via error() when $test does not match.
     */
    protected function reap( $test, $msg ) {
        $m = $this->consume( $test );
        $this->assert( $m, $msg );
        return $m;
    }

    /**
     * Skip a run of whitespace at the current offset.
     *
     * @return bool True if at least one whitespace character was skipped.
     */
    protected function ws() {
        $m = $this->reMatch( '/\s+/' );
        if ( !$m ) {
            return false;
        }
        $this->pos += $m->length();
        return true;
    }

    /**
     * @return string Up to 64 bytes of input starting at the current offset;
     *  used to give error messages some context.
     */
    protected function following() {
        return substr( $this->string, $this->pos, 64 );
    }

    /**
     * Raise a parse error unless $test is truthy.
     *
     * @param mixed $test
     * @param string $msg
     * @throws Exception Via error() when $test is falsy.
     */
    protected function assert( $test, $msg ) {
        if ( !$test ) {
            $this->error( $msg );
        }
    }

    /**
     * Abort parsing with a message that includes the upcoming input.
     *
     * @param string $msg
     * @throws Exception Always.
     */
    protected function error( $msg ) {
        // @todo line and char numbers
        // @fixme Use a real ParseError class
        throw new Exception( "ParseError: Parser encountered an error while parsing. Error message: \"$msg\"; Parsing stopped at \"{$this->following()}\"." );
    }
}
/**
 * Loose record produced by the tokenizer and handed to TemplateParser::emit().
 *
 * Only the token type is fixed. TemplateParser attaches the remaining fields
 * at runtime depending on that type (for example open, name, nameHint and
 * attributes on 'tag' tokens; text on 'text', 'func' and 'subst' tokens;
 * condition, test, then and else on 'cond' tokens). The attribute below keeps
 * those dynamic assignments free of the PHP 8.2 deprecation notice; before
 * PHP 8.0 the line is parsed as an ordinary comment.
 */
#[AllowDynamicProperties]
class Token {
    /** @var string Kind of token: 'tag', 'text', 'cond', 'func' or 'subst'. */
    public $tokenType;

    /**
     * @param string $tokenType Kind of token being created.
     */
    public function __construct( $tokenType ) {
        $this->tokenType = $tokenType;
    }
}
/**
 * Parser for the templated skin syntax.
 *
 * Recognises HTML-like tags (including name-less "null" tags that start
 * directly with an mw: attribute), plain text, and curly-brace expressions:
 * substitutions, "name:" functions and {if}/{unless} conditionals with an
 * optional {else}.
 *
 * Known gaps, left as they are:
 * - emit() creates empty nodes: the text, expressions and tag attributes held
 *   by the tokens are not copied into the tree.
 * - "&" is excluded from plain text but nothing here parses entities, so an
 *   ampersand in the input stops the parse.
 * - Elements still open at the end of input are not reported.
 */
class TemplateParser extends AParser {
    /**
     * @var TemplateDocument|null Document being built; set by parseDocument().
     *  Public because it used to be created as a dynamic (public) property.
     */
    public $doc;

    /**
     * @var SplStack|null Currently open blocks, with the document at the bottom.
     *  Public for the same reason as $doc.
     */
    public $tagStack;

    /**
     * @var object|null Named Regexp rules (tagname, attrName, funcName, conditional).
     *  Public for the same reason as $doc.
     */
    public $rule;

    /**
     * Parse the whole input into a document tree.
     *
     * @return TemplateDocument
     * @throws Exception On any parse error, or if input remains that is
     *  neither text nor a tag.
     */
    public function parseDocument() {
        $this->doc = new TemplateDocument();
        $this->tagStack = new SplStack();
        $this->tagStack->push( $this->doc );
        $this->rule = (object)array(
            'tagname' => new Regexp( '/[a-z][_a-z0-9]*/i' ),
            'attrName' => new Regexp( '/(mw:)?[a-z][-_a-z0-9]*/i' ),
            'funcName' => new Regexp( '/([a-z][-_a-z0-9]*):/i' ),
            'conditional' => new Regexp( '/(if|unless)\s+/' ),
        );

        // Alternate between text runs and tags until neither makes progress.
        do {
            $progressed = $this->tokenText( array( 'mode' => 'html', 'emit' => true ) )
                || $this->tokenTag();
        } while ( $progressed );

        if ( $this->pos !== strlen( $this->string ) ) {
            throw new Exception( "Parser was unable to finish parsing. Syntax continuation could not be found around: \"{$this->following()}\"" );
        }
        return $this->doc;
    }

    /**
     * Apply a token to the tree being built.
     *
     * Opening tags push a new element, closing tags pop one, and every other
     * token type appends a fresh node to the innermost open block.
     *
     * @todo Copy the token's data (text, expressions, attributes) onto the node.
     *
     * @param Token $token
     * @throws Exception On an unknown token type or an invalid closing tag.
     */
    protected function emit( $token ) {
        switch ( $token->tokenType ) {
            case 'tag':
                if ( $token->open ) {
                    $this->openElement( $token );
                } else {
                    $this->closeElement( $token );
                }
                break;
            case 'text':
                $this->tagStack->top()->append( new TemplateText() );
                break;
            case 'cond':
                $this->tagStack->top()->append( new TemplateCondition() );
                break;
            case 'func':
                $this->tagStack->top()->append( new TemplateFunction() );
                break;
            case 'subst':
                $this->tagStack->top()->append( new TemplateSubstitution() );
                break;
            default:
                throw new Exception( __METHOD__ . ': Tried to emit an unknown token type.' );
        }
    }

    /**
     * Create an element for an opening tag and make it the innermost open block.
     *
     * @param Token $token An opening 'tag' token.
     */
    private function openElement( $token ) {
        $element = new TemplateElement();
        $element->name = $token->name;
        $element->nameHint = $token->nameHint;
        $this->tagStack->top()->append( $element );
        $this->tagStack->push( $element );
    }

    /**
     * Close the innermost open element if the closing tag is valid for it.
     *
     * @param Token $token A closing 'tag' token.
     * @throws Exception Via error()/assert() when no element is open or the
     *  closing tag does not correspond to the innermost open element.
     */
    private function closeElement( $token ) {
        $element = $this->tagStack->top();

        // Only the document is left on the stack: there is nothing to close,
        // and popping would remove the document itself.
        if ( !( $element instanceof TemplateElement ) ) {
            $closing = $token->name ? $token->name : $token->nameHint;
            $this->error( "Closing tag </{$closing}> was found but no element is currently open." );
        }

        if ( $token->name ) {
            $this->assert( $element->name, "Cannot close a <{$token->name}> element. A null <{$element->nameHint}> has not yet been closed." );
            // @fixme Instead we should be doing some html-like implicit close handling
            // Names are compared without regard to case, in line with the
            // case-insensitive tagname rule.
            $this->assert( strcasecmp( $token->name, $element->name ) === 0,
                "Closing tag <{$token->name}> did not match the currently opened <{$element->name}>." );
            $this->tagStack->pop();
            return;
        }

        if ( $element->name ) {
            $this->error( "Cannot close a null <{$token->nameHint}> element. A standard <{$element->name}> has not yet been closed." );
        }
        // A null closing tag may give no hint, a bare mw / mw: prefix, or
        // repeat the hint of the element it closes.
        $hintmatch = $token->nameHint === ""
            || $token->nameHint === "mw"
            || $token->nameHint === "mw:"
            || $token->nameHint === $element->nameHint;
        $this->assert( $hintmatch, "Null element closing tag </{$token->nameHint}> did not match the currently opened <{$element->nameHint}>." );
        $this->tagStack->pop();
    }

    /**
     * Parse one tag at the current offset and emit it.
     *
     * A self-closing tag is emitted as an opening tag immediately followed by
     * a matching closing tag.
     *
     * @return bool False if the input does not start with "<" (nothing is
     *  consumed); true once a complete tag has been parsed and emitted.
     * @throws Exception Via error()/assert() on malformed tags.
     */
    protected function tokenTag() {
        if ( !$this->consume( '<' ) ) {
            // No tag to parse, move on to another type of token
            return false;
        }

        $closed = false; // Have we found the ending > yet?
        $selfClosing = false; // Is this a self closing <foo />?
        // Whether whitespace was consumed right before the current position.
        // An attribute is only accepted when it is preceded by whitespace.
        $wsConsumed = false;
        // Is this a </foo> end tag?
        $endTag = (bool)$this->consume( '/' );

        $tag = new Token( 'tag' );
        $tag->open = !$endTag;
        $tag->nameHint = null;

        if ( $prefix = $this->consume( 'mw:' ) ) {
            // Go back to before the mw: so it can be parsed as part of an attribute name.
            $prefix->rollback();
            $tag->name = null;
            if ( $endTag ) {
                $m = $this->consume( $this->rule->attrName );
                if ( $m && $m->text() === 'mw' ) {
                    // No name follows the prefix, so attrName backtracked and
                    // matched only "mw", leaving the colon behind. Undo that
                    // so the bare-prefix pattern below takes "mw:" as a whole.
                    $m->rollback();
                    $m = false;
                }
                if ( !$m ) {
                    $m = $this->consume( new Regexp( '/mw:?/' ) );
                }
                $tag->nameHint = $m ? $m->text() : '';
            } else {
                $m = $this->consume( $this->rule->attrName );
                $this->assert( $m, "Could not parse a required attribute name out of a null tag." );
                // Roll the name back so the attribute loop parses it as a real attribute.
                $m->rollback();
                $tag->nameHint = $m->text();
                // There is no tag name, so pretend whitespace was consumed to let attributes parse.
                $wsConsumed = true;
            }
        } else {
            $tagname = $this->capture( $this->rule->tagname );
            // @fixme Message does not handle the case where a / was found and now we want the tag name.
            $this->assert( $tagname, "A < indicating the start of a tag was found but none of \"/\", a tag name, or the start of a null mw: tag could be found." );
            $tag->name = $tagname;
        }

        $tag->attributes = array();
        while ( !$closed ) {
            if ( $this->consume( '>' ) ) {
                $closed = true;
                continue;
            }
            if ( $this->consume( '/' ) ) {
                if ( $endTag ) {
                    // @todo Differentiate between "Last consumed token is an error" and "Could not find anything following this matching expectations"
                    $this->error( "An end tag may not have a self closing /." );
                }
                $selfClosing = true;
                $this->reap( '>', "A closing > must directly follow a self closing tag's /." );
                $closed = true;
                continue;
            }
            if ( $this->ws() ) {
                // Mark whitespace as consumed so we know we can consume an attribute
                $wsConsumed = true;
                continue;
            }

            $attrName = $wsConsumed ? $this->consume( $this->rule->attrName ) : false;
            if ( !$attrName ) {
                $this->error( "Unexpected characters found while parsing a tag." );
            }
            $wsConsumed = false;
            $this->assert( !$endTag, "End tags may not have attributes." );
            $value = $this->attributeValue();
            $tag->attributes[] = array(
                'name' => $attrName->text(),
                'value' => $value
            );
        }

        // @todo Special handling for script and style tags that use a different parse model
        $this->emit( $tag );
        if ( $selfClosing ) {
            $close = new Token( 'tag' );
            $close->open = false;
            $close->name = $tag->name;
            $close->nameHint = $tag->nameHint;
            $this->emit( $close );
        }
        return true;
    }

    /**
     * Parse the optional "=value" part that follows an attribute name.
     *
     * @return array|bool True for an attribute without a value; otherwise the
     *  list of tokens making up the value (possibly empty for a quoted value).
     * @throws Exception Via error()/assert() on a missing closing quote or an
     *  empty unquoted value.
     */
    private function attributeValue() {
        if ( !$this->consume( '=' ) ) {
            // Just a truthy attribute
            return true;
        }

        $q = $this->consume( new Regexp( '/[\'"]/' ) );
        if ( $q ) {
            $text = $this->tokenText( array( 'mode' => 'attr-quoted', 'forquote' => $q->text(), 'empty' => true ) );
            if ( !$this->consume( $q->text() ) ) {
                $this->error( "Unexpected character found while trying to find an attribute's closing quote." );
            }
            return $text;
        }

        $text = $this->tokenText( array( 'mode' => 'attr-unquoted' ) );
        $this->assert( $text, "An unquoted attribute was found missing its text." );
        return $text;
    }

    /**
     * Parse a run of plain text and curly-brace expressions.
     *
     * Recognised options:
     * - mode: preset that adjusts the other options. One of 'html' (defaults),
     *   'attr-unquoted' (text may not contain quotes, whitespace, "=" or "/"),
     *   'attr-quoted' (requires 'forquote'; text may not contain that quote
     *   character), 'expr' (text may not contain "|") or 'block-expr'.
     * - forquote: the quote character that opened a quoted attribute.
     * - quotes: true to allow every quote character in text, false to allow
     *   none, or a string listing the ones allowed.
     * - '|', '=', '/', whitespace: whether that character (class) may appear in text.
     * - empty: return an empty array instead of false when nothing was parsed.
     * - emit: pass each token parsed at this level to emit().
     * - func, subst, conditional: accepted but not consulted yet (see the
     *   fixme in tokenCurly()).
     *
     * @param array $options
     * @return array|bool List of Token objects, or false when nothing was
     *  parsed and the 'empty' option is off.
     * @throws Exception On an unknown mode, a missing 'forquote', or a
     *  malformed curly expression.
     */
    public function tokenText( $options = array( 'mode' => 'html' ) ) {
        $allquotes = "'" . '"' . '`';

        // Fill in every option the caller did not specify.
        $options += array(
            'quotes' => true,
            '|' => true,
            '=' => true,
            '/' => true,
            'whitespace' => true,
            'func' => true,
            'subst' => true,
            'conditional' => false,
            'empty' => false,
            'emit' => false,
        );

        if ( isset( $options['mode'] ) ) {
            switch ( $options['mode'] ) {
                case 'html':
                    break;
                case 'attr-unquoted':
                    $options['func'] = false;
                    $options['subst'] = false;
                    $options['quotes'] = false;
                    $options['whitespace'] = false;
                    $options['='] = false;
                    $options['/'] = false;
                    break;
                case 'attr-quoted':
                    if ( !isset( $options['forquote'] ) ) {
                        throw new Exception( __METHOD__ . ': The attr-quoted mode requires a forquote option.' );
                    }
                    $options['conditional'] = true;
                    // Removing the opening quote from the full list leaves the quotes that stay allowed.
                    $options['quotes'] = str_replace( $options['forquote'], '', $allquotes );
                    break;
                case 'expr':
                    $options['|'] = false;
                    break;
                case 'block-expr':
                    $options['conditional'] = true;
                    break;
                default:
                    throw new Exception( __METHOD__ . ': Unknown mode.' );
            }
            unset( $options['mode'] );
        }

        $re = $this->buildTextRegexp( $options, $allquotes );

        // Start parsing plaintext and curly expressions
        $nodes = array();
        while ( true ) {
            $m = $this->consume( $re );
            if ( $m ) {
                $text = new Token( 'text' );
                $text->text = $m->text();
                $nodes[] = $text;
                continue;
            }

            $curly = $this->consume( '{' );
            if ( !$curly ) {
                break;
            }
            $node = $this->tokenCurly( $curly );
            if ( $node === false ) {
                // The brace belongs to an enclosing conditional; stop here.
                break;
            }
            $nodes[] = $node;
        }

        if ( $options['emit'] ) {
            foreach ( $nodes as $node ) {
                $this->emit( $node );
            }
        }
        if ( !$options['empty'] && count( $nodes ) <= 0 ) {
            return false;
        }
        return $nodes;
    }

    /**
     * Build the pattern matching one run of plain text under the given options.
     *
     * @param array $options Fully defaulted tokenText() options.
     * @param string $allquotes Every character that counts as a quote.
     * @return Regexp A negated character class repeated one or more times.
     */
    private function buildTextRegexp( $options, $allquotes ) {
        // Never parse a < or >; & is reserved for entity handling; curly
        // braces are used only by func, subst and conditional syntax.
        $excluded = '<>' . '&' . '{}';
        // If pipe is not permitted (eg: in a curly expr) don't allow it
        if ( !$options['|'] ) {
            $excluded .= '|';
        }
        // If = is not permitted (eg: in an unquoted attribute) don't allow it
        if ( !$options['='] ) {
            $excluded .= '=';
        }
        // If / is not permitted (eg: in an unquoted attribute) don't allow it
        if ( !$options['/'] ) {
            $excluded .= '\/';
        }
        // If whitespace is not permitted don't allow it
        if ( !$options['whitespace'] ) {
            $excluded .= '\s';
        }

        // Work out which quote characters are allowed, then exclude the rest.
        $quotes = $options['quotes'];
        if ( $quotes === true ) {
            $quotes = str_split( $allquotes );
        } elseif ( $quotes === false ) {
            $quotes = array();
        } else {
            $quotes = str_split( $quotes );
        }
        $badquotes = array_diff( str_split( $allquotes ), $quotes );
        $excluded .= implode( '', $badquotes );

        return new Regexp( '/[^' . $excluded . ']+/' );
    }

    /**
     * Parse the expression that follows an already consumed "{".
     *
     * @fixme This code doesn't consult the options to test if something is allowed
     *
     * @param Match $curly The match that consumed the opening brace.
     * @return Token|bool A 'cond', 'func' or 'subst' token; or false when the
     *  brace starts "{/" or "{else}", in which case the brace is rolled back
     *  so an enclosing conditional can handle it.
     * @throws Exception Via assert()/reap() on a malformed expression.
     */
    private function tokenCurly( $curly ) {
        // Looks like an end tag or an {else} tag. Leave it for the parent condition (if any).
        if ( ( $m = $this->consume( '/' ) ) || ( $m = $this->consume( 'else}' ) ) ) {
            $m->rollback();
            $curly->rollback();
            return false;
        }

        $m = $this->consume( $this->rule->conditional );
        if ( $m ) {
            // Conditional
            $cond = new Token( 'cond' );
            $cond->condition = $m->group( 1 );
            $cond->test = $this->tokenText( array( 'mode' => 'expr' ) );
            $this->assert( $cond->test, "Unexpected characters found while parsing a condition expression." );
            $this->reap( '}', "Unexpected characters found while parsing a condition." );
            $cond->then = $this->tokenText( array( 'mode' => 'block-expr', 'empty' => true ) );
            if ( $this->consume( '{else}' ) ) {
                $cond->else = $this->tokenText( array( 'mode' => 'block-expr', 'empty' => true ) );
            }
            $this->reap( '{/', "Unexpected characters found while parsing conditional text." );
            $end = $this->consume( new Regexp( '/([a-z][-_a-z0-9]*)\}/i' ) );
            $this->assert( $end, "Unexpected characters found while parsing a condition end." );
            $this->assert( $end->group( 1 ) === $cond->condition, "Condition end name did not match the name of the opened condition." );
            return $cond;
        }

        $m = $this->consume( $this->rule->funcName );
        if ( $m ) {
            // func
            $func = new Token( 'func' );
            $func->name = $m->group( 1 );
            $func->text = $this->tokenText( array( 'mode' => 'expr' ) );
            $this->assert( $func->text, "Unexpected characters found while parsing a function expression." );
            $this->reap( '}', "Unexpected characters found while parsing a function." );
            return $func;
        }

        // subst
        $subst = new Token( 'subst' );
        $subst->text = $this->tokenText( array( 'mode' => 'expr' ) );
        $this->assert( $subst->text, "Unexpected characters found while parsing a substitution expression." );
        $this->reap( '}', "Unexpected characters found while parsing a substitution." );
        return $subst;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment