Skip to content

Instantly share code, notes, and snippets.

@dantman
Created October 24, 2012 03:14
Show Gist options
  • Save dantman/3943466 to your computer and use it in GitHub Desktop.
Save dantman/3943466 to your computer and use it in GitHub Desktop.
New templated skin parser attempt
<?php
/**
 * Base type for everything that can appear in a parsed template tree.
 *
 * It carries no state or behaviour of its own; it exists so that
 * TemplateBlock::append() can type-hint the nodes it accepts.
 */
class TemplateNode
{
}
/**
 * Tree node created by TemplateParser::emit() for a 'text' token.
 *
 * The class declares no fields, so the token's text is not stored on the node.
 */
class TemplateText extends TemplateNode
{
}
/**
 * Tree node created by TemplateParser::emit() for a 'cond' token.
 *
 * The class declares no fields, so the token's condition data is not stored on the node.
 */
class TemplateCondition extends TemplateNode
{
}
/**
 * Tree node created by TemplateParser::emit() for a 'func' token.
 *
 * The class declares no fields, so the token's function name and text are not stored on the node.
 */
class TemplateFunction extends TemplateNode
{
}
/**
 * Tree node created by TemplateParser::emit() for a 'subst' token.
 *
 * The class declares no fields, so the token's expression is not stored on the node.
 */
class TemplateSubstitution extends TemplateNode
{
}
/**
 * A node that owns an ordered list of child nodes.
 */
class TemplateBlock extends TemplateNode
{
    /** @var SplDoublyLinkedList Child nodes, in the order they were appended. */
    private $children;

    /**
     * Start the block with an empty child list.
     */
    public function __construct()
    {
        $this->children = new SplDoublyLinkedList();
    }

    /**
     * Add a node after all existing children.
     *
     * @param TemplateNode $node Node to add to the end of this block.
     */
    public function append( TemplateNode $node )
    {
        $this->children->push( $node );
    }
}
/**
 * A block created by TemplateParser::emit() for an opening tag.
 *
 * The parser assigns both fields below when it opens the element and reads
 * them again when it validates the matching closing tag. They are declared
 * explicitly because creating undeclared (dynamic) properties is deprecated
 * since PHP 8.2.
 */
class TemplateElement extends TemplateBlock {
    /** @var string|null Tag name, or null for a "null" mw: element that has no tag name. */
    public $name = null;

    /** @var string|null For null elements, the attribute name the tag started with; null otherwise. */
    public $nameHint = null;
}
/**
 * Root block of a parsed template.
 *
 * TemplateParser::parseDocument() creates one, uses it as the bottom entry of
 * its tag stack and returns it as the parse result.
 */
class TemplateDocument extends TemplateBlock
{
}
/**
 * A successful regular expression match made by an AParser.
 *
 * Remembers the parser and the position the match started at so that a
 * consumed match can be handed back to the parser with rollback().
 *
 * This class was originally named "Match". `match` is a reserved keyword as
 * of PHP 8.0, which made both the class declaration and `new Match(...)` parse
 * errors, so the class is now called ParserMatch. On older PHP versions the
 * old name is kept available through the class alias declared below.
 */
class ParserMatch {
    /** @var AParser Parser that produced this match. */
    private $parser;

    /** @var int Byte offset in the parser's string at which the match starts. */
    private $start;

    /** @var array Match groups as filled in by preg_match(). */
    private $matches;

    /**
     * @param AParser $parser Parser that produced the match.
     * @param int $start Byte offset the match starts at.
     * @param array $matches Groups array produced by preg_match().
     */
    public function __construct( $parser, $start, $matches ) {
        $this->parser = $parser;
        $this->start = $start;
        $this->matches = $matches;
    }

    /**
     * @param int|string $id Group index or name.
     * @return string Text captured by that group.
     */
    public function group( $id ) {
        return $this->matches[$id];
    }

    /**
     * @return string The whole matched text (group 0).
     */
    public function text() {
        return $this->group( 0 );
    }

    /**
     * @return int Length of the matched text in bytes.
     */
    public function length() {
        return strlen( $this->text() );
    }

    /**
     * Hand this match back to the parser, moving its position back to where
     * the match started.
     *
     * @throws Exception If the parser has moved on since this match was consumed.
     */
    public function rollback() {
        $this->parser->rollback( $this->length(), $this->start );
    }
}

// Backwards compatibility: the name "Match" is still legal before PHP 8.0, so
// keep it usable there. The alias is never created on PHP 8+, where the name
// is reserved.
if ( PHP_VERSION_ID < 80000 && !class_exists( 'Match', false ) ) {
    class_alias( 'ParserMatch', 'Match' );
}

/**
 * Marks a string as regular expression source, as opposed to literal text.
 *
 * AParser::makeRegexp() quotes plain strings but uses the source of a Regexp
 * as-is. The source includes delimiters and may carry trailing modifiers
 * (e.g. '/[a-z]+/i'); AParser::reMatch() appends further modifiers to it.
 */
class Regexp {
    /** @var string Delimited pattern source. */
    private $source;

    /**
     * @param string $source Delimited pattern source.
     */
    public function __construct( $source ) {
        $this->source = $source;
    }

    /**
     * @return string The pattern source given to the constructor.
     */
    public function source() {
        return $this->source;
    }
}

/**
 * Base class for hand written parsers working on a string.
 *
 * Keeps the input and a byte position into it, and offers helpers that try to
 * match literal text or a Regexp exactly at that position.
 */
class AParser {
    /** @var string Input being parsed. */
    protected $string;

    /** @var int Current byte offset into $string. */
    protected $pos;

    /**
     * @param string $string Input to parse.
     */
    public function __construct( $string ) {
        $this->string = $string;
        $this->pos = 0;
    }

    /**
     * Move the position back by $chars bytes.
     *
     * Public only because ParserMatch::rollback() has to call it; treat it as
     * protected.
     *
     * @param int $chars Number of bytes to give back.
     * @param int $to Position the parser is expected to end up at.
     * @throws Exception If giving back $chars bytes would not land on $to,
     *  i.e. more input was consumed after the match being rolled back.
     */
    public function rollback( $chars, $to ) {
        if ( $this->pos - $chars !== $to ) {
            throw new Exception( __METHOD__ . ': Parser implementation error. A rollback was atempted after more data had been parsed.' );
        }
        $this->pos -= $chars;
    }

    /**
     * Turn a test into delimited pattern source.
     *
     * @param string|Regexp $test Literal text (quoted so it matches verbatim) or a Regexp.
     * @return string Delimited pattern source without the modifiers reMatch() adds.
     * @throws Exception If $test is neither a string nor a Regexp.
     */
    protected function makeRegexp( $test ) {
        if ( is_string( $test ) ) {
            return '/' . preg_quote( $test, '/' ) . '/';
        }
        if ( $test instanceof Regexp ) {
            return $test->source();
        }
        throw new Exception( __METHOD__ . ': Parser implementation error. Unknown type of data was patched to a parser match function.' );
    }

    /**
     * Try to match a pattern exactly at the current position, without moving.
     *
     * The modifiers A (anchored at the offset), D, s and u are appended to
     * the pattern source.
     *
     * @param string $re Delimited pattern source.
     * @return ParserMatch|false The match, or false if the pattern does not match here.
     * @throws Exception If preg_match() itself fails.
     */
    protected function reMatch( $re ) {
        $m = null;
        $found = preg_match( $re . 'ADsu', $this->string, $m, 0, $this->pos );
        if ( $found === false ) {
            // A failure is not the same as "no match": report it rather than
            // letting it surface later as a misleading syntax error.
            throw new Exception( __METHOD__ . ': preg_match() failed with PCRE error code ' . preg_last_error()
                . '; the pattern is invalid or the input is not valid UTF-8.' );
        }
        if ( $found ) {
            return new ParserMatch( $this, $this->pos, $m );
        }
        return false;
    }

    /**
     * Check whether a test matches at the current position without consuming it.
     *
     * @param string|Regexp $test
     * @return bool
     */
    protected function nextIs( $test ) {
        return (bool)$this->reMatch( $this->makeRegexp( $test ) );
    }

    /**
     * Match a test at the current position and move past it.
     *
     * @param string|Regexp $test
     * @return ParserMatch|false The match, or false (position untouched) if there is none.
     */
    protected function consume( $test ) {
        $m = $this->reMatch( $this->makeRegexp( $test ) );
        if ( !$m ) {
            return false;
        }
        $this->pos += $m->length();
        return $m;
    }

    /**
     * Like consume(), but return only the matched text.
     *
     * @param string|Regexp $test
     * @return string|false Matched text, or false if there was no match.
     */
    protected function capture( $test ) {
        $m = $this->consume( $test );
        return $m ? $m->text() : false;
    }

    /**
     * Like consume(), but a missing match is a parse error.
     *
     * @param string|Regexp $test
     * @param string $msg Error message used when the test does not match.
     * @return ParserMatch
     * @throws Exception If the test does not match.
     */
    protected function reap( $test, $msg ) {
        $m = $this->consume( $test );
        $this->assert( $m, $msg );
        return $m;
    }

    // Unfinished idea kept from the original source:
    // protected function parseOne( $list ) {
    //     foreach ( $list as $token ) {
    //         $tokenMethod = 'token' . ucfirst( $token );
    //         $res = $this->{$tokenMethod}();
    //         if ( $res === false ) {
    //             continue;
    //         }
    //         return $res;
    //     }
    //     // error?
    //     return false;
    // }

    /**
     * Skip a run of whitespace at the current position.
     *
     * @return bool True if any whitespace was consumed.
     */
    protected function ws() {
        return (bool)$this->consume( new Regexp( '/\s+/' ) );
    }

    /**
     * @return string Up to 64 bytes of input starting at the current position,
     *  used to give error messages some context.
     */
    protected function following() {
        return substr( $this->string, $this->pos, 64 );
    }

    /**
     * Raise a parse error unless $test is truthy.
     *
     * @param mixed $test
     * @param string $msg
     * @throws Exception If $test is falsy.
     */
    protected function assert( $test, $msg ) {
        if ( !$test ) {
            $this->error( $msg );
        }
    }

    /**
     * Raise a parse error. Never returns.
     *
     * @param string $msg
     * @throws Exception Always.
     */
    protected function error( $msg ) {
        // @todo line and char numbers
        // @fixme Use a real ParseError class
        throw new Exception( "ParseError: Parser encountered an error while parsing. Error message: \"$msg\"; Parsing stopped at \"{$this->following()}\"." );
    }
}
/**
 * A token produced by TemplateParser before it is turned into a tree node.
 *
 * Which fields are filled in depends on the token type. All fields the parser
 * uses are declared here because creating undeclared (dynamic) properties is
 * deprecated since PHP 8.2.
 */
class Token {
    /** @var string Kind of token: 'tag', 'text', 'cond', 'func' or 'subst'. */
    public $tokenType;

    /** @var bool|null 'tag': true for an opening tag, false for a closing tag. */
    public $open = null;

    /** @var string|null 'tag': tag name (null for a null mw: tag); 'func': function name. */
    public $name = null;

    /** @var string|null 'tag': hint identifying a null mw: tag. */
    public $nameHint = null;

    /** @var array|null 'tag': list of array( 'name' => ..., 'value' => ... ) entries. */
    public $attributes = null;

    /** @var mixed 'text': the literal text; 'func' and 'subst': the parsed expression tokens. */
    public $text = null;

    /** @var string|null 'cond': the condition keyword that opened the block. */
    public $condition = null;

    /** @var array|null 'cond': tokens of the tested expression. */
    public $test = null;

    /** @var array|null 'cond': tokens of the main branch. */
    public $then = null;

    /** @var array|null 'cond': tokens of the {else} branch, if there is one. */
    public $else = null;

    /**
     * @param string $tokenType Kind of token being created.
     */
    public function __construct( $tokenType ) {
        $this->tokenType = $tokenType;
    }
}
/**
 * Parser that turns a template string into a tree of TemplateNode objects.
 *
 * The input is a mix of HTML-like tags and curly brace expressions
 * ({if ...}...{else}...{/if}, {name: ...} and plain {...} substitutions).
 * Call parseDocument() to get the resulting TemplateDocument.
 */
class TemplateParser extends AParser {
    /** @var TemplateDocument|null Root of the tree being built; set by parseDocument(). */
    public $doc = null;

    /** @var SplStack|null Open blocks, innermost on top, with the document at the bottom. */
    public $tagStack = null;

    /** @var object|null Named Regexp rules shared by the token methods; set by parseDocument(). */
    public $rule = null;

    /**
     * Parse the whole input.
     *
     * @return TemplateDocument Root of the parsed tree.
     * @throws Exception On a syntax error, or when input is left over that
     *  neither the text nor the tag tokenizer accepts.
     */
    public function parseDocument() {
        $this->doc = new TemplateDocument();
        $this->tagStack = new SplStack();
        $this->tagStack->push( $this->doc );
        $this->rule = (object)array(
            'tagname' => new Regexp( '/[a-z][_a-z0-9]*/i' ),
            'attrName' => new Regexp( '/(mw:)?[a-z][-_a-z0-9]*/i' ),
            'funcName' => new Regexp( '/([a-z][-_a-z0-9]*):/i' ),
            'conditional' => new Regexp( '/(if|unless)\s+/' ),
        );

        // Alternate between text and tags until neither makes progress.
        while ( true ) {
            if ( $this->tokenText( array( 'mode' => 'html', 'emit' => true ) ) ) {
                continue;
            }
            if ( $this->tokenTag() ) {
                continue;
            }
            break;
        }

        if ( $this->pos !== strlen( $this->string ) ) {
            throw new Exception( "Parser was unable to finish parsing. Syntax continuation could not be found around: \"{$this->following()}\"" );
        }
        return $this->doc;
    }

    /**
     * Apply a token to the tree being built.
     *
     * Tag tokens open or close an element on the tag stack. Every other token
     * type appends a node of the matching class to the innermost open block.
     * Note that those node classes declare no fields, so the token's payload
     * is not copied onto the node.
     *
     * @param Token $token
     * @throws Exception On a mismatched closing tag or an unknown token type.
     */
    protected function emit( $token ) {
        switch ( $token->tokenType ) {
            case 'tag':
                if ( $token->open ) {
                    $this->openElement( $token );
                } else {
                    $this->closeElement( $token );
                }
                break;
            case 'text':
                $this->tagStack->top()->append( new TemplateText() );
                break;
            case 'cond':
                $this->tagStack->top()->append( new TemplateCondition() );
                break;
            case 'func':
                $this->tagStack->top()->append( new TemplateFunction() );
                break;
            case 'subst':
                $this->tagStack->top()->append( new TemplateSubstitution() );
                break;
            default:
                throw new Exception( __METHOD__ . ': Tried to emit an unknown token type.' );
        }
    }

    /**
     * Append a new element to the innermost open block and make it the
     * innermost open block itself.
     *
     * @param Token $token Opening tag token.
     */
    protected function openElement( $token ) {
        $parent = $this->tagStack->top();
        $element = new TemplateElement();
        $element->name = $token->name;
        $element->nameHint = $token->nameHint;
        $parent->append( $element );
        $this->tagStack->push( $element );
    }

    /**
     * Close the innermost open element after checking that the closing tag
     * belongs to it.
     *
     * @param Token $token Closing tag token.
     * @throws Exception If nothing is open or the closing tag does not match.
     */
    protected function closeElement( $token ) {
        $element = $this->tagStack->top();

        // The bottom of the stack is the document itself. Without this check
        // a stray closing tag would read undefined properties off the
        // document and could even pop it off the stack.
        if ( !( $element instanceof TemplateElement ) ) {
            $label = $token->name ? $token->name : $token->nameHint;
            $this->error( "Closing tag </{$label}> was found but there is no open element to close." );
        }

        if ( $token->name ) {
            $this->assert( $element->name, "Cannot close a <{$token->name}> element. A null <{$element->nameHint}> has not yet been closed." );
            // @fixme Instead we should be doing some html-like implicit close handling
            // The names must actually be the same (the tag name rule is case
            // insensitive, so the comparison is too); merely both being
            // non-empty let any closing tag close any element.
            $this->assert( strcasecmp( $token->name, $element->name ) === 0,
                "Closing tag <{$token->name}> did not match the currently opened <{$element->name}>." );
            $this->tagStack->pop();
            return;
        }

        if ( $element->name ) {
            $this->error( "Cannot close a null <{$token->nameHint}> element. A standard <{$element->name}> has not yet been closed." );
        }

        // A null closing tag matches when it is generic or repeats the hint
        // of the element it closes.
        $hintmatch = $token->nameHint === ""
            || $token->nameHint === "mw"
            || $token->nameHint === "mw:"
            || $token->nameHint === $element->nameHint;
        $this->assert( $hintmatch, "Null element closing tag </{$token->nameHint}> did not match the currently opened <{$element->nameHint}>." );
        $this->tagStack->pop();
    }

    /**
     * Parse one tag (opening, closing or self closing) at the current
     * position and emit it.
     *
     * A tag starting with "mw:" is a "null" tag: it has no tag name and its
     * first attribute name doubles as a hint identifying it.
     *
     * @return bool False if the input does not start with "<", true once a tag was parsed.
     * @throws Exception On malformed tag syntax.
     */
    protected function tokenTag() {
        if ( !$this->consume( "<" ) ) {
            // No tag to parse, move on to another type of token
            return false;
        }

        $closed = false; // Have we found the ending > yet?
        $selfClosing = false; // Is this a self closing <foo />?
        // State indicating that whitespace was consumed before the current token.
        // Used by attribute parsing to make sure there is whitespace preceding an attribute
        $wsConsumed = false;
        $endTag = (bool)$this->consume( '/' ); // Is this a </foo> end tag?

        $tag = new Token( 'tag' );
        $tag->open = !$endTag;
        $tag->nameHint = null;

        $prefix = $this->consume( 'mw:' );
        if ( $prefix ) {
            // Go back to before the mw: so we can parse it as an argument name.
            $prefix->rollback();
            $tag->name = null;
            if ( $endTag ) {
                // "mw:" is known to come next, so this consumes either the
                // full hint ("mw:foo") or the bare "mw:". Matching the general
                // attribute name rule here would backtrack to just "mw" when
                // no name follows, leaving the ":" behind and making </mw:>
                // unparseable.
                $m = $this->consume( new Regexp( '/mw:(?:[a-z][-_a-z0-9]*)?/i' ) );
                $tag->nameHint = $m ? $m->text() : '';
            } else {
                $m = $this->consume( $this->rule->attrName );
                $this->assert( $m, "Could not parse a required attribute name out of a null tag." );
                // Rollback the argument name so we can parse an actual argument.
                $m->rollback();
                $tag->nameHint = $m->text();
                // There is no tag name so pretend that whitespace has been consumed so we can parse attributes.
                $wsConsumed = true;
            }
        } else {
            $tagname = $this->capture( $this->rule->tagname );
            // @fixme Message does not handle the case where a / was found and now we want the tag name.
            $this->assert( $tagname, "A < indicating the start of a tag was found but none of \"/\", a tag name, or the start of a null mw: tag could be found." );
            $tag->name = $tagname;
        }

        $tag->attributes = array();
        while ( !$closed ) {
            if ( $this->consume( '>' ) ) {
                $closed = true;
            } elseif ( $this->consume( '/' ) ) {
                if ( $endTag ) {
                    // @todo Differentiate between "Last consumed token is an error" and "Could not find anything following this matching expectations"
                    $this->error( "An end tag may not have a self closing /." );
                }
                $selfClosing = true;
                $this->reap( '>', "A closing > must directly follow a self closing tag's /." );
                $closed = true;
            } elseif ( $this->ws() ) {
                // Mark whitespace as consumed so we know we can consume an attribute
                $wsConsumed = true;
            } elseif ( $wsConsumed && ( $attrName = $this->consume( $this->rule->attrName ) ) ) {
                $wsConsumed = false;
                $this->assert( !$endTag, "End tags may not have attributes." );
                if ( $this->consume( '=' ) ) {
                    $q = $this->consume( new Regexp( '/[\'"]/' ) );
                    if ( $q ) {
                        $text = $this->tokenText( array( 'mode' => 'attr-quoted', 'forquote' => $q->text(), 'empty' => true ) );
                        if ( !$this->consume( $q->text() ) ) {
                            $this->error( "Unexpected character found while trying to find an attribute's closing quote." );
                        }
                    } else {
                        $text = $this->tokenText( array( 'mode' => 'attr-unquoted' ) );
                        $this->assert( $text, "An unqouted attribute was found missing it's text." );
                    }
                } else {
                    $text = true; // Just a truthy attribute
                }
                $tag->attributes[] = array(
                    'name' => $attrName->text(),
                    'value' => $text
                );
            } else {
                $this->error( "Unexpected characters found while parsing a tag." );
            }
        }

        // @todo Special handling for script and style tags that use a different parse model
        $this->emit( $tag );
        if ( $selfClosing ) {
            // A self closing tag is an opening tag immediately followed by its closing tag.
            $close = new Token( 'tag' );
            $close->open = false;
            $close->name = $tag->name;
            $close->nameHint = $tag->nameHint;
            $this->emit( $close );
        }
        return true;
    }

    /**
     * Parse a run of plain text and curly brace expressions.
     *
     * Recognised options (all optional):
     *  - 'mode': preset adjusting the options below; one of 'html',
     *    'attr-unquoted', 'attr-quoted' (requires 'forquote'), 'expr' or
     *    'block-expr'.
     *  - 'forquote': in 'attr-quoted' mode, the quote character that opened
     *    the attribute value and therefore ends the text.
     *  - 'quotes': true to allow every quote character in plain text, false
     *    to allow none, or a string listing the allowed ones.
     *  - '|', '=', '/', 'whitespace': whether that character (class) may
     *    appear in plain text.
     *  - 'empty': return an empty array rather than false when nothing was parsed.
     *  - 'emit': emit() every parsed token before returning.
     *  - 'func', 'subst', 'conditional': accepted, but not consulted yet.
     *
     * @param array $options
     * @return Token[]|false Parsed tokens, or false when nothing was parsed
     *  and 'empty' is not set.
     * @throws Exception On an unknown mode or malformed curly syntax.
     */
    public function tokenText( $options = array( 'mode' => 'html' ) ) {
        $allquotes = "'" . '"' . '`';

        // Fill in defaults; keys given by the caller are kept as they are.
        $options += array(
            'quotes' => true,
            '|' => true,
            '=' => true,
            '/' => true,
            'whitespace' => true,
            'func' => true,
            'subst' => true,
            'conditional' => false,
            'empty' => false,
            'emit' => false,
        );

        if ( isset( $options['mode'] ) ) {
            switch ( $options['mode'] ) {
                case 'html':
                    break;
                case 'attr-unquoted':
                    $options['func'] = false;
                    $options['subst'] = false;
                    $options['quotes'] = false;
                    $options['whitespace'] = false;
                    $options['='] = false;
                    $options['/'] = false;
                    break;
                case 'attr-quoted':
                    $options['conditional'] = true;
                    // Use a string replace trick to list the quotes that aren't the same as the one matched
                    $options['quotes'] = str_replace( $options['forquote'], '', $allquotes );
                    break;
                case 'expr':
                    $options['|'] = false;
                    break;
                case 'block-expr':
                    $options['conditional'] = true;
                    break;
                default:
                    throw new Exception( __METHOD__ . ': Unknown mode.' );
            }
            unset( $options['mode'] );
        }

        $re = $this->buildTextRegexp( $options, $allquotes );

        // Start parsing plaintext and curly expressions
        $nodes = array();
        while ( true ) {
            $m = $this->consume( $re );
            if ( $m ) {
                $text = new Token( 'text' );
                $text->text = $m->text();
                $nodes[] = $text;
                continue;
            }

            $curly = $this->consume( '{' );
            if ( !$curly ) {
                break;
            }

            // @fixme This code doesn't consult the options to test if something is allowed
            if ( $m = $this->consume( '/' ) ) {
                // Looks like an end tag. Exit this text handling so that the parent condition (if any) can handle it.
                $m->rollback();
                $curly->rollback();
                break;
            } elseif ( $m = $this->consume( 'else}' ) ) {
                // Looks like an {else} tag. Exit this text handling so that the parent condition (if any) can handle it.
                $m->rollback();
                $curly->rollback();
                break;
            } elseif ( $m = $this->consume( $this->rule->conditional ) ) {
                // Conditional
                $cond = new Token( 'cond' );
                $cond->condition = $m->group( 1 );
                $cond->test = $this->tokenText( array( 'mode' => 'expr' ) );
                $this->assert( $cond->test, "Unexpected characters found while parsing a condition expression." );
                $this->reap( '}', "Unexpected characters found while parsing a condition." );
                $cond->then = $this->tokenText( array( 'mode' => 'block-expr', 'empty' => true ) );
                if ( $this->consume( '{else}' ) ) {
                    $cond->else = $this->tokenText( array( 'mode' => 'block-expr', 'empty' => true ) );
                }
                $this->reap( '{/', "Unexpected characters found while parsing conditional text." );
                $m = $this->consume( new Regexp( '/([a-z][-_a-z0-9]*)\}/i' ) );
                $this->assert( $m, "Unexpected characters found while parsing a condition end." );
                $this->assert( $m->group( 1 ) === $cond->condition, "Condition end name did not match the name of the opened condition." );
                $nodes[] = $cond;
            } elseif ( $m = $this->consume( $this->rule->funcName ) ) {
                // func
                $func = new Token( 'func' );
                $func->name = $m->group( 1 );
                $func->text = $this->tokenText( array( 'mode' => 'expr' ) );
                $this->assert( $func->text, "Unexpected characters found while parsing a function expression." );
                $this->reap( '}', "Unexpected characters found while parsing a function." );
                $nodes[] = $func;
            } else {
                // subst
                $subst = new Token( 'subst' );
                $subst->text = $this->tokenText( array( 'mode' => 'expr' ) );
                $this->assert( $subst->text, "Unexpected characters found while parsing a substitution expression." );
                $this->reap( '}', "Unexpected characters found while parsing a substitution." );
                $nodes[] = $subst;
            }
        }

        if ( $options['emit'] ) {
            foreach ( $nodes as $node ) {
                $this->emit( $node );
            }
        }

        if ( !$options['empty'] && count( $nodes ) <= 0 ) {
            return false;
        }
        return $nodes;
    }

    /**
     * Build the pattern matching one run of plain text under the given options.
     *
     * The pattern is a negated character class: it matches one or more
     * characters that are none of the always-special characters nor any of
     * the characters the options forbid.
     *
     * @param array $options Fully resolved tokenText() options.
     * @param string $allquotes Every character that counts as a quote.
     * @return Regexp
     */
    protected function buildTextRegexp( array $options, $allquotes ) {
        $re = '/[^';
        // Never parse a < or >
        $re .= '<>';
        // & is handled exclusively by our entity handling code
        $re .= '&';
        // Curly braces are exclusively used by func, subst, and conditional syntax
        $re .= '{}';
        // If pipe is not permitted (eg: in a curly expr) don't allow it
        if ( !$options['|'] ) {
            $re .= '|';
        }
        // If = is not permitted (eg: in an unquoted attribute) don't allow it
        if ( !$options['='] ) {
            $re .= '=';
        }
        // If / is not permitted (eg: in an unquoted attribute) don't allow it
        if ( !$options['/'] ) {
            $re .= '\/';
        }
        // If whitespace is not permitted don't allow it
        if ( !$options['whitespace'] ) {
            $re .= '\s';
        }

        // Work out which quote characters are allowed; all others end the text.
        $quotes = $options['quotes'];
        if ( $quotes === true ) {
            $quotes = str_split( $allquotes );
        } elseif ( $quotes === false ) {
            $quotes = array();
        } else {
            $quotes = str_split( $quotes );
        }
        foreach ( array_diff( str_split( $allquotes ), $quotes ) as $quote ) {
            $re .= $quote;
        }

        $re .= ']+/';
        return new Regexp( $re );
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment