drslump · August 13, 2009 22:11
diff --git a/AnnotationsLexerGenerator.php b/AnnotationsLexerGenerator.php
 <?php
 /**
 * A runtime lexer generator which obtains the lexical rules from annotations.
 *
 * Copyright 2009 (c) Iván -DrSlump- Montes <[email protected]>
 * Released to the Public Domain
 * 
 * It supports the following annotations:
 *
 *  - @lexer-mode
 *  
 *  	Sets the matching mode for the lexing engine. It can be either "first"
 *  	or "longest". The former will consume the first rule that matches while
 *  	the later will try to match all the rules and use the one that matches
 *  	the longer amount of text.
 *  	This annotation can only be placed at the class level.
 *
 *  - @lexer-pattern:
 *
 *		Defines a pattern to use in rules. The first value is an identifier,
 *		the second is a PHP compatible regular expression without delimiters.
 *		This annotation can be placed at class and method levels.
 *  	
 *  - @lexer-rule:
 *
 *  	Associates the given rule with the class method just below its comment
 *  	block. It's composed of an optional lexer state name ("START" if not
 *  	defined) between < and > characters and one or more pattern identifiers or 
 *  	double quotes enclosed strings.
 *  	This annotation can only be placed at the method level.
 *
 * Example:
 *
 *  /**
 *   * @lexer-mode first
 *   * @lexer-pattern whitespace [ \t\r\n]+
 *   * /
 * 	class MyLexer extends Lexer {
 * 		/** @lexer-rule whitespace    +/
 *      function _space() {
 *      	return self::TOKEN_IGNORE;
 *      }
 *
 *      /** 
 *       * @lexer-pattern 	ident ([A-Za-z_]+)
 *       * @lexer-rule 		"$" ident
 *       +/
 *      function _ident($m) {
 *      	echo "Ident found: " . $m[1] . "\n";
 *			$this->_token = MyParser::TKN_IDENT;
 *		}
 *
 *		/** @lexer-rule "'"   +/
 *		function _stringStart() {
 *			$this->pushState('STRING');
 *			return self::TOKEN_IGNORE;
 *		}
 *
 *		/**
 *		 * @lexer-pattern 	text [^']*
 *		 * @lexer-rule		<STRING> text
 *		 +/
 * 		function _stringText() {
 *			$this->_token = MyParser::TKN_STRING;
 *		}
 *
 *		/** @lexer-rule	<STRING> "'"    +/
 *		function _stringEnd() {
 *			$this->popState();
 *		}
 *	}
 *
 */
 class Lexer implements Iterator {
 	
 	// Scanner modes
 	const MODE_FIRST = 'first';
 	const MODE_LONGEST = 'longest';
 	
 	/**
 	 * Consume the token and advance
 	 */
 	const TOKEN_ACCEPT = null;
 	/**
 	 * Repeat at the same position (in another state)
 	 */
 	const TOKEN_REPEAT = true;
 	/**
 	 * Ignore the token and advance
 	 */
 	const TOKEN_IGNORE = false;
 	/**
 	 * Repeat at the same position skiping the rule
 	 */
 	const TOKEN_SKIP = 'skip';
 	
 	// Cache the lexer configuration for each class used
 	private static $__lexers = array();
 	
 	// Internal data
 	private $__mode = self::MODE_FIRST;
 	private $__classname;
 	private $__object;
 	private $__stack = array();
 	private $__state;
 	
 	// Lexer current position in the data
 	protected $__line = 0;
 	protected $__pos = 0;
 	protected $__col = 0;
 	
 	protected $_data;
 	protected $_token;
 	protected $_value;
 	
 	/**
 	 * Constructor
 	 *
 	 * @param string $data				The text to scan
 	 * @param stirng | object $class 	Either a class name or an object
 	 */
 	 function __construct($data = '', $class = null)
 	{
 		$this->__state = 'START';
 		
 		$this->_data = $data;
 		
 		if (NULL === $class) {
 			$this->__classname = get_class($this);
 			$this->__object = $this;
 		} else if (is_string($class)) {
 			$this->__classname = $class;
 			$this->__object = $class;
 		} else {
 			$this->__classname = get_class($class);
 			$this->__object = $class;
 		}
 	}
 	
 	public function getLine()
 	{
 		return $this->__line;
 	}
 	
 	public function getOffset()
 	{
 		return $this->__pos;
 	}
 	
 	public function getColumn()
 	{
 		return $this->__col;
 	}
 	
 	/**
 	 * Changes the lexer state
 	 *
 	 * @param string $state	The new lexer state
 	 * @return void
 	 */
 	public function setState( $state )
 	{
 		$lexer = $this->__lexer();
 		if (!isset($lexer['states'][$state])) {
 			$states = array_keys($lexer['states']);
 			throw new Exception('State ' . $state . ' not found. Available states: "' .
 								 implode('", "', $states) . '"');
 		}
 		
 		$this->__state = $state;
 	}
 	
 	/**
 	 * Returns the current state
 	 *
 	 * @return string
 	 */
 	public function getState()
 	{
 		return $this->__state;
 	}
 	
 	/**
 	 * Pushes the current state into a stack and sets a new one
 	 *
 	 * @param string $state
 	 * @return void
 	 */
 	public function pushState( $state )
 	{
 		$this->__stack[] = $this->__state;
 		$this->setState($state);
 	}
 	
 	/**
 	 * Pops the top state from the stack and makes it the current one
 	 *
 	 * @return void
 	 */
 	public function popState()
 	{
 		if (!count($this->__stack)) {
 			throw new Exception('States stack is empty, unable to pop from state "' .
 								 $this->__state . '"');
 		}
 		$this->setState(array_pop($this->__stack));
 	}	
 	
 	/**
 	 * Returns the current token valud
 	 *
 	 * @return string
 	 * @see Iterator
 	 */
 	public function current()
 	{
 		return $this->_value;
 	}
 	
 	/**
 	 * Returns the current token identifier
 	 *
 	 * @return int
 	 * @see Iterator
 	 */
 	public function key()
 	{
 		return $this->_token;
 	}
 	
 	/**
 	 * Resets the lexer to make it as it was just initialized
 	 *
 	 * @param string | null $data 	If set changes the data string to tokenize
 	 * @see Iterator
 	 */
 	public function rewind()
 	{
 		$this->__state = 'START';
 		$this->__stack = array();
 		
 		$this->__line = 0;
 		$this->__pos = 0;
 		$this->__col = 0;
 		
 		$this->_token = null;
 		$this->_value = null;
 		
 	}
 	
 	/**
 	 * Tells if the lexer is ready to consume more tokens
 	 *
 	 * @return bool
 	 * @see Iterator
 	 */
 	public function valid()
 	{		
 		return $this->__pos < strlen($this->_data);
 	}
 	
 	/**
 	 * Consumes the next token from the data
 	 *
 	 * @return void
 	 * @see Iterator
 	 */
 	public function next()
 	{
 		$lexer = $this->__lexer();
 		
 		if ($lexer['mode'] === self::MODE_LONGEST) {
 			$this->__matchLongest();
 		} else {
 			$this->__matchFirst();
 		}
 	}
 	
 	/**
 	 * Use this method to apply a lexer definition to skip the runtime building
 	 * process.
 	 *
 	 * @param array $definition		A lexer definition
 	 * @return void
 	 * @see getLexerDefinition
 	 */	 
 	public function setLexerDefinition($definition)
 	{
 		self::$__lexers[ $this->__classname ] = $definition;
 		$this->rewind();
 	}
 	
 	/**
 	 * Returns the lexer definition so it can be persisted to improve performance
 	 *
 	 * @return array
 	 * @see setLexerDefinition
 	 */
 	public function getLexerDefinition()
 	{
 		return $this->__lexer();
 	}

 	/**
 	 * Tries to match the token with a first-to-match policy
 	 *
 	 * @return void
 	 */
 	private function __matchFirst()
 	{
 		// Check if the input is over
        if ($this->__pos >= strlen($this->_data)) {
            return;
        }
 		
 		$lexer = $this->__lexer();
 		$skip = array();
 		
        do {
 			$state = $this->__state;
 			
 			// Use the pregenerated regular expression
 			if (empty($skip)) {
 				$regexp = $lexer['regexps'][ $state ];				
 			// Build the regexp ignoring the skipped tokens
 			} else {
 				$regexp = array();
 				foreach($lexer['states'][$state] as $idx => $rule) {
 					if (!in_array($idx, $skip)) {
 						$regexp[] = $rule['regexp'];
 					} else {
 						// Add an impossible to match regexp in its place
 						$regexp[] = '$^';
 					}
 				}
 				$regexp = '@(' . implode(')|(', $regexp) . ')@Ax';
 			}
 			
 			if (preg_match($regexp, $this->_data, $submatches, null, $this->__pos)) {
 				// Remove empty sub-patterns
 				$matches = array_filter($submatches, 'strlen'); 			
                if (!count($matches)) {
                    throw new Exception(
 						'Lexer failed because a rule matched an empty string ' .
 						'in state "' . $this->__state . '"'
 					);
                }
 				
 				// Skip global match
                next($matches);
 				
 				// Get lexer token and value
                $idx = key($matches);
                $this->_value = $value = current($matches);				
 				
 				// Adjust subpatterns count to get the correct index
 				$token = 0; $subs = 1;
 				while ($subs < $idx) {
 					$subs += 1 + $lexer['states'][$state][$token]['subs'];
 					$token++;
 				}
 				
 				$this->_token = $token;
 				
 				// Extract submatches
 				$subs = $lexer['states'][$state][$token]['subs'];
 				$submatches = array_slice($submatches, $idx, $subs);
 				
 				// Check the rule result
 				$result = $this->__result($token, $value, $submatches);
 				if ($result === TRUE) {
 					return;
 				} else if ($result === FALSE) {
 					$skip = array();
 					continue;
 				} else {
 					$skip[] = $result;
 					continue;
 				}
 				
            } else {
                throw new Exception(
 					'Unexpected input at line' . $this->__line . ' column ' .
 					$this->__col . ': ' . $this->_data[$this->__pos]
 				);
            }
            break;
        } while (true);
 	}

 	
 	/**
 	 * Tries to match the token with a longest-match policy
 	 *
 	 * @return void
 	 */
 	public function __matchLongest()
 	{
 		// Check if the input is over
        if ($this->__pos >= strlen($this->_data)) {
            return false;
        }
 		
 		$lexer = $this->__lexer();
 		$skip = array();
 		
    	do {
 			$state = $this->__state;
 			$token = 0;
 			$value = false;
 			$submatches = array();
 			foreach ($lexer['states'][$state] as $i => $rule) {
 				$re = '@' . $rule['regexp'] . '@ASx';
 				if (!in_array($i, $skip) &&
 					preg_match($re, $this->_data, $matches, null, $this->__pos)) {
 					// Check if we found a longer token
 					if ( $value === false || strlen($matches[0]) > strlen($value) ) {
 						$value = $matches[0];
 						$submatches = $matches;
 						$token = $i;
 					}
 				}
 			}
 			
 	    	if ($value === false) {
 	            throw new Exception(
 					'Unexpected input at line ' . $this->_line . ' column ' .
 					$this->__col . ':  ' . $this->_data[$this->__pos]
 				);
 	    	}
 	    	
 			$this->_token = $token;
 	    	$this->_value = $value;
 			
 			// Check the rule result
 			$result = $this->__result($token, $value, $submatches);
 			if ($result === TRUE) {
 				return;
 			} else if ($result === FALSE) {
 				$skip = array();
 				continue;
 			} else {
 				$skip[] = $result;
 			}
 			
        } while (true);
 	}
 	
 	/**
 	 * Calls the associated method to a rule
 	 *
 	 * If this function returns TRUE means that the token has been accepted.
 	 * If the return value is FALSE means that the token should be ignored.
 	 * If it is an integer it means a rule index and it should be skipped.
 	 *
 	 * @param int $ruleIdx		The matched rule index
 	 * @param string $value		The matched value
 	 * @param array $submatches	The submatches to pass to the method
 	 * @return boolean | string
 	 */
 	protected function __result($ruleIdx, $value, $submatches)
 	{
 		$lexer = $this->__lexer();
 		
 		// Keep the current state to detect changes
 		$state = $this->__state;
 		
 		// Call the handler
 		$method = $lexer['states'][$state][$ruleIdx]['method'];
 		$result = call_user_func( array($this->__object, $method), $submatches );
 		
 		if ($result === self::TOKEN_ACCEPT) {
 			$this->__pos += strlen($value);
 			$this->__line += substr_count($value, "\n");
 			if ($pos = strrpos($value, "\n")) {
 				$this->__col = strlen($value) - $pos;
 			} else {
 				$this->__col += strlen($value);
 			}
 			
 			// accept this token
 			return true;
 		} elseif ($result === self::TOKEN_REPEAT) {
 			if ($state === $this->__state) {
 				throw new Exception('Rule signalled a repeat but didn\'t change the lexer state');
 			}
 			
 			// we have changed state process this token in the new state
 			$this->next();
 			return false;
 		
 		} elseif ($result === self::TOKEN_IGNORE) {
 			$this->__pos += strlen($value);
 			$this->__line += substr_count($value, "\n");
 			if ($pos = strrpos($value, "\n")) {
 				$this->__col = strlen($value) - $pos;
 			} else {
 				$this->__col += strlen($value);
 			}
 			
 			// end of input
 			if ($this->__pos >= strlen($this->_data)) {
 				return true; 
 			}
 			
 			// skip this token
 			return false;
 		} elseif ($result === self::TOKEN_SKIP) {
 			if ($state === $this->__state) {
 				return $ruleIdx;
 			} else {
 				// We have changed the state so this is like IGNORE
 				$this->next();
 				return true;
 			}
 		} else {
 			throw new Exception(
 				'Rule returned an invalid value. Only NULL, TRUE and FALSE ' .
 				'and "skip" values are allowed'
 			);
 		}		
 	}
 	
 	
 	/**
 	 * Returns the lexer data structure and triggers the build process if needed
 	 *
 	 * @return array 	The lexer data structure
 	 */
 	protected function __lexer()
 	{
 		if (!isset(self::$__lexers[$this->__classname])) {
 			self::$__lexers[$this->__classname] = $this->__build();
 		}
 		return self::$__lexers[$this->__classname];
 	}
 	
 	/**
 	 * Builds the lexer data structure by following the annotations found in the
 	 * target class
 	 *
 	 * @return array 	The data structure
 	 */
 	protected function __build()
 	{
 		$states = array();
 		$regexps = array();
 		$patterns = array();
 		$rules = array();
 		
 		$refl = new ReflectionClass($this->__classname);		
 		
 		// Get global configuration
 		$tags = $this->__getCommentTags( $refl->getDocComment() );
 		foreach($tags as $tag) {
 			if ($tag[0] === 'lexer-pattern') {
 				if (!preg_match('/^([A-Za-z_]+)\s+(.+)$/', $tag[1], $m)) {
 					throw new Exception('Malformed @lexer-alias: ' . $tag[1]);
 				}
 				
 				$patterns[$m[1]] = rtrim($m[2]);
 			} else if ($tag[0] === 'lexer-mode') {
 				if (!preg_match('/^(longest|first)\s*$/', $tag[1], $m)) {
 					throw new Exception('Malformed @lexer-mode. Only "longest" and "first" are allowed. Found: ' . $tag[1]);
 				}
 				
 				$this->__mode = $m[1] === 'first' ? self::MODE_FIRST : self::MODE_LONGEST;
 			}
 		}
 		
 		// Get function rules
 		$methods = $refl->getMethods();
 		foreach($methods as $method) {
 			$tags = $this->__getCommentTags( $method->getDocComment() );
 			foreach ($tags as $tag) {
 				
 				if ($tag[0] === 'lexer-pattern') {
 					
 					if (!preg_match('/^([A-Za-z_]+)\s+(.*)$/', $tag[1], $m)) {
 						throw new Exception('Malformed @lexer-alias: ' . $tag[1]);
 					}
 					
 					$patterns[$m[1]] = rtrim($m[2]);
 					
 				} else if ($tag[0] === 'lexer-rule') {
 					
 					if (!preg_match('/^(?:<([A-Za-z_]+)>\s+)?(.+)$/', $tag[1], $m)) {
 						throw new Exception('Malformed @lexer-parser: ' . $tag[1]);
 					}
 					
 					$rules[] = array(
 						'state'		=> $m[1] ? $m[1] : 'START',
 						'method' 	=> $method->getName(),
 						'expression'=> trim($m[2])
 					);
 				}
 			}
 		}
 		
 		// Build the states
 		foreach ($rules as $rule) {
 			// Create the state if doesn't exists yet
 			if (!isset($states[ $rule['state'] ])) {
 				$states[ $rule['state'] ]  = array();
 			}
 			
 			$expr = $rule['expression'] . ' ';
 			$regexp = '';
 			$pos = 0;
 			while (preg_match('/"([^"]+)"\s+|([A-Za-z_][A-Za-z0-9_]*)\s+/', $expr, $m, null, $pos)) {
 				$pos += strlen($m[0]);
 					
 				if (!empty($m[1])) {
 					$regexp .= preg_quote($m[1]);
 				} else {
 					if (!isset($patterns[$m[2]])) {
 						throw new Exception('Pattern not found: ' . $m[2]);
 					}
 					$regexp .= $patterns[$m[2]];
 				}
 			}
 			if ($pos < strlen($expr)) {
 				throw new Exception('Malformed rule expression: ' . $rule['expression']);
 			}
 			
 			// Calculate the number of subpatterns in the regular expression
 			// Remove escaped backslashes
 			$str = str_replace('\\\\', '', $regexp);
 			// Remove escaped parenthesis
 			$str = str_replace('\\(', '', $str);
 			// Remove non-capturing sets
 			$str = str_replace('(?:', '', $str);
 			$count = substr_count($str, '(');
 			
 			$states[ $rule['state'] ][] = array(
 				'method' => $rule['method'],
 				'regexp' => str_replace('@', '\@', $regexp),
 				'subs'   => $count
 			);
 		}
 		
 		// For first-match mode build the global regexps and mapping
 		if ($this->__mode === 'first') {
 			foreach ($states as $state => $rules) {
 				$res = array();
 				foreach($rules as $rule) {
 					$res[] = $rule['regexp'];
 				}
 				$regexps[$state] = '@(' . implode(')|(', $res) . ')@ASx';
 			}
 		}
 		
 		// Store the lexer
 		return array(
 			'mode'			=> $this->__mode,
 			'states' 		=> $states,
 			'regexps' 		=> $regexps					 
 		);
 	}
 	
 	/**
 	 * Parses a comment looking for annotations
 	 *
 	 * @param string $comment
 	 * @return array
 	 */
 	private function __getCommentTags($comment)
 	{
 		$tags = array();
 		
 		if (strpos($comment, '/**') !== 0) {
 			return $tags;
 		}
 		
 		// Remove start and end marks
 		$comment = substr($comment, 3, -2);
 		
 		$lines = explode("\n", $comment);
 		foreach ($lines as $ln) {
 			$ln = ltrim($ln, " \t*");
 			if (preg_match('/^@([A-Za-z_][A-Za-z_-]*)\s+(.*)$/', $ln, $m)) {
 				$tags[] = array( $m[1], $m[2] );
 			}
 		}
 		return $tags;
 	}
 	
 }
	<?php
	/**
	* A runtime lexer generator which obtains the lexical rules from annotations.
	*
	* Copyright 2009 (c) Iván -DrSlump- Montes <[email protected]>
	* Released to the Public Domain
	*
	* It supports the following annotations:
	*
	* - @lexer-mode
	*
	* Sets the matching mode for the lexing engine. It can be either "first"
	* or "longest". The former will consume the first rule that matches while
	* the later will try to match all the rules and use the one that matches
	* the longer amount of text.
	* This annotation can only be placed at the class level.
	*
	* - @lexer-pattern:
	*
	* Defines a pattern to use in rules. The first value is an identifier,
	* the second is a PHP compatible regular expression without delimiters.
	* This annotation can be placed at class and method levels.
	*
	* - @lexer-rule:
	*
	* Associates the given rule with the class method just below its comment
	* block. It's composed of an optional lexer state name ("START" if not
	* defined) between < and > characters and one or more pattern identifiers or
	* double quotes enclosed strings.
	* This annotation can only be placed at the method level.
	*
	* Example:
	*
	* /**
	* * @lexer-mode first
	* * @lexer-pattern whitespace [ \t\r\n]+
	* * /
	* class MyLexer extends Lexer {
	* /** @lexer-rule whitespace +/
	* function _space() {
	* return self::TOKEN_IGNORE;
	* }
	*
	* /**
	* * @lexer-pattern ident ([A-Za-z_]+)
	* * @lexer-rule "$" ident
	* +/
	* function _ident($m) {
	* echo "Ident found: " . $m[1] . "\n";
	* $this->_token = MyParser::TKN_IDENT;
	* }
	*
	* /** @lexer-rule "'" +/
	* function _stringStart() {
	* $this->pushState('STRING');
	* return self::TOKEN_IGNORE;
	* }
	*
	* /**
	* * @lexer-pattern text [^']*
	* * @lexer-rule <STRING> text
	* +/
	* function _stringText() {
	* $this->_token = MyParser::TKN_STRING;
	* }
	*
	* /** @lexer-rule <STRING> "'" +/
	* function _stringEnd() {
	* $this->popState();
	* }
	* }
	*
	*/
	class Lexer implements Iterator {

	// Scanner modes
	const MODE_FIRST = 'first';
	const MODE_LONGEST = 'longest';

	/**
	* Consume the token and advance
	*/
	const TOKEN_ACCEPT = null;
	/**
	* Repeat at the same position (in another state)
	*/
	const TOKEN_REPEAT = true;
	/**
	* Ignore the token and advance
	*/
	const TOKEN_IGNORE = false;
	/**
	* Repeat at the same position skiping the rule
	*/
	const TOKEN_SKIP = 'skip';

	// Cache the lexer configuration for each class used
	private static $__lexers = array();

	// Internal data
	private $__mode = self::MODE_FIRST;
	private $__classname;
	private $__object;
	private $__stack = array();
	private $__state;

	// Lexer current position in the data
	protected $__line = 0;
	protected $__pos = 0;
	protected $__col = 0;

	protected $_data;
	protected $_token;
	protected $_value;

	/**
	* Constructor
	*
	* @param string $data The text to scan
	* @param stirng \| object $class Either a class name or an object
	*/
	function __construct($data = '', $class = null)
	{
	$this->__state = 'START';

	$this->_data = $data;

	if (NULL === $class) {
	$this->__classname = get_class($this);
	$this->__object = $this;
	} else if (is_string($class)) {
	$this->__classname = $class;
	$this->__object = $class;
	} else {
	$this->__classname = get_class($class);
	$this->__object = $class;
	}
	}

	public function getLine()
	{
	return $this->__line;
	}

	public function getOffset()
	{
	return $this->__pos;
	}

	public function getColumn()
	{
	return $this->__col;
	}

	/**
	* Changes the lexer state
	*
	* @param string $state The new lexer state
	* @return void
	*/
	public function setState( $state )
	{
	$lexer = $this->__lexer();
	if (!isset($lexer['states'][$state])) {
	$states = array_keys($lexer['states']);
	throw new Exception('State ' . $state . ' not found. Available states: "' .
	implode('", "', $states) . '"');
	}

	$this->__state = $state;
	}

	/**
	* Returns the current state
	*
	* @return string
	*/
	public function getState()
	{
	return $this->__state;
	}

	/**
	* Pushes the current state into a stack and sets a new one
	*
	* @param string $state
	* @return void
	*/
	public function pushState( $state )
	{
	$this->__stack[] = $this->__state;
	$this->setState($state);
	}

	/**
	* Pops the top state from the stack and makes it the current one
	*
	* @return void
	*/
	public function popState()
	{
	if (!count($this->__stack)) {
	throw new Exception('States stack is empty, unable to pop from state "' .
	$this->__state . '"');
	}
	$this->setState(array_pop($this->__stack));
	}

	/**
	* Returns the current token valud
	*
	* @return string
	* @see Iterator
	*/
	public function current()
	{
	return $this->_value;
	}

	/**
	* Returns the current token identifier
	*
	* @return int
	* @see Iterator
	*/
	public function key()
	{
	return $this->_token;
	}

	/**
	* Resets the lexer to make it as it was just initialized
	*
	* @param string \| null $data If set changes the data string to tokenize
	* @see Iterator
	*/
	public function rewind()
	{
	$this->__state = 'START';
	$this->__stack = array();

	$this->__line = 0;
	$this->__pos = 0;
	$this->__col = 0;

	$this->_token = null;
	$this->_value = null;

	}

	/**
	* Tells if the lexer is ready to consume more tokens
	*
	* @return bool
	* @see Iterator
	*/
	public function valid()
	{
	return $this->__pos < strlen($this->_data);
	}

	/**
	* Consumes the next token from the data
	*
	* @return void
	* @see Iterator
	*/
	public function next()
	{
	$lexer = $this->__lexer();

	if ($lexer['mode'] === self::MODE_LONGEST) {
	$this->__matchLongest();
	} else {
	$this->__matchFirst();
	}
	}

	/**
	* Use this method to apply a lexer definition to skip the runtime building
	* process.
	*
	* @param array $definition A lexer definition
	* @return void
	* @see getLexerDefinition
	*/
	public function setLexerDefinition($definition)
	{
	self::$__lexers[ $this->__classname ] = $definition;
	$this->rewind();
	}

	/**
	* Returns the lexer definition so it can be persisted to improve performance
	*
	* @return array
	* @see setLexerDefinition
	*/
	public function getLexerDefinition()
	{
	return $this->__lexer();
	}

	/**
	* Tries to match the token with a first-to-match policy
	*
	* @return void
	*/
	private function __matchFirst()
	{
	// Check if the input is over
	if ($this->__pos >= strlen($this->_data)) {
	return;
	}

	$lexer = $this->__lexer();
	$skip = array();

	do {
	$state = $this->__state;

	// Use the pregenerated regular expression
	if (empty($skip)) {
	$regexp = $lexer['regexps'][ $state ];
	// Build the regexp ignoring the skipped tokens
	} else {
	$regexp = array();
	foreach($lexer['states'][$state] as $idx => $rule) {
	if (!in_array($idx, $skip)) {
	$regexp[] = $rule['regexp'];
	} else {
	// Add an impossible to match regexp in its place
	$regexp[] = '$^';
	}
	}
	$regexp = '@(' . implode(')\|(', $regexp) . ')@Ax';
	}

	if (preg_match($regexp, $this->_data, $submatches, null, $this->__pos)) {
	// Remove empty sub-patterns
	$matches = array_filter($submatches, 'strlen');
	if (!count($matches)) {
	throw new Exception(
	'Lexer failed because a rule matched an empty string ' .
	'in state "' . $this->__state . '"'
	);
	}

	// Skip global match
	next($matches);

	// Get lexer token and value
	$idx = key($matches);
	$this->_value = $value = current($matches);

	// Adjust subpatterns count to get the correct index
	$token = 0; $subs = 1;
	while ($subs < $idx) {
	$subs += 1 + $lexer['states'][$state][$token]['subs'];
	$token++;
	}

	$this->_token = $token;

	// Extract submatches
	$subs = $lexer['states'][$state][$token]['subs'];
	$submatches = array_slice($submatches, $idx, $subs);

	// Check the rule result
	$result = $this->__result($token, $value, $submatches);
	if ($result === TRUE) {
	return;
	} else if ($result === FALSE) {
	$skip = array();
	continue;
	} else {
	$skip[] = $result;
	continue;
	}

	} else {
	throw new Exception(
	'Unexpected input at line' . $this->__line . ' column ' .
	$this->__col . ': ' . $this->_data[$this->__pos]
	);
	}
	break;
	} while (true);
	}


	/**
	* Tries to match the token with a longest-match policy
	*
	* @return void
	*/
	public function __matchLongest()
	{
	// Check if the input is over
	if ($this->__pos >= strlen($this->_data)) {
	return false;
	}

	$lexer = $this->__lexer();
	$skip = array();

	do {
	$state = $this->__state;
	$token = 0;
	$value = false;
	$submatches = array();
	foreach ($lexer['states'][$state] as $i => $rule) {
	$re = '@' . $rule['regexp'] . '@ASx';
	if (!in_array($i, $skip) &&
	preg_match($re, $this->_data, $matches, null, $this->__pos)) {
	// Check if we found a longer token
	if ( $value === false \|\| strlen($matches[0]) > strlen($value) ) {
	$value = $matches[0];
	$submatches = $matches;
	$token = $i;
	}
	}
	}

	if ($value === false) {
	throw new Exception(
	'Unexpected input at line ' . $this->_line . ' column ' .
	$this->__col . ': ' . $this->_data[$this->__pos]
	);
	}

	$this->_token = $token;
	$this->_value = $value;

	// Check the rule result
	$result = $this->__result($token, $value, $submatches);
	if ($result === TRUE) {
	return;
	} else if ($result === FALSE) {
	$skip = array();
	continue;
	} else {
	$skip[] = $result;
	}

	} while (true);
	}

	/**
	* Calls the associated method to a rule
	*
	* If this function returns TRUE means that the token has been accepted.
	* If the return value is FALSE means that the token should be ignored.
	* If it is an integer it means a rule index and it should be skipped.
	*
	* @param int $ruleIdx The matched rule index
	* @param string $value The matched value
	* @param array $submatches The submatches to pass to the method
	* @return boolean \| string
	*/
	protected function __result($ruleIdx, $value, $submatches)
	{
	$lexer = $this->__lexer();

	// Keep the current state to detect changes
	$state = $this->__state;

	// Call the handler
	$method = $lexer['states'][$state][$ruleIdx]['method'];
	$result = call_user_func( array($this->__object, $method), $submatches );

	if ($result === self::TOKEN_ACCEPT) {
	$this->__pos += strlen($value);
	$this->__line += substr_count($value, "\n");
	if ($pos = strrpos($value, "\n")) {
	$this->__col = strlen($value) - $pos;
	} else {
	$this->__col += strlen($value);
	}

	// accept this token
	return true;
	} elseif ($result === self::TOKEN_REPEAT) {
	if ($state === $this->__state) {
	throw new Exception('Rule signalled a repeat but didn\'t change the lexer state');
	}

	// we have changed state process this token in the new state
	$this->next();
	return false;

	} elseif ($result === self::TOKEN_IGNORE) {
	$this->__pos += strlen($value);
	$this->__line += substr_count($value, "\n");
	if ($pos = strrpos($value, "\n")) {
	$this->__col = strlen($value) - $pos;
	} else {
	$this->__col += strlen($value);
	}

	// end of input
	if ($this->__pos >= strlen($this->_data)) {
	return true;
	}

	// skip this token
	return false;
	} elseif ($result === self::TOKEN_SKIP) {
	if ($state === $this->__state) {
	return $ruleIdx;
	} else {
	// We have changed the state so this is like IGNORE
	$this->next();
	return true;
	}
	} else {
	throw new Exception(
	'Rule returned an invalid value. Only NULL, TRUE and FALSE ' .
	'and "skip" values are allowed'
	);
	}
	}


	/**
	* Returns the lexer data structure and triggers the build process if needed
	*
	* @return array The lexer data structure
	*/
	protected function __lexer()
	{
	if (!isset(self::$__lexers[$this->__classname])) {
	self::$__lexers[$this->__classname] = $this->__build();
	}
	return self::$__lexers[$this->__classname];
	}

	/**
	* Builds the lexer data structure by following the annotations found in the
	* target class
	*
	* @return array The data structure
	*/
	protected function __build()
	{
	$states = array();
	$regexps = array();
	$patterns = array();
	$rules = array();

	$refl = new ReflectionClass($this->__classname);

	// Get global configuration
	$tags = $this->__getCommentTags( $refl->getDocComment() );
	foreach($tags as $tag) {
	if ($tag[0] === 'lexer-pattern') {
	if (!preg_match('/^([A-Za-z_]+)\s+(.+)$/', $tag[1], $m)) {
	throw new Exception('Malformed @lexer-alias: ' . $tag[1]);
	}

	$patterns[$m[1]] = rtrim($m[2]);
	} else if ($tag[0] === 'lexer-mode') {
	if (!preg_match('/^(longest\|first)\s*$/', $tag[1], $m)) {
	throw new Exception('Malformed @lexer-mode. Only "longest" and "first" are allowed. Found: ' . $tag[1]);
	}

	$this->__mode = $m[1] === 'first' ? self::MODE_FIRST : self::MODE_LONGEST;
	}
	}

	// Get function rules
	$methods = $refl->getMethods();
	foreach($methods as $method) {
	$tags = $this->__getCommentTags( $method->getDocComment() );
	foreach ($tags as $tag) {

	if ($tag[0] === 'lexer-pattern') {

	if (!preg_match('/^([A-Za-z_]+)\s+(.*)$/', $tag[1], $m)) {
	throw new Exception('Malformed @lexer-alias: ' . $tag[1]);
	}

	$patterns[$m[1]] = rtrim($m[2]);

	} else if ($tag[0] === 'lexer-rule') {

	if (!preg_match('/^(?:<([A-Za-z_]+)>\s+)?(.+)$/', $tag[1], $m)) {
	throw new Exception('Malformed @lexer-parser: ' . $tag[1]);
	}

	$rules[] = array(
	'state' => $m[1] ? $m[1] : 'START',
	'method' => $method->getName(),
	'expression'=> trim($m[2])
	);
	}
	}
	}

	// Build the states
	foreach ($rules as $rule) {
	// Create the state if doesn't exists yet
	if (!isset($states[ $rule['state'] ])) {
	$states[ $rule['state'] ] = array();
	}

	$expr = $rule['expression'] . ' ';
	$regexp = '';
	$pos = 0;
	while (preg_match('/"([^"]+)"\s+\|([A-Za-z_][A-Za-z0-9_]*)\s+/', $expr, $m, null, $pos)) {
	$pos += strlen($m[0]);

	if (!empty($m[1])) {
	$regexp .= preg_quote($m[1]);
	} else {
	if (!isset($patterns[$m[2]])) {
	throw new Exception('Pattern not found: ' . $m[2]);
	}
	$regexp .= $patterns[$m[2]];
	}
	}
	if ($pos < strlen($expr)) {
	throw new Exception('Malformed rule expression: ' . $rule['expression']);
	}

	// Calculate the number of subpatterns in the regular expression
	// Remove escaped backslashes
	$str = str_replace('\\\\', '', $regexp);
	// Remove escaped parenthesis
	$str = str_replace('\\(', '', $str);
	// Remove non-capturing sets
	$str = str_replace('(?:', '', $str);
	$count = substr_count($str, '(');

	$states[ $rule['state'] ][] = array(
	'method' => $rule['method'],
	'regexp' => str_replace('@', '\@', $regexp),
	'subs' => $count
	);
	}

	// For first-match mode build the global regexps and mapping
	if ($this->__mode === 'first') {
	foreach ($states as $state => $rules) {
	$res = array();
	foreach($rules as $rule) {
	$res[] = $rule['regexp'];
	}
	$regexps[$state] = '@(' . implode(')\|(', $res) . ')@ASx';
	}
	}

	// Store the lexer
	return array(
	'mode' => $this->__mode,
	'states' => $states,
	'regexps' => $regexps
	);
	}

	/**
	* Parses a comment looking for annotations
	*
	* @param string $comment
	* @return array
	*/
	private function __getCommentTags($comment)
	{
	$tags = array();

	if (strpos($comment, '/**') !== 0) {
	return $tags;
	}

	// Remove start and end marks
	$comment = substr($comment, 3, -2);

	$lines = explode("\n", $comment);
	foreach ($lines as $ln) {
	$ln = ltrim($ln, " \t*");
	if (preg_match('/^@([A-Za-z_][A-Za-z_-])\s+(.)$/', $ln, $m)) {
	$tags[] = array( $m[1], $m[2] );
	}
	}
	return $tags;
	}

	}