-
-
Save Opencontent/af33ee960dc1a942517e to your computer and use it in GitHub Desktop.
PHP: Nested Parenthesis Parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// @rodneyrehm
// http://stackoverflow.com/a/7917979/99923
/**
 * Parses a string with nested parentheses into a nested array.
 *
 * Every "(" opens a new nested array and every ")" closes it; the text found
 * between parentheses is stored verbatim (whitespace included) as string
 * tokens. For example "a (b (c)) d" becomes
 * array('a ', array('b ', array('c')), ' d').
 *
 * An instance may be reused: each call to parse() resets all internal state.
 */
class ParensParser
{
    // length (in bytes) of the string being parsed
    protected $length = null;
    // scopes that are currently open (parents of $current), innermost last
    protected $stack = null;
    // scope (nesting level) that tokens are currently being added to
    protected $current = null;
    // input string to parse
    protected $string = null;
    // current character offset in string
    protected $position = null;
    // offset just past the most recently seen ")"
    protected $lastPosition = 0;
    // offset where the pending text token starts, or null if none is pending
    protected $buffer_start = null;

    /**
     * Parse a parenthesised string into a nested array.
     *
     * A single pair of parentheses wrapping the WHOLE input is dropped, as it
     * carries no information. Unbalanced input is tolerated: scopes still open
     * at the end of the input are closed implicitly, and a ")" without a
     * matching "(" wraps everything collected so far into a nested array.
     *
     * @param string $string input to parse
     * @return array nested array of string tokens and sub-arrays; empty for
     *               null, false or the empty string
     */
    public function parse($string)
    {
        // Compare explicitly instead of relying on truthiness, which would
        // wrongly discard the perfectly valid input "0".
        if ($string === null || $string === false) {
            return array();
        }
        $string = (string) $string;
        if ($string === '') {
            return array();
        }

        if ($this->hasOuterParens($string)) {
            // kill the outer parens, as they're unnecessary
            // (cast: substr() may return false instead of "" before PHP 8)
            $string = (string) substr($string, 1, -1);
        }

        // reset every piece of state so that the parser instance can be reused
        $this->current = array();
        $this->stack = array();
        $this->string = $string;
        $this->length = strlen($this->string);
        $this->lastPosition = 0;
        $this->buffer_start = null;

        // look at each character
        for ($this->position = 0; $this->position < $this->length; $this->position++) {
            switch ($this->string[$this->position]) {
                case '(':
                    $this->push();
                    // push current scope to the stack and begin a new scope
                    array_push($this->stack, $this->current);
                    $this->current = array();
                    break;

                case ')':
                    $this->push();
                    $this->closeScope();
                    $this->lastPosition = $this->position + 1;
                    break;

                // Note: whitespace is deliberately not special-cased, so it
                // stays part of the surrounding token. Adding a `case ' ':`
                // that calls push() would make each word its own token.
                default:
                    // remember the offset to do a string capture later,
                    // which is cheaper than appending character by character
                    if ($this->buffer_start === null) {
                        $this->buffer_start = $this->position;
                    }
            }
        }

        // $this->position now equals $this->length, so push() flushes any text
        // that follows the last parenthesis - including a one-character tail.
        $this->push();

        // close scopes the input left open so that no collected token is lost
        while (!empty($this->stack)) {
            $this->closeScope();
        }

        return $this->current;
    }

    /**
     * Tell whether the string is wrapped in one pair of parentheses, i.e. the
     * leading "(" is closed by the very last character.
     *
     * "(a (b))" is wrapped, "(a) (b)" is not.
     *
     * @param string $string non-empty input string
     * @return bool
     */
    protected function hasOuterParens($string)
    {
        $length = strlen($string);
        if ($length < 2 || $string[0] !== '(' || $string[$length - 1] !== ')') {
            return false;
        }

        $depth = 0;
        for ($i = 0; $i < $length; $i++) {
            if ($string[$i] === '(') {
                $depth++;
            } elseif ($string[$i] === ')') {
                $depth--;
                if ($depth === 0) {
                    // the leading "(" is closed here; it only wraps the whole
                    // input if this is the final character
                    return $i === $length - 1;
                }
            }
        }

        // the leading "(" is never closed
        return false;
    }

    /**
     * Close the current scope: append it to its parent scope, which then
     * becomes the current scope again.
     */
    protected function closeScope()
    {
        // save current scope
        $closed = $this->current;
        // get the last scope from stack; array_pop() yields null when there is
        // no open parent (unmatched ")"), in which case start from an empty one
        $parent = array_pop($this->stack);
        if ($parent === null) {
            $parent = array();
        }
        // add just saved scope to its parent
        $parent[] = $closed;
        $this->current = $parent;
    }

    /**
     * Flush the pending text token (if any) into the current scope.
     *
     * The token spans from $buffer_start up to, but not including, $position.
     */
    protected function push()
    {
        if ($this->buffer_start !== null) {
            // extract string from buffer start to current position
            $buffer = substr($this->string, $this->buffer_start, $this->position - $this->buffer_start);
            // clean buffer
            $this->buffer_start = null;
            // throw token into current scope
            $this->current[] = $buffer;
        }
    }
}
// Demo: run the parser on a sample nested-parentheses string and dump the
// resulting nested array.
$sentence = '(TOP (S (NP (PRP I)) (VP (VBP love) (NP (NP (DT a) (JJ big) (NN bed)) (PP (IN of) (NP (NNS roses))))) (. .)))';
$parser = new ParensParser();
$tree = $parser->parse($sentence);
var_dump($tree);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment