Created
February 4, 2013 22:28
-
-
Save xeoncross/4710324 to your computer and use it in GitHub Desktop.
PHP: Nested Parenthesis Parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// @rodneyrehm
// http://stackoverflow.com/a/7917979/99923
/**
 * Parses a string with nested parentheses into a nested array.
 *
 * Every "(...)" group becomes a sub-array; the text between parentheses is
 * kept verbatim (whitespace included) as string tokens. For example
 * "(a (b) c)" yields array('a ', array('b'), ' c').
 *
 * The parser is lenient with unbalanced input: a stray ")" wraps what has
 * been collected so far, and groups still open at the end of the input are
 * closed implicitly.
 */
class ParensParser
{
    // scopes that enclose the current one (one saved array per open paren)
    protected $stack = null;
    // scope (array of tokens / sub-arrays) currently being filled
    protected $current = null;
    // input string to parse
    protected $string = null;
    // byte length of the input string
    protected $length = null;
    // current byte offset in string
    protected $position = null;
    // offset where the pending text token starts, null when nothing is pending
    protected $buffer_start = null;

    /**
     * Parse $string into a nested array.
     *
     * A single pair of parentheses wrapping the whole input is dropped, so
     * "(a b)" and "a b" produce the same result.
     *
     * @param string $string the text to parse
     *
     * @return array tokens (strings) and nested groups (arrays) in input order
     */
    public function parse($string)
    {
        // compare explicitly: a plain truthiness test would also reject "0"
        if ($string === null || $string === false || $string === '') {
            // no string, no data
            return array();
        }
        $string = (string) $string;

        if ($this->isWrappedInParens($string)) {
            // kill the outer parens, as they're unnecessary; the cast guards
            // against substr() returning false for "()" on older PHP versions
            $string = (string) substr($string, 1, -1);
        }

        $this->current = array();
        $this->stack = array();
        $this->string = $string;
        $this->length = strlen($this->string);
        // a previous call must not leak a pending token into this one
        $this->buffer_start = null;

        // look at each character
        for ($this->position = 0; $this->position < $this->length; $this->position++) {
            switch ($this->string[$this->position]) {
                case '(':
                    $this->push();
                    // push current scope to the stack and begin a new scope
                    $this->stack[] = $this->current;
                    $this->current = array();
                    break;

                case ')':
                    $this->push();
                    $this->closeScope();
                    break;

                // To make each word its own token, add a `case ' ':` here
                // that calls $this->push() and breaks.

                default:
                    // remember the offset to do a string capture later;
                    // cheaper than appending character by character
                    if ($this->buffer_start === null) {
                        $this->buffer_start = $this->position;
                    }
            }
        }

        // flush text that follows the last parenthesis
        $this->push();

        // implicitly close groups that were never closed, so their content
        // is not lost
        while ($this->stack) {
            $this->closeScope();
        }

        return $this->current;
    }

    /**
     * Flush the pending text token (if any) into the current scope.
     *
     * The token spans from $buffer_start up to, but not including, the
     * current position.
     */
    protected function push()
    {
        if ($this->buffer_start !== null) {
            // extract string from buffer start to current position
            $buffer = substr($this->string, $this->buffer_start, $this->position - $this->buffer_start);
            // clean buffer
            $this->buffer_start = null;
            // throw token into current scope
            $this->current[] = $buffer;
        }
    }

    /**
     * Finish the current scope and append it to its parent scope, which
     * becomes the current scope again.
     */
    protected function closeScope()
    {
        $closed = $this->current;
        $parent = array_pop($this->stack);
        if ($parent === null) {
            // unmatched ")": there is no parent, start from an empty one
            $parent = array();
        }
        $parent[] = $closed;
        $this->current = $parent;
    }

    /**
     * Tell whether the first character is a "(" whose matching ")" is the
     * very last character, i.e. one pair of parens wraps the entire string.
     *
     * @param string $string
     *
     * @return bool
     */
    protected function isWrappedInParens($string)
    {
        $last = strlen($string) - 1;
        if ($last < 1 || $string[0] !== '(' || $string[$last] !== ')') {
            return false;
        }

        $depth = 0;
        for ($i = 0; $i < $last; $i++) {
            if ($string[$i] === '(') {
                $depth++;
            } elseif ($string[$i] === ')') {
                $depth--;
                if ($depth === 0) {
                    // the opening paren was closed before the end: "(a)(b)"
                    return false;
                }
            }
        }

        // exactly the first paren is still open, so the final ")" closes it
        return $depth === 1;
    }
}
// Demo: parse a bracketed sample string and dump the resulting nested array.
$parser = new ParensParser();
$sample = '(TOP (S (NP (PRP I)) (VP (VBP love) (NP (NP (DT a) (JJ big) (NN bed)) (PP (IN of) (NP (NNS roses))))) (. .)))';
var_dump($parser->parse($sample));
awesome! thanks
Exactly what I need! I am extracting affiliations from scientific publications, such as:
"Palmén, Rachel (Universitat Oberta de Catalunya (UOC) (Open University of Catalonia), Barcelona, Spain)"
Best,
per funke at gmail dot com
Alternatively, you can use:
/**
 * Parse a string with nested square brackets into a tree of items.
 *
 * Each non-empty "[...]" group found at the top level of $subject becomes
 * one item: 'value' holds the group's text up to its first nested "[", and
 * 'children' (present only when the group contains nested groups) holds the
 * items parsed recursively from the group's content. Text outside of any
 * bracket group and empty groups ("[]") produce no item.
 *
 * @param string $subject
 *   The subject string.
 *
 * @return array
 *   The list of items, each an array with 'value' and optionally 'children'.
 *
 * @throws \RuntimeException
 *   When the regular expression engine fails on $subject.
 */
private function parse(string $subject)
{
    // Either a run of non-bracket characters, or a bracket group whose
    // content (captured as "nested") is matched by recursing into the
    // whole pattern.
    $matched = \preg_match_all('~[^\[\]]+|\[(?<nested>(?R)*)\]~', $subject, $matches);
    if (false === $matched) {
        throw new \RuntimeException(
            'Unable to parse the subject (PCRE error code ' . \preg_last_error() . ').'
        );
    }

    $result = [];
    foreach ($matches['nested'] as $match) {
        // Skip only truly empty captures; a bare array_filter() would also
        // discard the valid content "0".
        if ('' === $match) {
            continue;
        }

        $position = \strpos($match, '[');
        $item = [
            'value' => false === $position ? $match : \substr($match, 0, $position),
        ];

        $children = $this->parse($match);
        if ([] !== $children) {
            $item['children'] = $children;
        }

        $result[] = $item;
    }

    return $result;
}
This has been used in phptree.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
thank you.