Created
February 23, 2012 18:56
-
-
Save hakre/1894360 to your computer and use it in GitHub Desktop.
TextRange and TextRangeTrimmer HTML/DOMDocument text processing classes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * TextRange.php - DOMDocument based TextRange and text manipulation
 *
 * @author hakre <http://hakre.wordpress.com/credits>
 * @version 1.1.1
 */
/**
 * TextRange - Collection of DOMText nodes
 *
 * Treats an ordered list of DOMText nodes as one continuous UTF-8 string
 * that can be measured, read character by character and split at
 * character offsets.
 *
 * @since 1.0.0
 */
class TextRange implements Countable
{
    /**
     * @var DOMText[] zero-based, consecutively indexed list of text nodes
     */
    private $nodes;

    /**
     * @param array|DOMNodeList|DOMNode $nodes DOMText nodes, see setNodes()
     * @throws InvalidArgumentException
     */
    public function __construct($nodes)
    {
        $this->setNodes($nodes);
    }

    /**
     * Replace the nodes of this range.
     *
     * A DOMNode is replaced by all text nodes found below it, a DOMNodeList
     * is converted to an array, and an array is taken as given. In every
     * case each entry must be a DOMText.
     *
     * @param array|DOMNodeList|DOMElement $nodes DOMText nodes
     * @throws InvalidArgumentException
     * @since 1.1.0
     */
    public function setNodes($nodes)
    {
        if ($nodes instanceof DOMNode) {
            $nodes = $this->getChildTextNodes($nodes);
        }

        if ($nodes instanceof DOMNodeList) {
            $nodes = iterator_to_array($nodes);
        }

        if (!is_array($nodes)) {
            throw new InvalidArgumentException('Not an array/DOMNodeList/DOMElement.');
        }

        // splitText() hands the foreach key to array_splice() as a position,
        // so the list has to be indexed 0..n-1 regardless of the keys the
        // caller used.
        $this->nodes = array_values($this->validateDOMTextArray($nodes));
    }

    /**
     * @return DOMText[] nodes of this range, in order
     */
    public function getNodes()
    {
        return $this->nodes;
    }

    /**
     * Collect all text nodes below a node, in document order.
     *
     * @param DOMNode $DOMNode context node
     * @return DOMNodeList list of DOMText nodes
     * @throws InvalidArgumentException if the node does not belong to a document
     * @throws RuntimeException if the XPath query fails
     */
    private function getChildTextNodes(DOMNode $DOMNode)
    {
        // ownerDocument is NULL for a DOMDocument: the document is its own owner.
        $document = $DOMNode instanceof DOMDocument ? $DOMNode : $DOMNode->ownerDocument;

        if (!$document instanceof DOMDocument) {
            throw new InvalidArgumentException('DOMNode does not belong to a DOMDocument.');
        }

        $xp = new DOMXPath($document);
        $textNodes = $xp->query('.//child::text()', $DOMNode);

        if (!$textNodes) {
            throw new RuntimeException('XPath query to obtain DOMText childnodes failed.');
        }

        return $textNodes;
    }

    /**
     * @return string[] node values, one entry per text node
     */
    public function getStrings()
    {
        $strings = array();

        foreach ($this->nodes as $node) {
            $strings[] = $node->nodeValue;
        }

        return $strings;
    }

    /**
     * Ensure every entry of $list is a DOMText.
     *
     * @param array $list
     * @return array the unchanged $list
     * @throws InvalidArgumentException on the first entry that is not a DOMText
     */
    private function validateDOMTextArray(array $list)
    {
        foreach ($list as $node) {
            if (!$node instanceof DOMText) {
                throw new InvalidArgumentException('Not a DOMText');
            }
        }

        return $list;
    }

    /**
     * Cast $offset to integer and check it against the bounds of the range.
     *
     * @param mixed $offset
     * @return int offset within 0 .. string length (both inclusive)
     * @throws BadMethodCallException if the range has no nodes
     * @throws OutOfBoundsException if the offset is outside the range
     */
    private function validateOffsetArgument($offset)
    {
        if (!$this->nodes) {
            throw new BadMethodCallException('Range has no elements.');
        }

        $offset = (int)$offset;
        if ($offset < 0) {
            throw new OutOfBoundsException('Invalid offset (<0).');
        }

        $length = $this->utf8StrLen($this);
        if ($offset > $length) {
            throw new OutOfBoundsException(sprintf('Invalid offset (>%d).', $length));
        }

        return $offset;
    }

    /**
     * Split (cutoff) at offset.
     *
     * This range keeps the nodes in front of the offset; the nodes behind it
     * are returned as a new range. A negative offset is counted from the end
     * of the string.
     *
     * @param int|string $offset UTF-8 character offset or string (of which the length is taken as $offset)
     * @return TextRange new remainder range
     * @throws BadMethodCallException if the range has no nodes
     * @throws OutOfBoundsException if the offset is outside the range
     */
    public function split($offset)
    {
        if (is_string($offset)) {
            $offset = $this->utf8StrLen($offset);
        }

        if ($offset < 0) {
            $offset = $this->stringLength() + $offset;
        }

        $index = $this->splitText($offset);

        if ($index === count($this)) {
            $new = array(); // virtually the next node
        } else {
            // array_splice() removes the tail from this range and returns it
            $new = array_splice($this->nodes, $index);
        }

        return new TextRange($new);
    }

    /**
     * @return int string length (UTF-8)
     */
    public function stringLength()
    {
        return $this->utf8StrLen($this);
    }

    /**
     * Count the characters of a UTF-8 string.
     *
     * @param string|TextRange $string anything that can be cast to string
     * @return int number of matches of the pattern (.) in UTF-8 mode
     */
    private function utf8StrLen($string)
    {
        // the replacement result is discarded, only the match count is used
        preg_filter('(.)su', '', $string, -1, $count);

        return $count;
    }

    /**
     * Make $offset a node boundary, splitting a text node if necessary.
     *
     * When the offset falls inside a text node, that node is split in the
     * document and the new node is inserted into this range directly after
     * it.
     *
     * @param int $offset UTF-8 character offset
     * @return int index of new node / offset start node
     * @throws BadMethodCallException if the range has no nodes
     * @throws OutOfBoundsException if the offset is outside the range
     */
    public function splitText($offset)
    {
        $offset = $this->validateOffsetArgument($offset);
        $runOffset = 0; // offset of the current node's first character

        foreach ($this->nodes as $index => $node) {
            $len = $this->utf8StrLen($node->nodeValue);

            // at the start of a text node
            if ($offset === $runOffset) {
                return $index;
            }

            // at the end of a text node, it's the next node (can be virtual)
            if ($offset === $runOffset + $len) {
                return $index + 1;
            }

            // match, split this node
            if ($offset > $runOffset && $offset < $runOffset + $len) {
                $splitAt = $offset - $runOffset;
                $newNode = $node->splitText($splitAt);
                array_splice($this->nodes, $index + 1, 0, array($newNode));

                return $index + 1;
            }

            $runOffset += $len;
        }

        throw new Exception('Implementation Error - should never come here, check input validation or function code.');
    }

    /**
     * @return string concatenated values of all text nodes
     */
    public function __toString()
    {
        return implode('', $this->getStrings());
    }

    /**
     * @param int $offset UTF-8 character offset, passed on to mb_substr()
     * @return string the single character mb_substr() returns for $offset
     */
    public function getCharacter($offset)
    {
        return mb_substr((string)$this, $offset, 1, 'UTF-8');
    }

    /**
     * @return int number of text nodes in this range
     */
    // Parsed as a comment before PHP 8; from PHP 8.1 on it silences the
    // deprecation notice for the missing int return type of Countable::count().
    #[\ReturnTypeWillChange]
    public function count()
    {
        return count($this->nodes);
    }
}
/**
 * TextRangeTrimmer - trim, ltrim and rtrim for TextRange
 *
 * Removes leading and/or trailing characters from a TextRange by taking the
 * affected text nodes out of their document.
 *
 * @since 1.0.0
 */
class TextRangeTrimmer
{
    /**
     * @var TextRange
     */
    private $range;

    /**
     * @var array characters to trim as array keys (flipped list)
     */
    private $charlist;

    /**
     * @param TextRange $range range to trim
     * @param array|null $charlist list of UTF-8 encoded characters, see setCharlist()
     * @throws InvalidArgumentException
     */
    public function __construct(TextRange $range, array $charlist = NULL)
    {
        $this->range = $range;
        $this->setCharlist($charlist);
    }

    /**
     * Set the characters to trim.
     *
     * NULL selects space, tab, line feed, carriage return, NUL and vertical
     * tab. Empty strings are dropped. Entries are compared against single
     * characters of the range, so an entry longer than one character never
     * matches.
     *
     * @param array $charlist list of UTF-8 encoded characters
     * @throws InvalidArgumentException
     */
    public function setCharlist(array $charlist = NULL)
    {
        if (NULL === $charlist) {
            $charlist = str_split(" \t\n\r\0\x0B");
        }

        $list = array();
        foreach ($charlist as $char) {
            if (!is_string($char)) {
                throw new InvalidArgumentException('Not an array of strings.');
            }
            if (strlen($char)) {
                $list[] = $char;
            }
        }

        // flipped, so that membership is a single isset() lookup
        $this->charlist = array_flip($list);
    }

    /**
     * @return string[] characters
     */
    public function getCharlist()
    {
        // PHP turns array keys like "7" into integers, cast them back
        return array_map('strval', array_keys($this->charlist));
    }

    /**
     * Trim both ends of the range. Does nothing if the charlist is empty.
     */
    public function trim()
    {
        if (!$this->charlist) {
            return;
        }

        $this->ltrim();
        $this->rtrim();
    }

    /**
     * Remove leading charlist characters from the range and the document.
     */
    public function ltrim()
    {
        $count = $this->lengthOfCharacterSequence($this->charlist, 0);
        if (!$count) {
            return;
        }

        // after the split the range holds exactly the characters to remove
        $remainder = $this->range->split($count);
        $this->removeFromDocument($this->range->getNodes());
        $this->range->setNodes($remainder->getNodes());
    }

    /**
     * Remove trailing charlist characters from the range and the document.
     */
    public function rtrim()
    {
        $count = $this->lengthOfCharacterSequence($this->charlist, -1, -1);
        if (!$count) {
            return;
        }

        // split() leaves the kept text in the range and returns the tail
        $chop = $this->range->split(-$count);
        $this->removeFromDocument($chop->getNodes());
    }

    /**
     * Detach text nodes from their parents.
     *
     * @param array $textNodes DOMText nodes
     */
    private function removeFromDocument(array $textNodes)
    {
        foreach ($textNodes as $textNode) {
            $parent = $textNode->parentNode;

            // a node that is not attached anywhere has no parent to remove it from
            if ($parent) {
                $parent->removeChild($textNode);
            }
        }
    }

    /**
     * Number of consecutive characters of $charlist from $start to $direction.
     *
     * @param array $charlist characters as array keys
     * @param int $start offset, negative values count from the end of the string
     * @param int $direction 1: forward, -1: backward
     * @return int
     * @throws InvalidArgumentException
     */
    private function lengthOfCharacterSequence(array $charlist, $start, $direction = 1)
    {
        $start = (int)$start;
        $direction = max(-1, min(1, $direction));
        if (!$direction) {
            throw new InvalidArgumentException('Direction must be 1 or -1.');
        }

        // build the string once instead of once per inspected character
        $text = (string)$this->range;
        $length = mb_strlen($text, 'UTF-8');
        $position = $start < 0 ? $length + $start : $start;

        // The scan is bounded on both sides on purpose: mb_substr() clamps a
        // negative start beyond the beginning to 0, so walking backward past
        // the first character would keep returning it and never terminate.
        $count = 0;
        while ($position >= 0 && $position < $length) {
            $char = mb_substr($text, $position, 1, 'UTF-8');
            if ($char === '' || !isset($charlist[$char])) {
                break;
            }
            $position += $direction;
            $count++;
        }

        return $count;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment