Created
October 27, 2011 08:59
-
-
Save fetus-hina/1319099 to your computer and use it in GitHub Desktop.
N-gram
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Read-only, lazily evaluated sequence of the character n-grams of a string.
 *
 * The n-grams are the overlapping substrings of $n characters obtained by
 * sliding a window one character at a time over the text. Characters are
 * counted with the mbstring functions, so multi-byte text is split on
 * character boundaries rather than bytes.
 *
 * A non-empty text that is shorter than $n yields exactly one (short) chunk
 * containing the whole text. An empty text, or a chunk size below 1, yields
 * an empty sequence.
 *
 * The object can be counted, iterated with foreach, and read with integer
 * array offsets. It cannot be modified through array access.
 */
class Text_Ngram implements Countable, SeekableIterator, ArrayAccess {
    private
        $text = '',          // source text, always a string
        $chunk_size = 0,     // number of characters per n-gram
        $max_count = 0,      // number of n-grams available
        $charset = 'UTF-8',  // encoding name handed to the mb_* functions
        $current = 0;        // iterator cursor

    /**
     * @param mixed  $text    Text to split; cast to string.
     * @param int    $n       Number of characters per chunk; cast to int.
     * @param string $charset Encoding of $text, or 'AUTO' to use
     *                        mb_internal_encoding().
     */
    public function __construct($text, $n, $charset = 'AUTO') {
        if($charset === 'AUTO') {
            $charset = mb_internal_encoding();
        }
        $this->text = (string)$text;
        $this->charset = (string)$charset;
        $this->chunk_size = (int)$n;

        // Test the *cast* text: null, false or a stringable object producing
        // '' must count as empty too. A chunk size below 1 has no meaningful
        // n-grams, so it is treated as an empty sequence as well.
        if($this->text === '' || $this->chunk_size < 1) {
            $this->max_count = 0;
        } else {
            // len - (n - 1) windows fit in the text; a text shorter than n
            // still yields one chunk holding the whole text.
            $this->max_count = max(1, mb_strlen($this->text, $this->charset) - ($this->chunk_size - 1));
        }
    }

    /**
     * Countable: number of n-grams in the sequence.
     *
     * @return int
     */
    #[\ReturnTypeWillChange]
    public function count() {
        return $this->max_count;
    }

    /**
     * Iterator: n-gram at the cursor, or false when the cursor is out of range.
     *
     * @return string|false
     */
    #[\ReturnTypeWillChange]
    public function current() {
        return $this->get($this->current);
    }

    /**
     * Iterator: zero-based position of the cursor.
     *
     * @return int
     */
    #[\ReturnTypeWillChange]
    public function key() {
        return $this->current;
    }

    /**
     * Iterator: advance the cursor by one position.
     */
    #[\ReturnTypeWillChange]
    public function next() {
        ++$this->current;
    }

    /**
     * Iterator: move the cursor back to the first n-gram.
     */
    #[\ReturnTypeWillChange]
    public function rewind() {
        $this->seek(0);
    }

    /**
     * Iterator: whether the cursor points at an existing n-gram.
     *
     * @return bool
     */
    #[\ReturnTypeWillChange]
    public function valid() {
        return $this->offsetExists($this->current);
    }

    /**
     * SeekableIterator: move the cursor to $pos (cast to int).
     *
     * No range check is performed here; an out-of-range position simply
     * makes valid() return false.
     *
     * @param int $pos
     */
    #[\ReturnTypeWillChange]
    public function seek($pos) {
        $this->current = (int)$pos;
    }

    /**
     * ArrayAccess: whether $offset is an integer index inside the sequence.
     *
     * @param mixed $offset
     * @return bool
     */
    #[\ReturnTypeWillChange]
    public function offsetExists($offset) {
        return is_int($offset) && (0 <= $offset) && ($offset < $this->count());
    }

    /**
     * ArrayAccess: n-gram at $offset, or false when $offset does not exist.
     *
     * @param mixed $offset
     * @return string|false
     */
    #[\ReturnTypeWillChange]
    public function offsetGet($offset) {
        return $this->get($offset);
    }

    /**
     * ArrayAccess: the sequence is read-only, so assignment is ignored.
     *
     * @param mixed $offset
     * @param mixed $value
     */
    #[\ReturnTypeWillChange]
    public function offsetSet($offset, $value) {
    }

    /**
     * ArrayAccess: the sequence is read-only, so unsetting is ignored.
     *
     * @param mixed $offset
     */
    #[\ReturnTypeWillChange]
    public function offsetUnset($offset) {
    }

    /**
     * All n-grams as a zero-indexed array.
     *
     * Uses a local index instead of foreach($this) so the iterator cursor is
     * left untouched; this makes it safe to call while iterating the object.
     *
     * @return array
     */
    public function toArray() {
        $retval = array();
        for($i = 0; $i < $this->max_count; ++$i) {
            $retval[] = $this->get($i);
        }
        return $retval;
    }

    /**
     * All n-grams joined into one string.
     *
     * @param string $glue Separator placed between consecutive n-grams.
     * @return string
     */
    public function toString($glue = ' ') {
        return implode($glue, $this->toArray());
    }

    /**
     * String cast: the n-grams joined by a single space.
     *
     * @return string
     */
    public function __toString() {
        return $this->toString();
    }

    /**
     * Extract the n-gram starting at character position $offset.
     *
     * @param mixed $offset
     * @return string|false False when $offset is not a valid index.
     */
    private function get($offset) {
        if(!$this->offsetExists($offset)) {
            return false;
        }
        return mb_substr($this->text, $offset, $this->chunk_size, $this->charset);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment