Skip to content

Instantly share code, notes, and snippets.

@masakielastic
Created December 8, 2012 15:52
Show Gist options
  • Save masakielastic/4240816 to your computer and use it in GitHub Desktop.
Save masakielastic/4240816 to your computer and use it in GitHub Desktop.
Refactored and clarified the logic for checking the lead byte and trail bytes.
/**
 * Scans a byte string as UTF-8 and swaps every ill-formed byte sequence for
 * a substitute string.
 */
class Escaper
{
    /**
     * Returns a copy of $str in which each ill-formed byte sequence has been
     * replaced by $substitute.
     *
     * @param string $str        Byte string to scan.
     * @param string $substitute Replacement emitted once per ill-formed
     *                           sequence (defaults to the UTF-8 encoding of U+FFFD).
     *
     * @return string
     */
    public function replaceInvalidByteSequence($str, $substitute = "\xEF\xBF\xBD")
    {
        $output = '';
        $total = strlen($str);

        // Each step advances by the number of bytes the last sequence consumed.
        for ($offset = 0; $offset < $total; $offset += $sequence['bytesize']) {
            $sequence = $this->getNextByteSequence($str, $offset);
            $output .= $sequence['valid'] ? $sequence['byte'] : $substitute;
        }

        return $output;
    }

    /**
     * Reads the byte sequence that starts at offset $pos.
     *
     * @param string $str Byte string to read from.
     * @param int    $pos Offset of the lead byte.
     *
     * @return array Keys: 'valid' (bool), 'bytesize' (int, the lead byte plus
     *               the trail bytes that were accepted) and 'byte' (string,
     *               the bytes that were consumed).
     */
    public function getNextByteSequence($str, $pos)
    {
        $lead = $str[$pos];
        $spec = $this->getTailSpec($lead);

        // A rejected lead byte or a single-byte character occupies exactly one byte.
        if (false === $spec['valid'] || 0 === $spec['length']) {
            return [
                'valid' => $spec['valid'],
                'bytesize' => 1,
                'byte' => $lead,
            ];
        }

        $expected = $spec['length'];
        $bounds = $spec['range'];
        $trail = substr($str, $pos + 1, $expected);
        $consumed = $lead;
        $accepted = 0;

        // Accept trail bytes until one is missing or falls outside its [low, high] pair.
        while ($accepted < $expected) {
            if (!isset($trail[$accepted])) {
                break;
            }

            $candidate = $trail[$accepted];
            if ($candidate < $bounds[2 * $accepted] || $bounds[2 * $accepted + 1] < $candidate) {
                break;
            }

            $consumed .= $candidate;
            $accepted += 1;
        }

        // The sequence is well-formed only when every expected trail byte was accepted.
        return [
            'valid' => $expected === $accepted,
            'bytesize' => 1 + $accepted,
            'byte' => $consumed,
        ];
    }

    /**
     * Describes the trail bytes that must follow a given lead byte.
     *
     * @param string $lead String whose first byte is the lead byte.
     *
     * @return array Keys: 'valid' (bool), 'length' (int, number of trail
     *               bytes) and 'range' (string of low/high byte pairs, one
     *               pair per trail byte).
     */
    public function getTailSpec($lead)
    {
        $code = ord($lead[0]);

        // Rows: [lowest lead byte, highest lead byte, trail length, trail ranges].
        $table = [
            [0x00, 0x7F, 0, ''],
            [0xC2, 0xDF, 1, "\x80\xBF"],
            [0xE0, 0xE0, 2, "\xA0\xBF\x80\xBF"],
            [0xE1, 0xEC, 2, "\x80\xBF\x80\xBF"],
            [0xED, 0xED, 2, "\x80\x9F\x80\xBF"],
            [0xEE, 0xEF, 2, "\x80\xBF\x80\xBF"],
            [0xF0, 0xF0, 3, "\x90\xBF\x80\xBF\x80\xBF"],
            [0xF1, 0xF3, 3, "\x80\xBF\x80\xBF\x80\xBF"],
            [0xF4, 0xF4, 3, "\x80\x8F\x80\xBF\x80\xBF"],
        ];

        foreach ($table as $row) {
            if ($row[0] <= $code && $code <= $row[1]) {
                return ['valid' => true, 'length' => $row[2], 'range' => $row[3]];
            }
        }

        // Any other byte (0x80-0xC1, 0xF5-0xFF) cannot start a sequence.
        return ['valid' => false, 'length' => 0, 'range' => ''];
    }
}
/**
 * Returns a copy of $str in which each ill-formed byte sequence reported by
 * get_lune_info() has been replaced by $substitute.
 *
 * @param string $str        Byte string to scan.
 * @param string $substitute Replacement emitted once per ill-formed sequence.
 *                           Defaults to the UTF-8 encoding of U+FFFD, which is
 *                           the value this function previously hard-coded.
 *
 * @return string
 */
function replace_invalid_byte_sequence($str, $substitute = "\xEF\xBF\xBD") {
    $size = strlen($str);
    $ret = '';
    $pos = 0;

    while ($pos < $size) {
        $lune_info = get_lune_info($str, $pos);

        // Keep well-formed sequences verbatim; collapse ill-formed ones to one substitute.
        $ret .= $lune_info['valid'] ? $lune_info['lune'] : $substitute;

        // 'size' is the number of bytes the sequence consumed.
        $pos += $lune_info['size'];
    }

    return $ret;
}
/**
 * Reads the byte sequence that starts at offset $pos and checks every byte
 * (lead byte included) against the ranges supplied by get_spec_info().
 *
 * @param string $str Byte string to read from.
 * @param int    $pos Offset of the lead byte.
 *
 * @return array Keys: 'valid' (bool), 'size' (int, number of bytes consumed)
 *               and 'lune' (string, the bytes that passed the range checks,
 *               or the raw slice when the lead byte itself is rejected).
 */
function get_lune_info($str, $pos) {
    $spec = get_spec_info($str[$pos]);
    $slice = substr($str, $pos, $spec['size']);

    // A lead byte that get_spec_info() rejects is reported as-is.
    if (!$spec['valid']) {
        return [
            'valid' => false,
            'size' => $spec['size'],
            'lune' => $slice,
        ];
    }

    $bounds = $spec['range'];
    $matched = '';
    $count = 0;

    // Stop at the first byte that is missing or outside its [low, high] pair.
    while ($count < $spec['size']) {
        if (!isset($slice[$count])) {
            break;
        }

        $byte = $slice[$count];
        if ($byte < $bounds[2 * $count] || $bounds[2 * $count + 1] < $byte) {
            break;
        }

        $matched .= $byte;
        $count += 1;
    }

    // Valid only when all bytes announced by the spec were matched.
    return [
        'valid' => $spec['size'] === $count,
        'size' => $count,
        'lune' => $matched,
    ];
}
/**
 * Describes the byte sequence announced by a lead byte.
 *
 * @param string $lead String whose first byte is the lead byte.
 *
 * @return array Keys: 'valid' (bool), 'size' (int, total number of bytes
 *               including the lead byte) and 'range' (string of low/high
 *               byte pairs, one pair per byte of the sequence).
 */
function get_spec_info($lead) {
    $code = ord($lead[0]);

    // Rows: [lowest lead byte, highest lead byte, sequence size, per-byte ranges].
    $table = [
        [0x00, 0x7F, 1, "\x00\x7F"],
        [0xC2, 0xDF, 2, "\xC2\xDF\x80\xBF"],
        [0xE0, 0xE0, 3, "\xE0\xE0\xA0\xBF\x80\xBF"],
        [0xE1, 0xEC, 3, "\xE1\xEC\x80\xBF\x80\xBF"],
        [0xED, 0xED, 3, "\xED\xED\x80\x9F\x80\xBF"],
        [0xEE, 0xEF, 3, "\xEE\xEF\x80\xBF\x80\xBF"],
        [0xF0, 0xF0, 4, "\xF0\xF0\x90\xBF\x80\xBF\x80\xBF"],
        [0xF1, 0xF3, 4, "\xF1\xF3\x80\xBF\x80\xBF\x80\xBF"],
        [0xF4, 0xF4, 4, "\xF4\xF4\x80\x8F\x80\xBF\x80\xBF"],
    ];

    foreach ($table as $row) {
        if ($row[0] <= $code && $code <= $row[1]) {
            return ['valid' => true, 'size' => $row[2], 'range' => $row[3]];
        }
    }

    // Any other byte (0x80-0xC1, 0xF5-0xFF) cannot start a sequence.
    return ['valid' => false, 'size' => 1, 'range' => ''];
}
/**
 * Scans a byte string as UTF-8 and swaps every ill-formed byte sequence for
 * a substitute string.
 */
class Escaper
{
    /**
     * Returns a copy of $str in which each ill-formed byte sequence has been
     * replaced by $substitute.
     *
     * @param string $str        Byte string to scan.
     * @param string $substitute Replacement emitted once per ill-formed
     *                           sequence (defaults to the UTF-8 encoding of U+FFFD).
     *
     * @return string
     */
    public function replaceInvalidByteSequence($str, $substitute = "\xEF\xBF\xBD")
    {
        $output = '';
        $total = strlen($str);

        // Each step advances by the number of bytes the last sequence consumed.
        for ($offset = 0; $offset < $total; $offset += $sequence['bytesize']) {
            $sequence = $this->getNextByteSequence($str, $offset);
            $output .= $sequence['valid'] ? $sequence['byte'] : $substitute;
        }

        return $output;
    }

    /**
     * Reads the byte sequence that starts at offset $pos.
     *
     * @param string $str Byte string to read from.
     * @param int    $pos Offset of the lead byte.
     *
     * @return array Keys: 'valid' (bool), 'bytesize' (int, the lead byte plus
     *               the trail bytes that were accepted) and 'byte' (string,
     *               the bytes that were consumed).
     */
    public function getNextByteSequence($str, $pos)
    {
        $lead = $str[$pos];
        $spec = $this->getTailSpec($lead);

        // A rejected lead byte or a single-byte character occupies exactly one byte.
        if (false === $spec['valid'] || 0 === $spec['length']) {
            return [
                'valid' => $spec['valid'],
                'bytesize' => 1,
                'byte' => $lead,
            ];
        }

        $expected = $spec['length'];
        $bounds = $spec['range'];
        $trail = substr($str, $pos + 1, $expected);
        $consumed = $lead;
        $accepted = 0;

        // Accept trail bytes until one is missing or falls outside its [low, high] pair.
        while ($accepted < $expected) {
            if (!isset($trail[$accepted])) {
                break;
            }

            $candidate = $trail[$accepted];
            if ($candidate < $bounds[2 * $accepted] || $bounds[2 * $accepted + 1] < $candidate) {
                break;
            }

            $consumed .= $candidate;
            $accepted += 1;
        }

        // The sequence is well-formed only when every expected trail byte was accepted.
        return [
            'valid' => $expected === $accepted,
            'bytesize' => 1 + $accepted,
            'byte' => $consumed,
        ];
    }

    /**
     * Describes the trail bytes that must follow a given lead byte.
     *
     * @param string $lead String whose first byte is the lead byte.
     *
     * @return array Keys: 'valid' (bool), 'length' (int, number of trail
     *               bytes) and 'range' (string of low/high byte pairs, one
     *               pair per trail byte).
     */
    public function getTailSpec($lead)
    {
        $code = ord($lead[0]);

        // Rows: [lowest lead byte, highest lead byte, trail length, trail ranges].
        $table = [
            [0x00, 0x7F, 0, ''],
            [0xC2, 0xDF, 1, "\x80\xBF"],
            [0xE0, 0xE0, 2, "\xA0\xBF\x80\xBF"],
            [0xE1, 0xEC, 2, "\x80\xBF\x80\xBF"],
            [0xED, 0xED, 2, "\x80\x9F\x80\xBF"],
            [0xEE, 0xEF, 2, "\x80\xBF\x80\xBF"],
            [0xF0, 0xF0, 3, "\x90\xBF\x80\xBF\x80\xBF"],
            [0xF1, 0xF3, 3, "\x80\xBF\x80\xBF\x80\xBF"],
            [0xF4, 0xF4, 3, "\x80\x8F\x80\xBF\x80\xBF"],
        ];

        foreach ($table as $row) {
            if ($row[0] <= $code && $code <= $row[1]) {
                return ['valid' => true, 'length' => $row[2], 'range' => $row[3]];
            }
        }

        // Any other byte (0x80-0xC1, 0xF5-0xFF) cannot start a sequence.
        return ['valid' => false, 'length' => 0, 'range' => ''];
    }
}
/**
 * Applies every callable to every argument.
 *
 * @param array $callables Callables to invoke; each receives one argument.
 * @param array $arguments Values passed, one at a time, to each callable.
 *
 * @return array One entry per callable (keys preserved), each holding that
 *               callable's results for all arguments (keys preserved).
 */
function run(array $callables, array $arguments)
{
    $results = [];

    foreach ($callables as $key => $callable) {
        $results[$key] = array_map($callable, $arguments);
    }

    return $results;
}
// Sample inputs containing ill-formed UTF-8 byte sequences.
$bytes = [
    // Table 3-8. Use of U+FFFD in UTF-8 Conversion
    // http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf
    "\x61" . "\xF1\x80\x80" . "\xE1\x80" . "\xC2" . "\x62" . "\x80" . "\x63" . "\x80" . "\xBF" . "\x64",
    // incomplete 3-byte characters
    "\xE0\xA0" . "\xE1\x80" . "\xED\x80",
    // incomplete 4-byte characters
    "\xF0\x90\x80" . "\xF0\x90" . "\xF1\x80\x80" . "\xF1\x80" . "\xF4\x80\x80" . "\xF4\x80",
];

// Dump the result of running each sample through Escaper::replaceInvalidByteSequence().
var_dump(
    run(
        [
            [new Escaper(), 'replaceInvalidByteSequence'],
        ],
        $bytes
    )
);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment