Created
December 8, 2012 15:52
-
-
Save masakielastic/4240816 to your computer and use it in GitHub Desktop.
Refactored and clarified the logic for checking the lead byte and the trail bytes.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Replaces ill-formed UTF-8 byte sequences in a byte string.
 *
 * The string is scanned one sequence at a time. A sequence is a lead byte
 * followed by as many trail bytes as satisfy the per-position ranges that
 * getTailSpec() returns for that lead byte. A sequence that stops early
 * (truncated input or an out-of-range trail byte) is replaced as a whole by
 * a single substitute, and scanning resumes at the offending byte.
 */
class Escaper
{
    /**
     * Returns a copy of $str in which every invalid byte sequence has been
     * replaced by $substitute.
     *
     * @param string $str        Byte string to scan.
     * @param string $substitute Replacement for each invalid sequence
     *                           (defaults to the UTF-8 encoding of U+FFFD).
     *
     * @return string
     */
    public function replaceInvalidByteSequence($str, $substitute = "\xEF\xBF\xBD")
    {
        $ret = '';
        $size = strlen($str);
        $pos = 0;

        while ($pos < $size) {
            $seq = $this->getNextByteSequence($str, $pos);
            $ret .= $seq['valid'] ? $seq['byte'] : $substitute;
            $pos += $seq['bytesize'];
        }

        return $ret;
    }

    /**
     * Reads the byte sequence that starts at offset $pos.
     *
     * A negative $pos counts from the end of the string, as PHP string
     * offsets do. When $pos lies outside the string there is nothing to
     * read and the result is an invalid sequence of size 0.
     *
     * @param string $str Byte string to read from.
     * @param int    $pos Offset of the lead byte.
     *
     * @return array 'valid'    => bool, whether the sequence is well formed,
     *               'bytesize' => int, the lead byte plus the accepted trail bytes,
     *               'byte'     => string, the bytes that were consumed.
     */
    public function getNextByteSequence($str, $pos)
    {
        $size = strlen($str);

        // Normalise a negative offset so that the trail bytes are read from
        // $pos + 1 and not from the start of the string when $pos is -1.
        if ($pos < 0) {
            $pos += $size;
        }

        if ($pos < 0 || $pos >= $size) {
            return ['valid' => false, 'bytesize' => 0, 'byte' => ''];
        }

        $lead = $str[$pos];
        $spec = $this->getTailSpec($lead);

        // A single-byte sequence, or a byte that cannot start a sequence.
        if (0 === $spec['length'] || false === $spec['valid']) {
            return ['valid' => $spec['valid'], 'bytesize' => 1, 'byte' => $lead];
        }

        $ret = ['valid' => true, 'bytesize' => 1, 'byte' => $lead];
        $tail = substr($str, $pos + 1, $spec['length']);

        // Accept trail bytes until one is missing or outside its range.
        // 'range' stores a (low, high) pair of bytes per trail position.
        for ($len = 0; $len < $spec['length']; $len += 1) {
            if (isset($tail[$len])
                && $spec['range'][2 * $len] <= $tail[$len]
                && $tail[$len] <= $spec['range'][2 * $len + 1]) {
                $ret['byte'] .= $tail[$len];
            } else {
                // Leaving the loop early is the only way $len can end up
                // smaller than the expected length.
                $ret['valid'] = false;
                break;
            }
        }

        // Only the accepted trail bytes are consumed; the offending byte is
        // examined again as the start of the next sequence.
        $ret['bytesize'] += $len;

        return $ret;
    }

    /**
     * Describes the trail bytes expected after a lead byte.
     *
     * @param string $lead String whose first byte is the lead byte.
     *
     * @return array 'valid'  => bool, false when the byte cannot start a sequence
     *                           (or when $lead has no first byte),
     *               'length' => int, number of trail bytes expected,
     *               'range'  => string, two bytes (low, high) per trail byte.
     */
    public function getTailSpec($lead)
    {
        // No first byte to classify: report it as invalid instead of reading
        // an uninitialised offset.
        if (!isset($lead[0])) {
            return ['valid' => false, 'length' => 0, 'range' => ''];
        }

        $byte = $lead[0];

        if ($byte <= "\x7F") {
            return ['valid' => true, 'length' => 0, 'range' => ''];
        } elseif ("\xC2" <= $byte && $byte <= "\xDF") {
            return ['valid' => true, 'length' => 1, 'range' => "\x80\xBF"];
        } elseif ("\xE0" === $byte) {
            return ['valid' => true, 'length' => 2, 'range' => "\xA0\xBF\x80\xBF"];
        } elseif (("\xE1" <= $byte && $byte <= "\xEC")
            || "\xEE" === $byte
            || "\xEF" === $byte) {
            return ['valid' => true, 'length' => 2, 'range' => "\x80\xBF\x80\xBF"];
        } elseif ("\xED" === $byte) {
            return ['valid' => true, 'length' => 2, 'range' => "\x80\x9F\x80\xBF"];
        } elseif ("\xF0" === $byte) {
            return ['valid' => true, 'length' => 3, 'range' => "\x90\xBF\x80\xBF\x80\xBF"];
        } elseif ("\xF1" <= $byte && $byte <= "\xF3") {
            return ['valid' => true, 'length' => 3, 'range' => "\x80\xBF\x80\xBF\x80\xBF"];
        } elseif ("\xF4" === $byte) {
            return ['valid' => true, 'length' => 3, 'range' => "\x80\x8F\x80\xBF\x80\xBF"];
        }

        // 0x80-0xC1 and 0xF5-0xFF fall through to here.
        return ['valid' => false, 'length' => 0, 'range' => ''];
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Returns a copy of $str in which every invalid byte sequence reported by
 * get_lune_info() has been replaced by $substitute.
 *
 * @param string $str        Byte string to scan.
 * @param string $substitute Replacement for each invalid sequence
 *                           (defaults to the UTF-8 encoding of U+FFFD).
 *
 * @return string
 */
function replace_invalid_byte_sequence($str, $substitute = "\xEF\xBF\xBD") {
    $size = strlen($str);
    $ret = '';
    $pos = 0;

    while ($pos < $size) {
        $lune_info = get_lune_info($str, $pos);
        $ret .= $lune_info['valid'] ? $lune_info['lune'] : $substitute;
        // Skip exactly the bytes that get_lune_info() consumed.
        $pos += $lune_info['size'];
    }

    return $ret;
}
/**
 * Reads the byte sequence that starts at offset $pos.
 *
 * Every byte of the candidate sequence, the lead byte included, is checked
 * against the (low, high) pair that get_spec_info() supplies for its
 * position. Checking stops at the first byte that is missing or out of range.
 *
 * @param string $str Byte string to read from.
 * @param int    $pos Offset of the lead byte.
 *
 * @return array 'valid' => bool, whether the whole sequence was accepted,
 *               'size'  => int, number of bytes consumed (0 when $pos is
 *                          not an existing offset of $str),
 *               'lune'  => string, the accepted bytes, or the raw lead byte
 *                          when it cannot start a sequence.
 */
function get_lune_info($str, $pos) {
    // Nothing to read at this offset; avoid touching an uninitialised
    // string offset.
    if (!isset($str[$pos])) {
        return ['valid' => false, 'size' => 0, 'lune' => ''];
    }

    $spec = get_spec_info($str[$pos]);
    $char = substr($str, $pos, $spec['size']);

    if (!$spec['valid']) {
        return ['valid' => false,
                'size' => $spec['size'],
                'lune' => $char];
    }

    $ret = ['valid' => true, 'size' => 0, 'lune' => ''];

    for ($i = 0; $i < $spec['size']; $i += 1) {
        if (isset($char[$i])
            && $spec['range'][2 * $i] <= $char[$i]
            && $char[$i] <= $spec['range'][2 * $i + 1]) {
            $ret['lune'] .= $char[$i];
        } else {
            // Leaving the loop early is the only way $i can end up smaller
            // than the expected size.
            $ret['valid'] = false;
            break;
        }
    }

    // Only the accepted bytes are consumed; the offending byte becomes the
    // start of the next sequence.
    $ret['size'] = $i;

    return $ret;
}
/**
 * Describes the byte sequence introduced by a lead byte.
 *
 * @param string $lead String whose first byte is the lead byte.
 *
 * @return array 'valid' => bool, false when the byte cannot start a sequence
 *                          (or when $lead has no first byte),
 *               'size'  => int, total number of bytes in the sequence,
 *               'range' => string, two bytes (low, high) per position,
 *                          starting with the lead byte itself.
 */
function get_spec_info($lead) {
    // No first byte to classify: report it as invalid instead of reading an
    // uninitialised offset.
    if (!isset($lead[0])) {
        return ['valid' => false, 'size' => 1, 'range' => ''];
    }

    $byte = $lead[0];

    if ($byte <= "\x7F") {
        return ['valid' => true, 'size' => 1,
                'range' => "\x00\x7F"];
    } elseif ("\xC2" <= $byte && $byte <= "\xDF") {
        return ['valid' => true, 'size' => 2,
                'range' => "\xC2\xDF\x80\xBF"];
    } elseif ("\xE0" === $byte) {
        return ['valid' => true, 'size' => 3,
                'range' => "\xE0\xE0\xA0\xBF\x80\xBF"];
    } elseif ("\xE1" <= $byte && $byte <= "\xEC") {
        return ['valid' => true, 'size' => 3,
                'range' => "\xE1\xEC\x80\xBF\x80\xBF"];
    } elseif ("\xED" === $byte) {
        return ['valid' => true, 'size' => 3,
                'range' => "\xED\xED\x80\x9F\x80\xBF"];
    } elseif ("\xEE" <= $byte && $byte <= "\xEF") {
        return ['valid' => true, 'size' => 3,
                'range' => "\xEE\xEF\x80\xBF\x80\xBF"];
    } elseif ("\xF0" === $byte) {
        return ['valid' => true, 'size' => 4,
                'range' => "\xF0\xF0\x90\xBF\x80\xBF\x80\xBF"];
    } elseif ("\xF1" <= $byte && $byte <= "\xF3") {
        return ['valid' => true, 'size' => 4,
                'range' => "\xF1\xF3\x80\xBF\x80\xBF\x80\xBF"];
    } elseif ("\xF4" === $byte) {
        return ['valid' => true, 'size' => 4,
                'range' => "\xF4\xF4\x80\x8F\x80\xBF\x80\xBF"];
    }

    // 0x80-0xC1 and 0xF5-0xFF fall through to here.
    return ['valid' => false, 'size' => 1, 'range' => ''];
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Replaces ill-formed UTF-8 byte sequences in a byte string.
 *
 * The string is scanned one sequence at a time. A sequence is a lead byte
 * followed by as many trail bytes as satisfy the per-position ranges that
 * getTailSpec() returns for that lead byte. A sequence that stops early
 * (truncated input or an out-of-range trail byte) is replaced as a whole by
 * a single substitute, and scanning resumes at the offending byte.
 */
class Escaper
{
    /**
     * Returns a copy of $str in which every invalid byte sequence has been
     * replaced by $substitute.
     *
     * @param string $str        Byte string to scan.
     * @param string $substitute Replacement for each invalid sequence
     *                           (defaults to the UTF-8 encoding of U+FFFD).
     *
     * @return string
     */
    public function replaceInvalidByteSequence($str, $substitute = "\xEF\xBF\xBD")
    {
        $ret = '';
        $size = strlen($str);
        $pos = 0;

        while ($pos < $size) {
            $seq = $this->getNextByteSequence($str, $pos);
            $ret .= $seq['valid'] ? $seq['byte'] : $substitute;
            $pos += $seq['bytesize'];
        }

        return $ret;
    }

    /**
     * Reads the byte sequence that starts at offset $pos.
     *
     * A negative $pos counts from the end of the string, as PHP string
     * offsets do. When $pos lies outside the string there is nothing to
     * read and the result is an invalid sequence of size 0.
     *
     * @param string $str Byte string to read from.
     * @param int    $pos Offset of the lead byte.
     *
     * @return array 'valid'    => bool, whether the sequence is well formed,
     *               'bytesize' => int, the lead byte plus the accepted trail bytes,
     *               'byte'     => string, the bytes that were consumed.
     */
    public function getNextByteSequence($str, $pos)
    {
        $size = strlen($str);

        // Normalise a negative offset so that the trail bytes are read from
        // $pos + 1 and not from the start of the string when $pos is -1.
        if ($pos < 0) {
            $pos += $size;
        }

        if ($pos < 0 || $pos >= $size) {
            return ['valid' => false, 'bytesize' => 0, 'byte' => ''];
        }

        $lead = $str[$pos];
        $spec = $this->getTailSpec($lead);

        // A single-byte sequence, or a byte that cannot start a sequence.
        if (0 === $spec['length'] || false === $spec['valid']) {
            return ['valid' => $spec['valid'], 'bytesize' => 1, 'byte' => $lead];
        }

        $ret = ['valid' => true, 'bytesize' => 1, 'byte' => $lead];
        $tail = substr($str, $pos + 1, $spec['length']);

        // Accept trail bytes until one is missing or outside its range.
        // 'range' stores a (low, high) pair of bytes per trail position.
        for ($len = 0; $len < $spec['length']; $len += 1) {
            if (isset($tail[$len])
                && $spec['range'][2 * $len] <= $tail[$len]
                && $tail[$len] <= $spec['range'][2 * $len + 1]) {
                $ret['byte'] .= $tail[$len];
            } else {
                // Leaving the loop early is the only way $len can end up
                // smaller than the expected length.
                $ret['valid'] = false;
                break;
            }
        }

        // Only the accepted trail bytes are consumed; the offending byte is
        // examined again as the start of the next sequence.
        $ret['bytesize'] += $len;

        return $ret;
    }

    /**
     * Describes the trail bytes expected after a lead byte.
     *
     * @param string $lead String whose first byte is the lead byte.
     *
     * @return array 'valid'  => bool, false when the byte cannot start a sequence
     *                           (or when $lead has no first byte),
     *               'length' => int, number of trail bytes expected,
     *               'range'  => string, two bytes (low, high) per trail byte.
     */
    public function getTailSpec($lead)
    {
        // No first byte to classify: report it as invalid instead of reading
        // an uninitialised offset.
        if (!isset($lead[0])) {
            return ['valid' => false, 'length' => 0, 'range' => ''];
        }

        $byte = $lead[0];

        if ($byte <= "\x7F") {
            return ['valid' => true, 'length' => 0, 'range' => ''];
        } elseif ("\xC2" <= $byte && $byte <= "\xDF") {
            return ['valid' => true, 'length' => 1, 'range' => "\x80\xBF"];
        } elseif ("\xE0" === $byte) {
            return ['valid' => true, 'length' => 2, 'range' => "\xA0\xBF\x80\xBF"];
        } elseif (("\xE1" <= $byte && $byte <= "\xEC")
            || "\xEE" === $byte
            || "\xEF" === $byte) {
            return ['valid' => true, 'length' => 2, 'range' => "\x80\xBF\x80\xBF"];
        } elseif ("\xED" === $byte) {
            return ['valid' => true, 'length' => 2, 'range' => "\x80\x9F\x80\xBF"];
        } elseif ("\xF0" === $byte) {
            return ['valid' => true, 'length' => 3, 'range' => "\x90\xBF\x80\xBF\x80\xBF"];
        } elseif ("\xF1" <= $byte && $byte <= "\xF3") {
            return ['valid' => true, 'length' => 3, 'range' => "\x80\xBF\x80\xBF\x80\xBF"];
        } elseif ("\xF4" === $byte) {
            return ['valid' => true, 'length' => 3, 'range' => "\x80\x8F\x80\xBF\x80\xBF"];
        }

        // 0x80-0xC1 and 0xF5-0xFF fall through to here.
        return ['valid' => false, 'length' => 0, 'range' => ''];
    }
}
/**
 * Applies every callable to every argument.
 *
 * @param array $callables Callables to invoke; keys are kept in the result.
 * @param array $arguments Values passed, one at a time, to each callable.
 *
 * @return array One entry per callable, each holding that callable's result
 *               for every argument (as produced by array_map()).
 */
function run(array $callables, array $arguments)
{
    $results = [];

    foreach ($callables as $key => $callable) {
        $results[$key] = array_map($callable, $arguments);
    }

    return $results;
}
// Sample byte strings containing ill-formed sequences. Each literal is split
// at the boundaries between the individual sequences it is built from.
$bytes = [
    // Table 3-8. Use of U+FFFD in UTF-8 Conversion
    // http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf
    "\x61"."\xF1\x80\x80"."\xE1\x80"."\xC2"."\x62"."\x80"."\x63"."\x80"."\xBF"."\x64",
    // incomplete 3-byte characters
    "\xE0\xA0"."\xE1\x80"."\xED\x80",
    // incomplete 4-byte characters
    "\xF0\x90\x80"."\xF0\x90"."\xF1\x80\x80"."\xF1\x80"."\xF4\x80\x80"."\xF4\x80"
];

// Run each sample through the escaper and dump the results.
$callables = [
    [new Escaper, 'replaceInvalidByteSequence']
];

var_dump(run($callables, $bytes));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment