Last active
July 12, 2017 07:59
-
-
Save masakielastic/5793665 to your computer and use it in GitHub Desktop.
mb_convert_encoding breaks a well-formed character (PHP 5.5 RC3, Mac OS X 10.8).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Compares how UConverter::transcode(), htmlspecialchars(ENT_SUBSTITUTE) and
// mb_convert_encoding() handle ill-formed UTF-8 built from a 2-byte character.
// Byte sequences are taken from https://en.wikipedia.org/wiki/UTF-8#Examples
// 2-byte character: U+00A2 (CENT SIGN), encoded as "\xC2\xA2".

// Ask mbstring to emit U+FFFD (REPLACEMENT CHARACTER) for invalid input.
mb_substitute_character(0xFFFD);

// Inputs: two well-formed CENT SIGNs followed by an ill-formed tail.
$data = [
    // ill-formed: lead byte whose continuation byte is missing
    "\xC2\xA2"."\xC2\xA2"."\xC2",
    // ill-formed: continuation byte with no lead byte
    "\xC2\xA2"."\xC2\xA2"."\xA2",
];

// Result expected from UConverter::transcode() and htmlspecialchars():
// the ill-formed tail becomes a single U+FFFD ("\xEF\xBF\xBD").
$expected = [
    // U+FFFD
    "\xC2\xA2"."\xC2\xA2"."\xEF\xBF\xBD",
    // U+FFFD
    "\xC2\xA2"."\xC2\xA2"."\xEF\xBF\xBD",
];

// Result expected from mb_convert_encoding() (gist was written against
// PHP 5.5RC3 on Mac OS X 10.8).
$expected2 = [
    // no U+FFFD: the truncated trailing byte is simply dropped
    "\xC2\xA2"."\xC2\xA2",
    // U+FFFD
    "\xC2\xA2"."\xC2\xA2"."\xEF\xBF\xBD",
];

var_dump(
    '2-byte character: U+00A2 (CENT SIGN)',
    // One boolean per converter: does its output match the expected table?
    [
        $expected === array_map(function ($str) {
            return UConverter::transcode($str, 'UTF-8', 'UTF-8');
        }, $data),
        $expected === array_map(function ($str) {
            return htmlspecialchars($str, ENT_SUBSTITUTE, 'UTF-8');
        }, $data),
        $expected2 === array_map(function ($str) {
            return mb_convert_encoding($str, 'UTF-8', 'UTF-8');
        }, $data)
    ],
    // Where the two expected tables differ and where they agree.
    [
        // mb_convert_encoding drops the trailing byte instead of replacing it with U+FFFD
        $expected[0] !== $expected2[0],
        $expected[1] === $expected2[1]
    ]
);
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Compares how UConverter::transcode(), htmlspecialchars(ENT_SUBSTITUTE) and
// mb_convert_encoding() handle ill-formed UTF-8 built from a 3-byte character.
// Byte sequences are taken from https://en.wikipedia.org/wiki/UTF-8#Examples
// 3-byte character: U+20AC (EURO SIGN), encoded as "\xE2\x82\xAC".

// Ask mbstring to emit U+FFFD (REPLACEMENT CHARACTER) for invalid input.
mb_substitute_character(0xFFFD);

// Inputs: two well-formed EURO SIGNs with an ill-formed fragment placed
// either before them (cases 0-1) or after them (cases 2-3).
$data2 = [
    // ill-formed: leading sequence truncated to its first two bytes
    "\xE2\x82"."\xE2\x82\xAC"."\xE2\x82\xAC",
    // ill-formed: leading continuation bytes with no lead byte
    "\x82\xAC"."\xE2\x82\xAC"."\xE2\x82\xAC",
    // ill-formed: trailing sequence truncated to its first two bytes
    "\xE2\x82\xAC"."\xE2\x82\xAC"."\xE2\x82",
    // ill-formed: trailing continuation bytes with no lead byte
    "\xE2\x82\xAC"."\xE2\x82\xAC"."\x82\xAC",
];

// Result expected from UConverter::transcode() and htmlspecialchars().
$expected3 = [
    // U+FFFD
    "\xEF\xBF\xBD"."\xE2\x82\xAC"."\xE2\x82\xAC",
    // U+FFFD U+FFFD
    "\xEF\xBF\xBD"."\xEF\xBF\xBD"."\xE2\x82\xAC"."\xE2\x82\xAC",
    // U+FFFD
    "\xE2\x82\xAC"."\xE2\x82\xAC"."\xEF\xBF\xBD",
    // U+FFFD U+FFFD
    "\xE2\x82\xAC"."\xE2\x82\xAC"."\xEF\xBF\xBD"."\xEF\xBF\xBD",
];

// Result expected from mb_convert_encoding() (gist was written against
// PHP 5.5RC3 on Mac OS X 10.8).
$expected4 = [
    // U+FFFD U+FFFD U+FFFD, and only one EURO SIGN survives
    "\xEF\xBF\xBD"."\xEF\xBF\xBD"."\xEF\xBF\xBD"."\xE2\x82\xAC",
    // U+FFFD U+FFFD
    "\xEF\xBF\xBD"."\xEF\xBF\xBD"."\xE2\x82\xAC"."\xE2\x82\xAC",
    // no U+FFFD: the truncated trailing bytes are simply dropped
    "\xE2\x82\xAC"."\xE2\x82\xAC",
    // U+FFFD U+FFFD
    "\xE2\x82\xAC"."\xE2\x82\xAC"."\xEF\xBF\xBD"."\xEF\xBF\xBD",
];

var_dump(
    '3-byte character: U+20AC (EURO SIGN)',
    // One boolean per converter: does its output match the expected table?
    [
        $expected3 === array_map(function ($str) {
            return UConverter::transcode($str, 'UTF-8', 'UTF-8');
        }, $data2),
        $expected3 === array_map(function ($str) {
            return htmlspecialchars($str, ENT_SUBSTITUTE, 'UTF-8');
        }, $data2),
        $expected4 === array_map(function ($str) {
            return mb_convert_encoding($str, 'UTF-8', 'UTF-8');
        }, $data2)
    ],
    // Where the two expected tables differ and where they agree.
    [
        // mb_convert_encoding breaks a well-formed character
        $expected3[0] !== $expected4[0],
        $expected3[1] === $expected4[1],
        // mb_convert_encoding drops trailing bytes instead of replacing them with U+FFFD
        $expected3[2] !== $expected4[2],
        $expected3[3] === $expected4[3]
    ]
);
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Compares how UConverter::transcode(), htmlspecialchars(ENT_SUBSTITUTE) and
// mb_convert_encoding() handle ill-formed UTF-8 built from a 4-byte character.
// Byte sequences are taken from https://en.wikipedia.org/wiki/UTF-8#Examples
// 4-byte character: U+24B62 (Unicode Han Character), encoded as "\xF0\xA4\xAD\xA2".

// Ask mbstring to emit U+FFFD (REPLACEMENT CHARACTER) for invalid input.
mb_substitute_character(0xFFFD);

// Inputs: two well-formed U+24B62 characters with an ill-formed fragment
// placed either before them (cases 0-1) or after them (cases 2-3).
$data3 = [
    // ill-formed: leading sequence truncated to its first three bytes
    "\xF0\xA4\xAD"."\xF0\xA4\xAD\xA2"."\xF0\xA4\xAD\xA2",
    // ill-formed: leading continuation bytes with no lead byte
    "\xA4\xAD\xA2"."\xF0\xA4\xAD\xA2"."\xF0\xA4\xAD\xA2",
    // ill-formed: trailing sequence truncated to its first three bytes
    "\xF0\xA4\xAD\xA2"."\xF0\xA4\xAD\xA2"."\xF0\xA4\xAD",
    // ill-formed: trailing continuation bytes with no lead byte
    "\xF0\xA4\xAD\xA2"."\xF0\xA4\xAD\xA2"."\xA4\xAD\xA2",
];

// Result expected from UConverter::transcode() and htmlspecialchars().
$expected5 = [
    // U+FFFD
    "\xEF\xBF\xBD"."\xF0\xA4\xAD\xA2"."\xF0\xA4\xAD\xA2",
    // U+FFFD U+FFFD U+FFFD
    "\xEF\xBF\xBD"."\xEF\xBF\xBD"."\xEF\xBF\xBD"."\xF0\xA4\xAD\xA2"."\xF0\xA4\xAD\xA2",
    // U+FFFD
    "\xF0\xA4\xAD\xA2"."\xF0\xA4\xAD\xA2"."\xEF\xBF\xBD",
    // U+FFFD U+FFFD U+FFFD
    "\xF0\xA4\xAD\xA2"."\xF0\xA4\xAD\xA2"."\xEF\xBF\xBD"."\xEF\xBF\xBD"."\xEF\xBF\xBD",
];

// Result expected from mb_convert_encoding() (gist was written against
// PHP 5.5RC3 on Mac OS X 10.8).
$expected6 = [
    // U+FFFD U+FFFD U+FFFD U+FFFD, and only one U+24B62 survives
    "\xEF\xBF\xBD"."\xEF\xBF\xBD"."\xEF\xBF\xBD"."\xEF\xBF\xBD"."\xF0\xA4\xAD\xA2",
    // U+FFFD U+FFFD U+FFFD
    "\xEF\xBF\xBD"."\xEF\xBF\xBD"."\xEF\xBF\xBD"."\xF0\xA4\xAD\xA2"."\xF0\xA4\xAD\xA2",
    // no U+FFFD: the truncated trailing bytes are simply dropped
    "\xF0\xA4\xAD\xA2"."\xF0\xA4\xAD\xA2",
    // U+FFFD U+FFFD U+FFFD
    "\xF0\xA4\xAD\xA2"."\xF0\xA4\xAD\xA2"."\xEF\xBF\xBD"."\xEF\xBF\xBD"."\xEF\xBF\xBD",
];

var_dump(
    '4-byte character: U+24B62 (Unicode Han Character)',
    // One boolean per converter: does its output match the expected table?
    [
        $expected5 === array_map(function ($str) {
            return UConverter::transcode($str, 'UTF-8', 'UTF-8');
        }, $data3),
        $expected5 === array_map(function ($str) {
            return htmlspecialchars($str, ENT_SUBSTITUTE, 'UTF-8');
        }, $data3),
        $expected6 === array_map(function ($str) {
            return mb_convert_encoding($str, 'UTF-8', 'UTF-8');
        }, $data3)
    ],
    // Where the two expected tables differ and where they agree.
    [
        // mb_convert_encoding breaks a well-formed character
        $expected5[0] !== $expected6[0],
        $expected5[1] === $expected6[1],
        // mb_convert_encoding drops trailing bytes instead of replacing them with U+FFFD
        $expected5[2] !== $expected6[2],
        $expected5[3] === $expected6[3]
    ]
);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment