Last active
February 11, 2022 10:43
-
-
Save NewEXE/05c2cb337218d562133e9c715334972f to your computer and use it in GitHub Desktop.
Remove any whitespace character
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Strips every kind of whitespace from a string.
 *
 * The work is done in two passes: removeBasicWhitespace() runs first, and its
 * result is then handed to removeFunkyWhitespace().
 *
 * @param string $string Text to clean.
 * @return string What removeFunkyWhitespace() returns for the output of removeBasicWhitespace().
 */
function removeWhitespace(string $string): string
{
    return removeFunkyWhitespace(removeBasicWhitespace($string));
}
/**
 * Removes whitespace: spaces, tabs and newline characters, as well as
 * multibyte whitespace such as the thin space and ideographic space.
 *
 * The Unicode-aware pattern only works on well-formed UTF-8. For a subject it
 * cannot process, preg_replace() returns null; rather than letting that null
 * violate the declared string return type, the function falls back to removing
 * the six ASCII whitespace bytes and leaves every other byte untouched.
 *
 * @param string $string Text to clean; expected (but not required) to be UTF-8.
 * @return string The input without whitespace. For input PCRE rejects, only
 *                ASCII whitespace (and the Halfwidth Hangul Filler bytes) are removed.
 */
function removeBasicWhitespace(string $string): string
{
    // U+FFA0 "Halfwidth Hangul Filler" renders as a blank but has the general
    // category "letter", so [[:space:]] does not match it; strip it by its bytes.
    $string = \str_replace("\xef\xbe\xa0", '', $string);

    $stripped = \preg_replace('#[[:space:]]+#u', '', $string);
    if ($stripped !== null) {
        return $stripped;
    }

    // PCRE refused the subject (typically malformed UTF-8). Byte-wise removal of
    // tab, LF, VT, FF, CR and space cannot fail and never splits a multibyte sequence,
    // because all six are single-byte ASCII values.
    return \str_replace(["\x09", "\x0a", "\x0b", "\x0c", "\x0d", "\x20"], '', $string);
}
/**
 * Removes unprintable characters and invalid Unicode (malformed UTF-8) from a string.
 *
 * Step 1: if the input is not well-formed UTF-8, every byte that is not part of a
 * well-formed UTF-8 sequence is dropped. Without this step the Unicode-aware
 * pattern below cannot run at all and the whole input would be lost.
 *
 * Step 2: every character of the following categories is removed:
 * \p{C} or \p{Other}: invisible control characters and unused code points:
 * - \p{Cc} or \p{Control}: an ASCII or Latin-1 control character: 0x00–0x1F and 0x7F–0x9F.
 * - \p{Cf} or \p{Format}: invisible formatting indicator.
 * - \p{Co} or \p{Private_Use}: any code point reserved for private use.
 * - \p{Cs} or \p{Surrogate}: one half of a surrogate pair in UTF-16 encoding.
 * - \p{Cn} or \p{Unassigned}: any code point to which no character has been assigned.
 *
 * Result examples:
 * "some\0/path.txt" => "some/path.txt" ("\0" was removed)
 * "some\x00/path.txt" => "some/path.txt" ("\x00" was removed)
 * "s\ttring" => "string" ("\t" was removed)
 * "str\x09ing" => "string" ("\x09" was removed)
 * "some\xFF/path.txt" => "some/path.txt" (the stray byte "\xFF" was removed)
 *
 * @source https://github.com/thephpleague/flysystem/commit/a3c694de9f7e844b76f9d1b61296ebf6e8d89d74
 * @param string $string Text to clean.
 * @return string Well-formed UTF-8 without any \p{C} character.
 * @throws \RuntimeException If PCRE fails for a reason other than malformed input.
 */
function removeFunkyWhitespace(string $string): string
{
    // An empty pattern in "u" mode matches any well-formed UTF-8 subject, so any
    // result other than 1 means the subject contains malformed byte sequences.
    if (\preg_match('//u', $string) !== 1) {
        // Byte-mode (no "u" modifier) description of ONE well-formed UTF-8 sequence.
        // Overlong forms, UTF-16 surrogates and values above U+10FFFF are excluded.
        $wellFormedSequence = '[\x00-\x7F]'
            . '|[\xC2-\xDF][\x80-\xBF]'
            . '|\xE0[\xA0-\xBF][\x80-\xBF]'
            . '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'
            . '|\xED[\x80-\x9F][\x80-\xBF]'
            . '|\xF0[\x90-\xBF][\x80-\xBF]{2}'
            . '|[\xF1-\xF3][\x80-\xBF]{3}'
            . '|\xF4[\x80-\x8F][\x80-\xBF]{2}';

        // A well-formed sequence is captured and written back unchanged ($1); any
        // other single byte matches the "." branch, where $1 is empty, so it is dropped.
        // The result is a concatenation of well-formed sequences, hence valid UTF-8.
        $string = \preg_replace('#(' . $wellFormedSequence . ')|.#s', '$1', $string);
        if ($string === null) {
            throw new \RuntimeException(
                'Unable to strip malformed UTF-8 (PCRE error code ' . \preg_last_error() . ').'
            );
        }
    }

    // One global replacement is enough: it removes every \p{C} character, and
    // deleting characters cannot turn the remaining ones into \p{C} characters.
    $cleaned = \preg_replace('#\p{C}+#u', '', $string);
    if ($cleaned === null) {
        // Fail loudly instead of silently returning an empty string.
        throw new \RuntimeException(
            'Unable to remove unprintable characters (PCRE error code ' . \preg_last_error() . ').'
        );
    }

    return $cleaned;
}
/*
 * TESTING
 */
// Each case is [input, expected output, label].
// A list of tuples is used instead of an "input => expected" map because several
// inputs are byte-identical (e.g. "\x00" and "\x0", or "\xc2\xa0" and "\u{00A0}");
// as array keys they would silently overwrite one another.
$tests = [
    ["\u{00A0}", '', 'No-break space (in Unicode)'],
    ['     ', '', 'Many simple spaces'],
    ["\n\r", '', 'Line-break'],
    ["\x00", '', 'Null (in ASCII)'],
    ["\x04", '', 'End of transmission (in ASCII)'],
    ["\x0", '', 'NULL Byte'],
    ["\x9", '', 'Tab'],
    ["\xa", '', 'New Line'],
    ["\xb", '', 'Vertical Tab'],
    ["\xd", '', 'Carriage Return'],
    ["\x20", '', 'Ordinary Space'],
    ["\xc2\xa0", '', 'NO-BREAK SPACE'],
    ["\xe1\x9a\x80", '', 'OGHAM SPACE MARK'],
    ["\xe1\xa0\x8e", '', 'MONGOLIAN VOWEL SEPARATOR'],
    ["\xe2\x80\x80", '', 'EN QUAD'],
    ["\xe2\x80\x81", '', 'EM QUAD'],
    ["\xe2\x80\x82", '', 'EN SPACE'],
    ["\xe2\x80\x83", '', 'EM SPACE'],
    ["\xe2\x80\x84", '', 'THREE-PER-EM SPACE'],
    ["\xe2\x80\x85", '', 'FOUR-PER-EM SPACE'],
    ["\xe2\x80\x86", '', 'SIX-PER-EM SPACE'],
    ["\xe2\x80\x87", '', 'FIGURE SPACE'],
    ["\xe2\x80\x88", '', 'PUNCTUATION SPACE'],
    ["\xe2\x80\x89", '', 'THIN SPACE'],
    ["\xe2\x80\x8a", '', 'HAIR SPACE'],
    ["\xe2\x80\xa8", '', 'LINE SEPARATOR'],
    ["\xe2\x80\xa9", '', 'PARAGRAPH SEPARATOR'],
    ["\xe2\x80\xaf", '', 'NARROW NO-BREAK SPACE'],
    ["\xe2\x81\x9f", '', 'MEDIUM MATHEMATICAL SPACE'],
    ["\xef\xbe\xa0", '', 'HALFWIDTH HANGUL FILLER'],
    ["\u{FFA0}", '', 'HALFWIDTH HANGUL FILLER (in Unicode)'],
    ["\xe3\x80\x80", '', 'IDEOGRAPHIC SPACE'],
    ["some\0/path.txt", 'some/path.txt', 'NUL inside a path (octal escape)'],
    ["some\x00/path.txt", 'some/path.txt', 'NUL inside a path (hex escape)'],
    ["s\ttring", 'string', 'Tab inside a word (\t escape)'],
    ["str\x09ing", 'string', 'Tab inside a word (hex escape)'],
];

var_dump('Running test cases...');
echo "<br />\n";

foreach ($tests as list($input, $expectedOutput, $label)) {
    $passed = removeWhitespace($input) === $expectedOutput;

    echo 'Passed: ';
    echo $passed ? 'true' : 'FALSE';
    if (!$passed) {
        // Name the failing case so it can be found without counting output lines.
        echo ' (' . $label . ')';
    }
    echo "<br />\n";
}

var_dump('Done');
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment