NewEXE · February 11, 2022 10:43
diff --git a/remove-any-whitespace.php b/remove-any-whitespace.php
 <?php

 /**
 * Strips every kind of whitespace from a string in two passes.
 *
 * The input is first handed to removeBasicWhitespace() (ordinary and
 * multibyte whitespace: tabs, newlines, thin space, ideographic space, ...)
 * and the result of that pass is then handed to removeFunkyWhitespace()
 * (unprintable characters of the Unicode "Other" category).
 *
 * @param string $string Text to clean.
 * @return string The text after both passes.
 */
 function removeWhitespace(string $string): string
 {
    return removeFunkyWhitespace(removeBasicWhitespace($string));
 }

 /**
 * Removes whitespace: spaces, tabs and newline characters, as well as
 * multibyte whitespace such as the thin space and ideographic space.
 *
 * The HALFWIDTH HANGUL FILLER (U+FFA0) renders as a blank but is not matched
 * by the whitespace character class, so it is removed explicitly.
 *
 * If the Unicode-aware replacement fails (preg_replace() returns null, which
 * happens when the input is not well-formed UTF-8), the function falls back
 * to removing the ASCII whitespace bytes only and leaves every other byte
 * untouched, instead of failing the string return type with a TypeError.
 *
 * @param string $string Text to clean.
 * @return string The text without whitespace.
 */
 function removeBasicWhitespace(string $string): string
 {
    // Hack for "Halfwidth Hangul Filler"
    $string = \str_replace("\xef\xbe\xa0", '', $string);

    $stripped = \preg_replace('#[[:space:]]+#u', '', $string);
    if ($stripped !== null) {
        return $stripped;
    }

    // The "u" modifier rejects malformed UTF-8 subjects. ASCII bytes never
    // occur inside a multibyte UTF-8 sequence, so removing them byte-wise
    // cannot damage any well-formed character that is present.
    return \str_replace([' ', "\t", "\n", "\r", "\v", "\f"], '', $string);
 }

 /**
 * Remove unprintable characters and invalid unicode characters.
 *
 * Bytes that are not part of a well-formed UTF-8 sequence are dropped first;
 * every other byte is kept. Without that step a single malformed byte would
 * make the Unicode-aware replacement below fail and the whole string would
 * come back empty.
 *
 * Afterwards any of the following is removed:
 * \p{C} or \p{Other}: invisible control characters and unused code points:
 * - \p{Cc} or \p{Control}: an ASCII or Latin-1 control character: 0x00–0x1F and 0x7F–0x9F.
 * - \p{Cf} or \p{Format}: invisible formatting indicator.
 * - \p{Co} or \p{Private_Use}: any code point reserved for private use.
 * - \p{Cs} or \p{Surrogate}: one half of a surrogate pair in UTF-16 encoding.
 * - \p{Cn} or \p{Unassigned}: any code point to which no character has been assigned.
 *
 * Note that the ordinary space (0x20) is not in \p{C} and is left in place.
 *
 * Result examples:
 * "some\0/path.txt"    => "some/path.txt"  ("\0" was removed)
 * "some\x00/path.txt"  => "some/path.txt"  ("\x00" was removed)
 * "s\ttring"           => "string"         ("\t" was removed)
 * "str\x09ing"         => "string"         ("\x09" was removed)
 * "ab\xffcd"           => "abcd"           (malformed byte "\xff" was removed)
 *
 * @source https://github.com/thephpleague/flysystem/commit/a3c694de9f7e844b76f9d1b61296ebf6e8d89d74
 * @param string $string Text to clean.
 * @return string The text without malformed bytes and without \p{C} characters.
 */
 function removeFunkyWhitespace(string $string): string
 {
    // Exactly one well-formed UTF-8 sequence, described byte-wise so it can
    // be matched without the "u" modifier (which refuses malformed input).
    $wellFormed = '[\x00-\x7F]'
        . '|[\xC2-\xDF][\x80-\xBF]'
        . '|\xE0[\xA0-\xBF][\x80-\xBF]'
        . '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'
        . '|\xED[\x80-\x9F][\x80-\xBF]'
        . '|\xF0[\x90-\xBF][\x80-\xBF]{2}'
        . '|[\xF1-\xF3][\x80-\xBF]{3}'
        . '|\xF4[\x80-\x8F][\x80-\xBF]{2}';

    // Keep every well-formed sequence ($1); any other single byte matches
    // the "." branch, where $1 is unset and therefore expands to nothing.
    $scrubbed = \preg_replace('#(' . $wellFormed . ')|.#s', '$1', $string);
    if ($scrubbed !== null) {
        $string = $scrubbed;
    }

    // Repeat until a pass removes nothing, so the result is guaranteed to
    // contain no \p{C} character.
    do {
        $string = (string) \preg_replace('#\p{C}+#u', '', $string, -1, $count);
    } while ($count !== 0);

    return $string;
 }

 /*
 * TESTING
 *
 * Each case is an [input, expected output] pair. The cases are deliberately
 * NOT stored as an "input => expected" map: several inputs below are the same
 * bytes written with different escapes (e.g. "\u{00A0}" and "\xc2\xa0", or
 * "\x00" and "\x0"), and as array keys they would overwrite each other so
 * that some cases would silently never run.
 */

 $tests = [
    // [input, expected output]
    ["\u{00A0}",          ''], // No-break space (in Unicode)
    ['   ',               ''], // Many simple spaces
    ["\n\r",              ''], // Line-break
    ["\x00",              ''], // Null (in ASCII)
    ["\x04",              ''], // End of transmission (in ASCII)
    ["\x0",               ''], // NULL Byte
    ["\x9",               ''], // Tab
    ["\xa",               ''], // New Line
    ["\xb",               ''], // Vertical Tab
    ["\xd",               ''], // Carriage Return
    ["\x20",              ''], // Ordinary Space
    ["\xc2\xa0",          ''], // NO-BREAK SPACE
    ["\xe1\x9a\x80",      ''], // OGHAM SPACE MARK
    ["\xe1\xa0\x8e",      ''], // MONGOLIAN VOWEL SEPARATOR
    ["\xe2\x80\x80",      ''], // EN QUAD
    ["\xe2\x80\x81",      ''], // EM QUAD
    ["\xe2\x80\x82",      ''], // EN SPACE
    ["\xe2\x80\x83",      ''], // EM SPACE
    ["\xe2\x80\x84",      ''], // THREE-PER-EM SPACE
    ["\xe2\x80\x85",      ''], // FOUR-PER-EM SPACE
    ["\xe2\x80\x86",      ''], // SIX-PER-EM SPACE
    ["\xe2\x80\x87",      ''], // FIGURE SPACE
    ["\xe2\x80\x88",      ''], // PUNCTUATION SPACE
    ["\xe2\x80\x89",      ''], // THIN SPACE
    ["\xe2\x80\x8a",      ''], // HAIR SPACE
    ["\xe2\x80\xa8",      ''], // LINE SEPARATOR
    ["\xe2\x80\xa9",      ''], // PARAGRAPH SEPARATOR
    ["\xe2\x80\xaf",      ''], // NARROW NO-BREAK SPACE
    ["\xe2\x81\x9f",      ''], // MEDIUM MATHEMATICAL SPACE
    ["\xef\xbe\xa0",      ''], // HALFWIDTH HANGUL FILLER
    ["\u{FFA0}",          ''], // HALFWIDTH HANGUL FILLER (in Unicode)
    ["\xe3\x80\x80",      ''], // IDEOGRAPHIC SPACE
    ["some\0/path.txt",   'some/path.txt'],
    ["some\x00/path.txt", 'some/path.txt'],
    ["s\ttring",          'string'],
    ["str\x09ing",        'string'],
 ];

 var_dump('Running test cases...');
 echo "<br />\n";

 // One "Passed: ..." line is printed per case, in the order listed above.
 foreach ($tests as list($input, $expectedOutput)) {
    echo 'Passed: ';
    echo removeWhitespace($input) === $expectedOutput ? 'true' : 'FALSE';
    echo "<br />\n";
 }

 var_dump('Done');