Created
September 27, 2016 16:10
-
-
Save rybakit/717018d5f3292ef28fd9de6210f687aa to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * PCRE pattern that matches only when the entire subject is well-formed UTF-8.
 *
 * The subject is consumed from \A to \z as a possessive repetition of the
 * alternatives below, so the first byte that fits none of them makes the
 * whole match fail without backtracking. The pattern relies on the /x flag:
 * whitespace and the "# ..." notes inside the literal are ignored by PCRE.
 *
 * preg_match(UTF8_REGEX, $s) yields 1 for valid input (including the empty
 * string) and 0 for input containing an invalid byte sequence.
 */
const UTF8_REGEX = '/\A(?:
      [\x00-\x7F]++                      # ASCII
    | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
    | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
    | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
    | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
    | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
    | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
    | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
)*+\z/x';
/**
 * PCRE pattern that finds the first byte (or byte pair) that makes a string
 * ill-formed UTF-8.
 *
 * It is the inverse check of a whole-string validator:
 * preg_match(NON_UTF8_REGEX, $s) yields 1 when $s contains an invalid
 * sequence and 0 when none is found. The pattern relies on the /x flag, so
 * whitespace and the "# ..." notes inside the literal are ignored by PCRE.
 *
 * The surrogate and above-U+10FFFF alternatives make this pattern reject the
 * same inputs that a strict validator rejects; without them the sequences
 * ED A0..BF xx and F4 90..BF xx xx would pass as valid.
 */
const NON_UTF8_REGEX = '/(
      [\xC0-\xC1]                                              # Invalid UTF-8 Bytes
    | [\xF5-\xFF]                                              # Invalid UTF-8 Bytes
    | \xE0[\x80-\x9F]                                          # Overlong encoding of prior code point
    | \xF0[\x80-\x8F]                                          # Overlong encoding of prior code point
    | \xED[\xA0-\xBF]                                          # UTF-16 surrogate (U+D800..U+DFFF)
    | \xF4[\x90-\xBF]                                          # Code point above U+10FFFF
    | [\xC2-\xDF](?![\x80-\xBF])                               # Invalid UTF-8 Sequence Start
    | [\xE0-\xEF](?![\x80-\xBF]{2})                            # Invalid UTF-8 Sequence Start
    | [\xF0-\xF4](?![\x80-\xBF]{3})                            # Invalid UTF-8 Sequence Start
    | (?<=[\x0-\x7F\xF5-\xFF])[\x80-\xBF]                      # Invalid UTF-8 Sequence Middle
    | (?<![\xC2-\xDF]|[\xE0-\xEF]|[\xE0-\xEF][\x80-\xBF]|[\xF0-\xF4]|[\xF0-\xF4][\x80-\xBF]|[\xF0-\xF4][\x80-\xBF]{2})[\x80-\xBF] # Overlong Sequence
    | (?<=[\xE0-\xEF])[\x80-\xBF](?![\x80-\xBF])               # Short 3 byte sequence
    | (?<=[\xF0-\xF4])[\x80-\xBF](?![\x80-\xBF]{2})            # Short 4 byte sequence
    | (?<=[\xF0-\xF4][\x80-\xBF])[\x80-\xBF](?![\x80-\xBF])    # Short 4 byte sequence (2)
)/x';
/**
 * Benchmark body: applies UTF8_REGEX to the same subject $n times.
 *
 * @param int    $n   Number of preg_match() calls to perform.
 * @param string $str Subject string handed to every call.
 *
 * @return void
 *
 * @throws \RuntimeException When preg_match() reports a failure (returns
 *                           false) instead of a match / no-match result.
 */
function utf8($n, $str) {
    for ($i = 0; $i < $n; ++$i) {
        // preg_match() returns false when the engine gives up (backtrack,
        // recursion or JIT stack limit, ...). Such a call finishes almost
        // instantly, so ignoring it would report a misleadingly fast time.
        if (false === \preg_match(UTF8_REGEX, $str)) {
            throw new \RuntimeException(
                'preg_match() failed for UTF8_REGEX, preg_last_error() = ' . \preg_last_error()
            );
        }
    }
}
/**
 * Benchmark body: applies NON_UTF8_REGEX to the same subject $n times.
 *
 * @param int    $n   Number of preg_match() calls to perform.
 * @param string $str Subject string handed to every call.
 *
 * @return void
 *
 * @throws \RuntimeException When preg_match() reports a failure (returns
 *                           false) instead of a match / no-match result.
 */
function non_utf8($n, $str) {
    for ($i = 0; $i < $n; ++$i) {
        // preg_match() returns false when the engine gives up (backtrack,
        // recursion or JIT stack limit, ...). Such a call finishes almost
        // instantly, so ignoring it would report a misleadingly fast time.
        if (false === \preg_match(NON_UTF8_REGEX, $str)) {
            throw new \RuntimeException(
                'preg_match() failed for NON_UTF8_REGEX, preg_last_error() = ' . \preg_last_error()
            );
        }
    }
}
/**
 * Baseline for the benchmark: spins through $n iterations without doing any
 * work, so the cost of the bare loop can be measured on its own.
 *
 * @param int    $n   Number of iterations.
 * @param string $str Accepted only to mirror the other benchmark bodies; it
 *                    is never read.
 *
 * @return void
 */
function empty_loop($n, $str)
{
    for ($iteration = 0; $iteration < $n; ++$iteration) {
        // Intentionally empty.
    }
}
/**
 * Returns the current time of day in seconds, with the microsecond part of
 * gettimeofday() folded in as a fraction.
 *
 * @return int|float Seconds since the Unix epoch.
 */
function getmicrotime()
{
    $now = gettimeofday();

    $wholeSeconds = $now['sec'];
    $fraction = $now['usec'] / 1000000;

    return $wholeSeconds + $fraction;
}
/**
 * Opens an output buffer and returns the timestamp at which measuring begins.
 *
 * The buffer opened here is the one end_test() later discards with
 * ob_end_clean().
 *
 * @return int|float Value of getmicrotime() taken right after the buffer is
 *                   opened.
 */
function start_test()
{
    ob_start();

    $startedAt = getmicrotime();

    return $startedAt;
}
/**
 * Finishes one timed section, prints its result line and starts the next one.
 *
 * The elapsed time since $start is stored in the global $last_time and added
 * to the global $total. The currently open output buffer is discarded before
 * the result line is printed, and a fresh buffer is opened afterwards.
 *
 * @param int|float      $start    Timestamp returned by start_test() or by
 *                                 the previous end_test() call.
 * @param string         $name     Label printed at the start of the line.
 * @param int|float|null $overhead When not null, a second column with the
 *                                 elapsed time minus this value is printed.
 *
 * @return int|float Timestamp (getmicrotime()) at which the next section
 *                   starts.
 */
function end_test($start, $name, $overhead = null)
{
    global $total;
    global $last_time;

    $end = getmicrotime();
    ob_end_clean();

    $last_time = $end - $start;
    $total += $last_time;

    $num = number_format($last_time, 3);

    // Label and time together fill 24 columns. str_pad() leaves the label
    // untouched when it is already too wide, whereas str_repeat() with the
    // resulting negative count raises a warning (PHP 7) or ValueError (PHP 8).
    $label = str_pad($name, 24 - strlen($num));

    if (is_null($overhead)) {
        echo $label.$num."\n";
    } else {
        $num2 = number_format($last_time - $overhead, 3);
        echo $label.$num." ".$num2."\n";
    }

    ob_start();
    return getmicrotime();
}
/**
 * Prints a 24-dash separator followed by a "Total" line that shows the global
 * $total formatted with three decimals, right-aligned to 24 columns.
 *
 * @return void
 */
function total()
{
    global $total;

    $width = 24;

    echo str_repeat("-", $width), "\n";

    $label = "Total";
    $formatted = number_format($total, 3);
    $gap = str_repeat(" ", $width - strlen($label) - strlen($formatted));

    echo $label, $gap, $formatted, "\n";
}
// Number of iterations each benchmark body performs.
const N = 500000;

// Subject under test: 65535 copies of the ASCII letter "c".
$str = str_repeat('c', 65535);

$t = start_test();
$t0 = $t;

// Time the bare loop first; end_test() leaves the elapsed time in $last_time,
// which is then shown as the overhead-adjusted column for the real runs.
empty_loop(N, $str);
$t = end_test($t, 'empty_loop');
$overhead = $last_time;

utf8(N, $str);
$t = end_test($t, 'utf8', $overhead);

non_utf8(N, $str);
$t = end_test($t, 'non_utf8', $overhead);

total();
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ php -d pcre.jit=1 tests/bench_utf8_regex.php
empty_loop         0.004
utf8              14.833 14.829
non_utf8          14.713 14.709
------------------------
Total             29.550
$ php -d pcre.jit=0 tests/bench_utf8_regex.php
empty_loop         0.004
utf8              81.329 81.325
^C
$ php -v
PHP 7.0.5 (cli) (built: Apr 23 2016 10:48:01) ( NTS )
Copyright (c) 1997-2016 The PHP Group
Zend Engine v3.0.0, Copyright (c) 1998-2016 Zend Technologies
    with Zend OPcache v7.0.6-dev, Copyright (c) 1999-2016, by Zend Technologies
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment