Created
November 16, 2015 19:47
-
-
Save chansen/36544a219c288f09e7dc to your computer and use it in GitHub Desktop.
Faster UTF-X validation (~ 50% - 300% faster)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
perl: 5.022000 (darwin 14.5.0)

ar.txt: code points: 14308 (U+0000..U+007F: 2698 U+0080..U+07FF: 11610)
        Rate core this
core 26873/s   -- -33%
this 39928/s  49%   --

el.txt: code points: 58748 (U+0000..U+007F: 13560 U+0080..U+07FF: 45150 U+0800..U+FFFF: 38)
       Rate core this
core 6612/s   -- -34%
this 9962/s  51%   --

en.txt: code points: 82055 (U+0000..U+007F: 81988 U+0080..U+07FF: 18 U+0800..U+FFFF: 49)
        Rate core this
core 20937/s   -- -75%
this 84038/s 301%   --

ja.txt: code points: 64655 (U+0000..U+007F: 6913 U+0080..U+07FF: 30 U+0800..U+FFFF: 57712)
        Rate core this
core  5631/s   -- -51%
this 11526/s 105%   --

lv.txt: code points: 127160 (U+0000..U+007F: 117031 U+0080..U+07FF: 9021 U+0800..U+FFFF: 1108)
        Rate core this
core  6400/s   -- -39%
this 10449/s  63%   --

ru.txt: code points: 85266 (U+0000..U+007F: 19263 U+0080..U+07FF: 65639 U+0800..U+FFFF: 364)
       Rate core this
core 4625/s   -- -35%
this 7136/s  54%   --

sv.txt: code points: 92894 (U+0000..U+007F: 89510 U+0080..U+07FF: 3213 U+0800..U+FFFF: 171)
        Rate core this
core 12456/s   -- -54%
this 27173/s 118%   --

zh.txt: code points: 24519 (U+0000..U+007F: 5317 U+0080..U+07FF: 32 U+0800..U+FFFF: 19170)
        Rate core this
core 14299/s   -- -49%
this 27965/s  96%   --
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use Inline C => Config => BUILD_NOISY => 1; | |
use Inline C => <<'END_C', CLEAN_AFTER_BUILD => 0; | |
/* | |
* XXX ARM supports unaligned loads? | |
*/ | |
#if defined(__i386__) || defined(__x86_64__) | |
# define USE_UNALIGNED_U32_LOAD | |
#endif | |
/* | |
* XXX MSC _BitScanForward | |
* XXX portable implementation | |
*/ | |
#if defined(__GNUC__) && ((__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) | |
# define HAS_BUILTIN_CTZ | |
#endif | |
bool | |
is_utf8_string_new(const U8 *src, STRLEN len) { | |
const U8 *cur = src; | |
const U8 *end = src + (len ? len : strlen((const char *)src)); | |
const U8 *end4 = end - 4; | |
U32 v; | |
while (cur < end4) { | |
#ifdef USE_UNALIGNED_U32_LOAD | |
v = *(const U32 *)cur; | |
#else | |
v = ((U32)cur[0] ) | |
| ((U32)cur[1] << 8) | |
| ((U32)cur[2] << 16) | |
| ((U32)cur[3] << 24); | |
#endif | |
if ((v & 0x80) == 0) { | |
#ifdef HAS_BUILTIN_CTZ | |
cur += (v &= 0x80808080) ? __builtin_ctz(v) >> 3 : 4; | |
#else | |
cur += 1; | |
#endif | |
} | |
else { | |
check: | |
if ((v & 0xC0E0) == 0x80C0 && (v & 0x1E) != 0) | |
cur += 2; | |
else if ((v & 0xC0C0F0) == 0x8080E0 && (v & 0x200F) != 0) | |
cur += 3; | |
else if ((v & 0xC0C0C0F8) == 0x808080F0 && (v & 0x3007) != 0) | |
cur += 4; | |
else { | |
STRLEN ret; | |
utf8n_to_uvchr(cur, end - cur, &ret, UTF8_CHECK_ONLY); | |
if (ret == (STRLEN) -1) | |
return FALSE; | |
cur += ret; | |
} | |
} | |
} | |
if (cur < end) { | |
while (cur < end && *cur < 0x80) | |
cur++; | |
if (cur < end) { | |
const U8 *p = end; | |
v = 0; | |
while (p > cur) | |
v = (v << 8) | *--p; | |
goto check; | |
} | |
} | |
return TRUE; | |
} | |
bool | |
is_utf8_string_core(SV *string) { | |
STRLEN len; | |
const U8 *s = (const U8 *)SvPV_const(string, len); | |
if (!is_utf8_string(s, len)) | |
croak("Bad UTF-X string"); | |
return TRUE; | |
} | |
bool | |
is_utf8_string_this(SV *string) { | |
STRLEN len; | |
const U8 *s = (const U8 *)SvPV_const(string, len); | |
if (!is_utf8_string_new(s, len)) | |
croak("Bad UTF-X string"); | |
return TRUE; | |
} | |
END_C | |
use Benchmark qw[:hireswallclock]; | |
use Config qw[%Config]; | |
use IO::Dir qw[]; | |
use Unicode::UTF8 qw[decode_utf8]; | |
# Benchmark driver: for every two-letter "<lang>.txt" sample in $dir, print a
# per-range code point count and compare the core validator ("core") against
# the word-at-a-time one ("this").
#
# The data directory may be passed as the first command line argument; without
# one the original author's checkout path is used. Sample data:
# https://github.com/chansen/p5-unicode-utf8/tree/master/benchmarks/data
my $dir = @ARGV
    ? $ARGV[0]
    : '/Users/chansen/repos/p5-unicode-utf8/benchmarks/data';

my @docs = do {
    my $d = IO::Dir->new($dir)
        or die qq/Could not open directory '$dir': $!/;
    # Anchor both ends so backups such as "en.txt~" or "en.txt.bak" are skipped.
    sort grep { /\A[a-z]{2}\.txt\z/ } $d->read;
};

# Code point ranges reported per document: [ first, last, matcher ].
# The boundaries coincide with the 1-, 2-, 3- and 4-byte UTF-8 encodings.
# Loop-invariant, so built once rather than once per document.
my @ranges = (
    [ 0x00,    0x7F,     qr/[\x{00}-\x{7F}]/         ],
    [ 0x80,    0x7FF,    qr/[\x{80}-\x{7FF}]/        ],
    [ 0x800,   0xFFFF,   qr/[\x{800}-\x{FFFF}]/      ],
    [ 0x10000, 0x10FFFF, qr/[\x{10000}-\x{10FFFF}]/  ],
);

printf "perl: %s (%s %s)\n", $], @Config{qw[osname osvers]};

foreach my $doc (@docs) {
    my $path = "$dir/$doc";

    # Raw octets: this is what the validators are timed on.
    my $src = do {
        open my $fh, '<:raw', $path
            or die qq/Could not open file '$path': $!/;
        local $/;
        <$fh>;
    };

    # Decoded copy, used only for the code point statistics.
    my $str = decode_utf8($src);

    my @out;
    foreach my $r (@ranges) {
        my ($start, $end, $regexp) = @$r;
        # List assignment in scalar context yields the number of matches.
        my $count = () = $str =~ m/$regexp/g;
        next unless $count;
        push @out, sprintf("U+%.4X..U+%.4X: %d", $start, $end, $count);
    }

    printf "\n\n%s: code points: %d (%s)\n",
        $doc, length($str), join(' ', @out);

    # Negative count: run each candidate for at least 10 CPU seconds.
    Benchmark::cmpthese( -10, {
        'core' => sub {
            my $v = is_utf8_string_core($src);
        },
        'this' => sub {
            my $v = is_utf8_string_this($src);
        },
    });
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment