Skip to content

Instantly share code, notes, and snippets.

@chansen
Created November 16, 2015 19:47
Show Gist options
  • Save chansen/36544a219c288f09e7dc to your computer and use it in GitHub Desktop.
Save chansen/36544a219c288f09e7dc to your computer and use it in GitHub Desktop.
Faster UTF-X validation (~ 50% - 300% faster)
perl: 5.022000 (darwin 14.5.0)
ar.txt: code points: 14308 (U+0000..U+007F: 2698 U+0080..U+07FF: 11610)
Rate core this
core 26873/s -- -33%
this 39928/s 49% --
el.txt: code points: 58748 (U+0000..U+007F: 13560 U+0080..U+07FF: 45150 U+0800..U+FFFF: 38)
Rate core this
core 6612/s -- -34%
this 9962/s 51% --
en.txt: code points: 82055 (U+0000..U+007F: 81988 U+0080..U+07FF: 18 U+0800..U+FFFF: 49)
Rate core this
core 20937/s -- -75%
this 84038/s 301% --
ja.txt: code points: 64655 (U+0000..U+007F: 6913 U+0080..U+07FF: 30 U+0800..U+FFFF: 57712)
Rate core this
core 5631/s -- -51%
this 11526/s 105% --
lv.txt: code points: 127160 (U+0000..U+007F: 117031 U+0080..U+07FF: 9021 U+0800..U+FFFF: 1108)
Rate core this
core 6400/s -- -39%
this 10449/s 63% --
ru.txt: code points: 85266 (U+0000..U+007F: 19263 U+0080..U+07FF: 65639 U+0800..U+FFFF: 364)
Rate core this
core 4625/s -- -35%
this 7136/s 54% --
sv.txt: code points: 92894 (U+0000..U+007F: 89510 U+0080..U+07FF: 3213 U+0800..U+FFFF: 171)
Rate core this
core 12456/s -- -54%
this 27173/s 118% --
zh.txt: code points: 24519 (U+0000..U+007F: 5317 U+0080..U+07FF: 32 U+0800..U+FFFF: 19170)
Rate core this
core 14299/s -- -49%
this 27965/s 96% --
#!/usr/bin/perl
use strict;
use warnings;
use Inline C => Config => BUILD_NOISY => 1;
use Inline C => <<'END_C', CLEAN_AFTER_BUILD => 0;
/*
* XXX ARM supports unaligned loads?
*/
#if defined(__i386__) || defined(__x86_64__)
# define USE_UNALIGNED_U32_LOAD
#endif
/*
* XXX MSC _BitScanForward
* XXX portable implementation
*/
#if defined(__GNUC__) && ((__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
# define HAS_BUILTIN_CTZ
#endif
bool
is_utf8_string_new(const U8 *src, STRLEN len) {
const U8 *cur = src;
const U8 *end = src + (len ? len : strlen((const char *)src));
const U8 *end4 = end - 4;
U32 v;
while (cur < end4) {
#ifdef USE_UNALIGNED_U32_LOAD
v = *(const U32 *)cur;
#else
v = ((U32)cur[0] )
| ((U32)cur[1] << 8)
| ((U32)cur[2] << 16)
| ((U32)cur[3] << 24);
#endif
if ((v & 0x80) == 0) {
#ifdef HAS_BUILTIN_CTZ
cur += (v &= 0x80808080) ? __builtin_ctz(v) >> 3 : 4;
#else
cur += 1;
#endif
}
else {
check:
if ((v & 0xC0E0) == 0x80C0 && (v & 0x1E) != 0)
cur += 2;
else if ((v & 0xC0C0F0) == 0x8080E0 && (v & 0x200F) != 0)
cur += 3;
else if ((v & 0xC0C0C0F8) == 0x808080F0 && (v & 0x3007) != 0)
cur += 4;
else {
STRLEN ret;
utf8n_to_uvchr(cur, end - cur, &ret, UTF8_CHECK_ONLY);
if (ret == (STRLEN) -1)
return FALSE;
cur += ret;
}
}
}
if (cur < end) {
while (cur < end && *cur < 0x80)
cur++;
if (cur < end) {
const U8 *p = end;
v = 0;
while (p > cur)
v = (v << 8) | *--p;
goto check;
}
}
return TRUE;
}
bool
is_utf8_string_core(SV *string) {
STRLEN len;
const U8 *s = (const U8 *)SvPV_const(string, len);
if (!is_utf8_string(s, len))
croak("Bad UTF-X string");
return TRUE;
}
bool
is_utf8_string_this(SV *string) {
STRLEN len;
const U8 *s = (const U8 *)SvPV_const(string, len);
if (!is_utf8_string_new(s, len))
croak("Bad UTF-X string");
return TRUE;
}
END_C
use Benchmark qw[:hireswallclock];
use Config qw[%Config];
use IO::Dir qw[];
use Unicode::UTF8 qw[decode_utf8];
# https://github.com/chansen/p5-unicode-utf8/tree/master/benchmarks/data
my $dir = '/Users/chansen/repos/p5-unicode-utf8/benchmarks/data';
my @docs = do {
my $d = IO::Dir->new($dir)
or die qq/Could not open directory '$dir': $!/;
sort grep { /^[a-z]{2}\.txt/ } $d->read;
};
printf "perl: %s (%s %s)\n", $], @Config{qw[osname osvers]};
foreach my $doc (@docs) {
my $src = do {
open my $fh, '<:raw', "$dir/$doc" or die $!;
local $/; <$fh>;
};
my $str = decode_utf8($src);
my @ranges = (
[ 0x00, 0x7F, qr/[\x{00}-\x{7F}]/ ],
[ 0x80, 0x7FF, qr/[\x{80}-\x{7FF}]/ ],
[ 0x800, 0xFFFF, qr/[\x{800}-\x{FFFF}]/ ],
[ 0x10000, 0x10FFFF, qr/[\x{10000}-\x{10FFFF}]/ ],
);
my @out;
foreach my $r (@ranges) {
my ($start, $end, $regexp) = @$r;
my $count = () = $str =~ m/$regexp/g;
push @out, sprintf "U+%.4X..U+%.4X: %d", $start, $end, $count
if $count;
}
printf "\n\n%s: code points: %d (%s)\n", $doc, length $str, join ' ', @out;
Benchmark::cmpthese( -10, {
'core' => sub {
my $v = is_utf8_string_core($src);
},
'this' => sub {
my $v = is_utf8_string_this($src);
},
});
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment