chansen · November 16, 2015 19:47
diff --git a/results.txt b/results.txt
 perl: 5.022000 (darwin 14.5.0)

 ar.txt: code points: 14308 (U+0000..U+007F: 2698 U+0080..U+07FF: 11610)
        Rate core this
 core 26873/s   -- -33%
 this 39928/s  49%   --


 el.txt: code points: 58748 (U+0000..U+007F: 13560 U+0080..U+07FF: 45150 U+0800..U+FFFF: 38)
       Rate core this
 core 6612/s   -- -34%
 this 9962/s  51%   --


 en.txt: code points: 82055 (U+0000..U+007F: 81988 U+0080..U+07FF: 18 U+0800..U+FFFF: 49)
        Rate core this
 core 20937/s   -- -75%
 this 84038/s 301%   --


 ja.txt: code points: 64655 (U+0000..U+007F: 6913 U+0080..U+07FF: 30 U+0800..U+FFFF: 57712)
        Rate core this
 core  5631/s   -- -51%
 this 11526/s 105%   --


 lv.txt: code points: 127160 (U+0000..U+007F: 117031 U+0080..U+07FF: 9021 U+0800..U+FFFF: 1108)
        Rate core this
 core  6400/s   -- -39%
 this 10449/s  63%   --


 ru.txt: code points: 85266 (U+0000..U+007F: 19263 U+0080..U+07FF: 65639 U+0800..U+FFFF: 364)
       Rate core this
 core 4625/s   -- -35%
 this 7136/s  54%   --


 sv.txt: code points: 92894 (U+0000..U+007F: 89510 U+0080..U+07FF: 3213 U+0800..U+FFFF: 171)
        Rate core this
 core 12456/s   -- -54%
 this 27173/s 118%   --


 zh.txt: code points: 24519 (U+0000..U+007F: 5317 U+0080..U+07FF: 32 U+0800..U+FFFF: 19170)
        Rate core this
 core 14299/s   -- -49%
 this 27965/s  96%   --
diff --git a/utfx.pl b/utfx.pl
 #!/usr/bin/perl
 use strict;
 use warnings;

 use Inline C => Config => BUILD_NOISY => 1;
 use Inline C => <<'END_C', CLEAN_AFTER_BUILD => 0;

 /*
 *  USE_UNALIGNED_U32_LOAD: when defined, is_utf8_string_new() fetches four
 *  input bytes at once as a native U32 instead of assembling the value from
 *  individual bytes.  Only enabled for x86 / x86-64 targets.
 *
 *  NOTE(review): the byte-wise fallback places byte 0 in the low-order bits
 *  and the validation masks rely on that layout, so any target added here
 *  presumably has to be little-endian as well -- confirm before extending.
 *
 *  XXX ARM supports unaligned loads?
 */
 #if defined(__i386__) || defined(__x86_64__)
 # define USE_UNALIGNED_U32_LOAD
 #endif

 /*
 *  HAS_BUILTIN_CTZ: when defined, is_utf8_string_new() uses __builtin_ctz()
 *  to step over a run of ASCII bytes in one go; without it the ASCII path
 *  advances one byte per iteration.  Enabled for compilers that identify
 *  themselves as GCC 3.4 or newer.
 *
 *  XXX MSC _BitScanForward
 *  XXX portable implementation
 */
 #if defined(__GNUC__) && ((__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
 # define HAS_BUILTIN_CTZ
 #endif

 /*
 *  Check that a byte buffer is well-formed UTF-X.
 *
 *  src  start of the buffer
 *  len  number of bytes to check; 0 means the length is taken from
 *       strlen(src)
 *
 *  Returns TRUE if the whole buffer validates, FALSE at the first sequence
 *  that utf8n_to_uvchr(..., UTF8_CHECK_ONLY) rejects.
 *
 *  Input is consumed four bytes at a time through v, which always holds
 *  cur[0] in its low-order byte.  Runs of ASCII are skipped, the common
 *  2-, 3- and 4-byte sequences are recognised with mask compares, and
 *  anything else is handed to utf8n_to_uvchr().
 */
 bool
 is_utf8_string_new(const U8 *src, STRLEN len) {
    const U8 *cur = src;
    const U8 *end = src + (len ? len : strlen((const char *)src));
    U32 v;

    /*
     *  Loop while more than four bytes remain.  The remaining length is
     *  compared instead of precomputing "end - 4", because for inputs
     *  shorter than four bytes that pointer would lie before the buffer.
     */
    while (end - cur > 4) {
 #ifdef USE_UNALIGNED_U32_LOAD
        /*
         *  memcpy rather than dereferencing a casted U32 pointer: cur is
         *  not guaranteed to be aligned and the cast breaks the aliasing
         *  rules.  Compilers turn this fixed-size copy into a plain load.
         */
        memcpy(&v, cur, sizeof(v));
 #else
        v = ((U32)cur[0]      )
          | ((U32)cur[1] <<  8)
          | ((U32)cur[2] << 16)
          | ((U32)cur[3] << 24);
 #endif
        if ((v & 0x80) == 0) {
            /* cur[0] is ASCII. */
 #ifdef HAS_BUILTIN_CTZ
            /*
             *  Keep only the high bit of each byte; the lowest set bit then
             *  marks the first non-ASCII byte (bit index / 8 = byte index).
             *  No bit set means all four bytes are ASCII.
             */
            cur += (v &= 0x80808080) ? __builtin_ctz(v) >> 3 : 4;
 #else
            cur += 1;
 #endif
        }
        else {
          check:
            /*
             *  Also entered via goto from the tail handling below, with v
             *  zero-padded; a zero byte never matches a continuation byte
             *  pattern, so truncated sequences fall through to the slow path.
             */

            /* 110xxxxx 10xxxxxx, lead byte not C0/C1 (overlong). */
            if ((v & 0xC0E0) == 0x80C0 && (v & 0x1E) != 0)
                cur += 2;
            /* 1110xxxx 10xxxxxx 10xxxxxx, not E0 80..9F (overlong). */
            else if ((v & 0xC0C0F0) == 0x8080E0 && (v & 0x200F) != 0)
                cur += 3;
            /* 11110xxx + three continuations, not F0 80..8F (overlong). */
            else if ((v & 0xC0C0C0F8) == 0x808080F0 && (v & 0x3007) != 0)
                cur += 4;
            else {
                /* Slow path: let perl decide and report the length. */
                STRLEN ret;

                utf8n_to_uvchr(cur, end - cur, &ret, UTF8_CHECK_ONLY);
                if (ret == (STRLEN) -1)
                    return FALSE;
                cur += ret;
            }
        }
    }

    /* Tail: at most four bytes are left at this point. */
    if (cur < end) {
        while (cur < end && *cur < 0x80)
            cur++;
        if (cur < end) {
            /*
             *  Pack the remaining bytes into v (byte 0 lowest, missing
             *  bytes zero) and reuse the checks inside the loop.  The loop
             *  condition is false afterwards, so control comes back here.
             */
            const U8 *p = end;
            v = 0;
            while (p > cur)
                v = (v << 8) | *--p;
            goto check;
        }
    }
    return TRUE;
 }

 /*
 *  Benchmark entry point for the validator shipped with perl: runs
 *  is_utf8_string() over the string's bytes.  Croaks if the check fails,
 *  otherwise returns TRUE.
 */
 bool
 is_utf8_string_core(SV *string) {
    STRLEN nbytes;
    const U8 *bytes = (const U8 *)SvPV_const(string, nbytes);
    const bool well_formed = is_utf8_string(bytes, nbytes);

    if (!well_formed) {
        croak("Bad UTF-X string");
    }
    return TRUE;
 }

 /*
 *  Benchmark entry point for the candidate validator: runs
 *  is_utf8_string_new() over the string's bytes.  Croaks if the check
 *  fails, otherwise returns TRUE.
 */
 bool
 is_utf8_string_this(SV *string) {
    STRLEN nbytes;
    const U8 *bytes = (const U8 *)SvPV_const(string, nbytes);
    const bool well_formed = is_utf8_string_new(bytes, nbytes);

    if (!well_formed) {
        croak("Bad UTF-X string");
    }
    return TRUE;
 }
 END_C

 use Benchmark     qw[:hireswallclock];
 use Config        qw[%Config];
 use IO::Dir       qw[];
 use Unicode::UTF8 qw[decode_utf8];

 # Directory holding the sample documents.  It may be passed as the first
 # command-line argument; without one the original hard-coded checkout of
 # https://github.com/chansen/p5-unicode-utf8/tree/master/benchmarks/data
 # is used.
 my $dir = @ARGV
   ? $ARGV[0]
   : '/Users/chansen/repos/p5-unicode-utf8/benchmarks/data';

 # Sorted names of the sample files: two lowercase letters followed by ".txt".
 my @docs = do {
    my $d = IO::Dir->new($dir)
      or die qq/Could not open directory '$dir': $!/;

    # Anchored at both ends so that leftovers such as "en.txt.orig" or
    # "en.txt~" are not picked up and benchmarked.
    sort grep { /\A[a-z]{2}\.txt\z/ } $d->read;
 };

 printf "perl: %s (%s %s)\n", $], @Config{qw[osname osvers]};

 # Code point ranges reported for every document: [first, last, pattern].
 # They do not depend on the document, so they are built (and the patterns
 # compiled) once instead of on every iteration.
 my @ranges = (
    [    0x00,     0x7F, qr/[\x{00}-\x{7F}]/        ],
    [    0x80,    0x7FF, qr/[\x{80}-\x{7FF}]/       ],
    [   0x800,   0xFFFF, qr/[\x{800}-\x{FFFF}]/     ],
    [ 0x10000, 0x10FFFF, qr/[\x{10000}-\x{10FFFF}]/ ],
 );

 # For each sample document: print its code point distribution, then compare
 # the two validators on the raw octets for at least 10 CPU seconds each.
 foreach my $doc (@docs) {
    my $path = "$dir/$doc";

    # Slurp the file as raw octets; this is what both validators are given.
    my $src = do {
        open my $fh, '<:raw', $path
          or die qq/Could not open file '$path': $!/;
        local $/;
        <$fh>;
    };

    # Decoded copy, used only for the statistics line.
    my $str = decode_utf8($src);

    my @out;
    foreach my $r (@ranges) {
        my ($start, $end, $regexp) = @$r;

        # Assigning the matches to an empty list yields their count.
        my $count = () = $str =~ m/$regexp/g;
        push @out, sprintf "U+%.4X..U+%.4X: %d", $start, $end, $count
          if $count;
    }

    printf "\n\n%s: code points: %d (%s)\n", $doc, length($str), join ' ', @out;
    Benchmark::cmpthese( -10, {
        'core' => sub {
            my $v = is_utf8_string_core($src);
        },
        'this' => sub {
            my $v = is_utf8_string_this($src);
        },
    });
 }
	perl: 5.022000 (darwin 14.5.0)

	ar.txt: code points: 14308 (U+0000..U+007F: 2698 U+0080..U+07FF: 11610)
	Rate core this
	core 26873/s -- -33%
	this 39928/s 49% --


	el.txt: code points: 58748 (U+0000..U+007F: 13560 U+0080..U+07FF: 45150 U+0800..U+FFFF: 38)
	Rate core this
	core 6612/s -- -34%
	this 9962/s 51% --


	en.txt: code points: 82055 (U+0000..U+007F: 81988 U+0080..U+07FF: 18 U+0800..U+FFFF: 49)
	Rate core this
	core 20937/s -- -75%
	this 84038/s 301% --


	ja.txt: code points: 64655 (U+0000..U+007F: 6913 U+0080..U+07FF: 30 U+0800..U+FFFF: 57712)
	Rate core this
	core 5631/s -- -51%
	this 11526/s 105% --


	lv.txt: code points: 127160 (U+0000..U+007F: 117031 U+0080..U+07FF: 9021 U+0800..U+FFFF: 1108)
	Rate core this
	core 6400/s -- -39%
	this 10449/s 63% --


	ru.txt: code points: 85266 (U+0000..U+007F: 19263 U+0080..U+07FF: 65639 U+0800..U+FFFF: 364)
	Rate core this
	core 4625/s -- -35%
	this 7136/s 54% --


	sv.txt: code points: 92894 (U+0000..U+007F: 89510 U+0080..U+07FF: 3213 U+0800..U+FFFF: 171)
	Rate core this
	core 12456/s -- -54%
	this 27173/s 118% --


	zh.txt: code points: 24519 (U+0000..U+007F: 5317 U+0080..U+07FF: 32 U+0800..U+FFFF: 19170)
	Rate core this
	core 14299/s -- -49%
	this 27965/s 96% --
	#!/usr/bin/perl
	use strict;
	use warnings;

	use Inline C => Config => BUILD_NOISY => 1;
	use Inline C => <<'END_C', CLEAN_AFTER_BUILD => 0;

	/*
	 *  USE_UNALIGNED_U32_LOAD: when defined, is_utf8_string_new() fetches four
	 *  input bytes at once as a native U32 instead of assembling the value
	 *  from individual bytes.  Only enabled for x86 / x86-64 targets.
	 *
	 *  XXX ARM supports unaligned loads?
	 */
	#if defined(__i386__) || defined(__x86_64__)
	# define USE_UNALIGNED_U32_LOAD
	#endif

	/*
	 *  HAS_BUILTIN_CTZ: when defined, is_utf8_string_new() uses
	 *  __builtin_ctz() to step over a run of ASCII bytes in one go.  Enabled
	 *  for compilers that identify themselves as GCC 3.4 or newer.
	 *
	 *  XXX MSC _BitScanForward
	 *  XXX portable implementation
	 */
	#if defined(__GNUC__) && ((__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
	# define HAS_BUILTIN_CTZ
	#endif

	/*
	 *  Check that a byte buffer is well-formed UTF-X.
	 *
	 *  src  start of the buffer
	 *  len  number of bytes to check; 0 means the length is taken from
	 *       strlen(src)
	 *
	 *  Returns TRUE if the whole buffer validates, FALSE at the first sequence
	 *  that utf8n_to_uvchr(..., UTF8_CHECK_ONLY) rejects.
	 *
	 *  Input is consumed four bytes at a time through v, which always holds
	 *  cur[0] in its low-order byte.  Runs of ASCII are skipped, the common
	 *  2-, 3- and 4-byte sequences are recognised with mask compares, and
	 *  anything else is handed to utf8n_to_uvchr().
	 */
	bool
	is_utf8_string_new(const U8 *src, STRLEN len) {
	    const U8 *cur = src;
	    const U8 *end = src + (len ? len : strlen((const char *)src));
	    U32 v;

	    /*
	     *  Loop while more than four bytes remain.  The remaining length is
	     *  compared instead of precomputing "end - 4", because for inputs
	     *  shorter than four bytes that pointer would lie before the buffer.
	     */
	    while (end - cur > 4) {
	#ifdef USE_UNALIGNED_U32_LOAD
	        /*
	         *  memcpy rather than dereferencing a casted U32 pointer: cur is
	         *  not guaranteed to be aligned and the cast breaks the aliasing
	         *  rules.  Compilers turn this fixed-size copy into a plain load.
	         */
	        memcpy(&v, cur, sizeof(v));
	#else
	        v = ((U32)cur[0]      )
	          | ((U32)cur[1] <<  8)
	          | ((U32)cur[2] << 16)
	          | ((U32)cur[3] << 24);
	#endif
	        if ((v & 0x80) == 0) {
	            /* cur[0] is ASCII. */
	#ifdef HAS_BUILTIN_CTZ
	            /*
	             *  Keep only the high bit of each byte; the lowest set bit then
	             *  marks the first non-ASCII byte (bit index / 8 = byte index).
	             *  No bit set means all four bytes are ASCII.
	             */
	            cur += (v &= 0x80808080) ? __builtin_ctz(v) >> 3 : 4;
	#else
	            cur += 1;
	#endif
	        }
	        else {
	          check:
	            /*
	             *  Also entered via goto from the tail handling below, with v
	             *  zero-padded; a zero byte never matches a continuation byte
	             *  pattern, so truncated sequences reach the slow path.
	             */

	            /* 110xxxxx 10xxxxxx, lead byte not C0/C1 (overlong). */
	            if ((v & 0xC0E0) == 0x80C0 && (v & 0x1E) != 0)
	                cur += 2;
	            /* 1110xxxx 10xxxxxx 10xxxxxx, not E0 80..9F (overlong). */
	            else if ((v & 0xC0C0F0) == 0x8080E0 && (v & 0x200F) != 0)
	                cur += 3;
	            /* 11110xxx + three continuations, not F0 80..8F (overlong). */
	            else if ((v & 0xC0C0C0F8) == 0x808080F0 && (v & 0x3007) != 0)
	                cur += 4;
	            else {
	                /* Slow path: let perl decide and report the length. */
	                STRLEN ret;

	                utf8n_to_uvchr(cur, end - cur, &ret, UTF8_CHECK_ONLY);
	                if (ret == (STRLEN) -1)
	                    return FALSE;
	                cur += ret;
	            }
	        }
	    }

	    /* Tail: at most four bytes are left at this point. */
	    if (cur < end) {
	        while (cur < end && *cur < 0x80)
	            cur++;
	        if (cur < end) {
	            /*
	             *  Pack the remaining bytes into v (byte 0 lowest, missing
	             *  bytes zero) and reuse the checks inside the loop.  The loop
	             *  condition is false afterwards, so control comes back here.
	             */
	            const U8 *p = end;
	            v = 0;
	            while (p > cur)
	                v = (v << 8) | *--p;
	            goto check;
	        }
	    }
	    return TRUE;
	}

	/*
	 *  Benchmark entry point for the validator shipped with perl: runs
	 *  is_utf8_string() over the string's bytes.  Croaks if the check fails,
	 *  otherwise returns TRUE.
	 */
	bool
	is_utf8_string_core(SV *string) {
	    STRLEN len;
	    /* SvPV_const yields a pointer to the bytes and stores their count in len. */
	    const U8 *s = (const U8 *)SvPV_const(string, len);
	    if (!is_utf8_string(s, len))
	        croak("Bad UTF-X string");
	    return TRUE;
	}

	/*
	 *  Benchmark entry point for the candidate validator: runs
	 *  is_utf8_string_new() over the string's bytes.  Croaks if the check
	 *  fails, otherwise returns TRUE.
	 */
	bool
	is_utf8_string_this(SV *string) {
	    STRLEN len;
	    /* SvPV_const yields a pointer to the bytes and stores their count in len. */
	    const U8 *s = (const U8 *)SvPV_const(string, len);
	    if (!is_utf8_string_new(s, len))
	        croak("Bad UTF-X string");
	    return TRUE;
	}
	END_C

	use Benchmark qw[:hireswallclock];
	use Config qw[%Config];
	use IO::Dir qw[];
	use Unicode::UTF8 qw[decode_utf8];

	# Directory holding the sample documents.  It may be passed as the first
	# command-line argument; without one the original hard-coded checkout of
	# https://github.com/chansen/p5-unicode-utf8/tree/master/benchmarks/data
	# is used.
	my $dir = @ARGV
	  ? $ARGV[0]
	  : '/Users/chansen/repos/p5-unicode-utf8/benchmarks/data';

	# Sorted names of the sample files: two lowercase letters followed by ".txt".
	my @docs = do {
	    my $d = IO::Dir->new($dir)
	      or die qq/Could not open directory '$dir': $!/;

	    # Anchored at both ends so that leftovers such as "en.txt.orig" or
	    # "en.txt~" are not picked up and benchmarked.
	    sort grep { /\A[a-z]{2}\.txt\z/ } $d->read;
	};

	printf "perl: %s (%s %s)\n", $], @Config{qw[osname osvers]};

	# Code point ranges reported for every document: [first, last, pattern].
	# They do not depend on the document, so they are built (and the patterns
	# compiled) once instead of on every iteration.
	my @ranges = (
	    [    0x00,     0x7F, qr/[\x{00}-\x{7F}]/        ],
	    [    0x80,    0x7FF, qr/[\x{80}-\x{7FF}]/       ],
	    [   0x800,   0xFFFF, qr/[\x{800}-\x{FFFF}]/     ],
	    [ 0x10000, 0x10FFFF, qr/[\x{10000}-\x{10FFFF}]/ ],
	);

	# For each sample document: print its code point distribution, then compare
	# the two validators on the raw octets for at least 10 CPU seconds each.
	foreach my $doc (@docs) {
	    my $path = "$dir/$doc";

	    # Slurp the file as raw octets; this is what both validators are given.
	    my $src = do {
	        open my $fh, '<:raw', $path
	          or die qq/Could not open file '$path': $!/;
	        local $/;
	        <$fh>;
	    };

	    # Decoded copy, used only for the statistics line.
	    my $str = decode_utf8($src);

	    my @out;
	    foreach my $r (@ranges) {
	        my ($start, $end, $regexp) = @$r;

	        # Assigning the matches to an empty list yields their count.
	        my $count = () = $str =~ m/$regexp/g;
	        push @out, sprintf "U+%.4X..U+%.4X: %d", $start, $end, $count
	          if $count;
	    }

	    printf "\n\n%s: code points: %d (%s)\n", $doc, length($str), join ' ', @out;
	    Benchmark::cmpthese( -10, {
	        'core' => sub {
	            my $v = is_utf8_string_core($src);
	        },
	        'this' => sub {
	            my $v = is_utf8_string_this($src);
	        },
	    });
	}