chansen · August 29, 2015 14:25
diff --git a/result.txt b/result.txt
 $ perl benchmarks/slurp.pl 
 perl:                5.023001 (darwin 14.4.0)
 Encode:              2.75
 Unicode::UTF8:       0.60
 PerlIO::encoding:    0.21
 PerlIO::utf8_strict: 0.006


 ar.txt: Size: 25918 Code points: 14308 (U+0000..U+007F: 2698 U+0080..U+07FF: 11610)
                    Rate :encoding(UTF-8)      Encode :utf8_strict Unicode::UTF8
 :encoding(UTF-8)  3058/s               --        -19%         -73%          -87%
 Encode            3754/s              23%          --         -67%          -84%
 :utf8_strict     11361/s             272%        203%           --          -52%
 Unicode::UTF8    23620/s             672%        529%         108%            --


 el.txt: Size: 103974 Code points: 58748 (U+0000..U+007F: 13560 U+0080..U+07FF: 45150 U+0800..U+FFFF: 38)
                   Rate :encoding(UTF-8)       Encode :utf8_strict Unicode::UTF8
 :encoding(UTF-8)  780/s               --         -19%         -73%          -86%
 Encode            958/s              23%           --         -66%          -83%
 :utf8_strict     2855/s             266%         198%           --          -48%
 Unicode::UTF8    5498/s             605%         474%          93%            --


 en.txt: Size: 82171 Code points: 82055 (U+0000..U+007F: 81988 U+0080..U+07FF: 18 U+0800..U+FFFF: 49)
                    Rate :encoding(UTF-8)      Encode :utf8_strict Unicode::UTF8
 :encoding(UTF-8)  1111/s               --        -16%         -90%          -96%
 Encode            1327/s              19%          --         -88%          -95%
 :utf8_strict     11446/s             931%        763%           --          -60%
 Unicode::UTF8    28635/s            2478%       2058%         150%            --


 ja.txt: Size: 180109 Code points: 64655 (U+0000..U+007F: 6913 U+0080..U+07FF: 30 U+0800..U+FFFF: 57712)
                   Rate :encoding(UTF-8)       Encode :utf8_strict Unicode::UTF8
 :encoding(UTF-8)  553/s               --         -27%         -72%          -91%
 Encode            757/s              37%           --         -61%          -87%
 :utf8_strict     1960/s             254%         159%           --          -67%
 Unicode::UTF8    5915/s             970%         682%         202%            --


 lv.txt: Size: 138397 Code points: 127160 (U+0000..U+007F: 117031 U+0080..U+07FF: 9021 U+0800..U+FFFF: 1108)
                   Rate :encoding(UTF-8)       Encode :utf8_strict Unicode::UTF8
 :encoding(UTF-8)  605/s               --         -19%         -80%          -91%
 Encode            746/s              23%           --         -75%          -88%
 :utf8_strict     3043/s             403%         308%           --          -53%
 Unicode::UTF8    6453/s             967%         765%         112%            --


 ru.txt: Size: 151633 Code points: 85266 (U+0000..U+007F: 19263 U+0080..U+07FF: 65639 U+0800..U+FFFF: 364)
                   Rate :encoding(UTF-8)       Encode :utf8_strict Unicode::UTF8
 :encoding(UTF-8)  542/s               --         -19%         -73%          -86%
 Encode            673/s              24%           --         -66%          -83%
 :utf8_strict     2001/s             269%         197%           --          -50%
 Unicode::UTF8    4010/s             640%         496%         100%            --


 sv.txt: Size: 96449 Code points: 92894 (U+0000..U+007F: 89510 U+0080..U+07FF: 3213 U+0800..U+FFFF: 171)
                    Rate :encoding(UTF-8)      Encode :utf8_strict Unicode::UTF8
 :encoding(UTF-8)   923/s               --        -17%         -85%          -93%
 Encode            1109/s              20%          --         -82%          -92%
 :utf8_strict      5998/s             550%        441%           --          -56%
 Unicode::UTF8    13604/s            1374%       1127%         127%            --


 zh.txt: Size: 62891 Code points: 24519 (U+0000..U+007F: 5317 U+0080..U+07FF: 32 U+0800..U+FFFF: 19170)
                    Rate :encoding(UTF-8)      Encode :utf8_strict Unicode::UTF8
 :encoding(UTF-8)  1630/s               --        -23%         -75%          -87%
 Encode            2104/s              29%          --         -68%          -83%
 :utf8_strict      6549/s             302%        211%           --          -48%
 Unicode::UTF8    12630/s             675%        500%          93%            --
diff --git a/slurp_utf8.pl b/slurp_utf8.pl
 #!/usr/bin/perl

 use strict;
 use warnings;

 use Benchmark     qw[];
 use Config        qw[%Config];
 use IO::Dir       qw[];
 use IO::File      qw[SEEK_SET];

 use Encode              qw[];
 use Unicode::UTF8       qw[];
 use PerlIO::encoding    qw[];
 use PerlIO::utf8_strict qw[];

 # https://github.com/chansen/p5-unicode-utf8/tree/master/benchmarks/data
 my $dir  = 'benchmarks/data';

 # Names of the documents to benchmark: every entry of $dir that is called
 # exactly "<two lowercase letters>.txt" (ar.txt, en.txt, ...), sorted.
 # Dies if the directory cannot be opened.
 my @docs = do {
    my $d = IO::Dir->new($dir)
      or die qq/Could not open directory '$dir': $!/;
    # Anchor both ends of the name so leftovers such as "en.txt.bak" or
    # "en.txt~" are not picked up as benchmark input.
    sort grep { /\A[a-z]{2}\.txt\z/ } $d->read;
 };

 # Print the perl build and the version of every module under comparison, so
 # the rates that follow can be tied to a specific setup.
 my @version_lines = (
    [ "Encode:              %s\n", Encode->VERSION              ],
    [ "Unicode::UTF8:       %s\n", Unicode::UTF8->VERSION       ],
    [ "PerlIO::encoding:    %s\n", PerlIO::encoding->VERSION    ],
    [ "PerlIO::utf8_strict: %s\n", PerlIO::utf8_strict->VERSION ],
 );

 printf "perl:                %s (%s %s)\n", $], $Config{osname}, $Config{osvers};
 foreach my $line (@version_lines) {
    my ($format, $version) = @$line;
    printf($format, $version);
 }

 # Code point ranges tallied for each document, as [ first, last, matcher ].
 # Built once up front because the table does not depend on the document.
 my @ranges = (
    [    0x00,     0x7F, qr/[\x{00}-\x{7F}]/        ],
    [    0x80,    0x7FF, qr/[\x{80}-\x{7FF}]/       ],
    [   0x800,   0xFFFF, qr/[\x{800}-\x{FFFF}]/     ],
    [ 0x10000, 0x10FFFF, qr/[\x{10000}-\x{10FFFF}]/ ],
 );

 # For every document: print its size statistics, then compare four ways of
 # slurping and decoding it as UTF-8.
 foreach my $doc (@docs) {
    my $path = "$dir/$doc";

    # Read the file once as raw octets. Every benchmarked handle below reads
    # from this in-memory copy rather than from disk.
    my $octets = do {
        open my $fh, '<:raw', $path
          or die qq/Could not open '$path': $!/;
        local $/;
        <$fh>;
    };

    my $string = Unicode::UTF8::decode_utf8($octets);

    # Count the code points falling in each range; empty ranges are omitted
    # from the report.
    my @out;
    foreach my $r (@ranges) {
        my ($start, $end, $regexp) = @$r;
        # "= () =" forces list context so the number of matches is counted.
        my $count = () = $string =~ m/$regexp/g;
        push @out, sprintf "U+%.4X..U+%.4X: %d", $start, $end, $count
          if $count;
    }

    printf "\n\n%s: Size: %d Code points: %d (%s)\n",
      $doc, length $octets, length $string, join ' ', @out;

    # In-memory read handles over the same octets, one per I/O layer.
    open my $fh_raw, '<:raw', \$octets
      or die qq/Could not open a :raw fh: '$!'/;
    open my $fh_encoding, '<:encoding(UTF-8)', \$octets
      or die qq/Could not open a :encoding fh: '$!'/;
    open my $fh_utf8_strict, '<:utf8_strict', \$octets
      or die qq/Could not open a :utf8_strict fh: '$!'/;

    # A negative count asks Benchmark to run each candidate for at least that
    # many CPU seconds. Every candidate slurps its handle completely and then
    # rewinds it so the next run starts from the beginning again.
    Benchmark::cmpthese( -10, {
        ':encoding(UTF-8)' => sub {
            my $data = do { local $/; <$fh_encoding> };
            seek($fh_encoding, 0, SEEK_SET)
              or die qq/Could not rewind fh: '$!'/;
        },
        ':utf8_strict' => sub {
            my $data = do { local $/; <$fh_utf8_strict> };
            seek($fh_utf8_strict, 0, SEEK_SET)
              or die qq/Could not rewind fh: '$!'/;
        },
        'Encode' => sub {
            my $data = Encode::decode('UTF-8', do { local $/; scalar <$fh_raw> }, Encode::FB_CROAK|Encode::LEAVE_SRC);
            seek($fh_raw, 0, SEEK_SET)
              or die qq/Could not rewind fh: '$!'/;
        },
        'Unicode::UTF8' => sub {
            my $data = Unicode::UTF8::decode_utf8(do { local $/; scalar <$fh_raw> });
            seek($fh_raw, 0, SEEK_SET)
              or die qq/Could not rewind fh: '$!'/;
        },
    });
 }
	$ perl benchmarks/slurp.pl
	perl: 5.023001 (darwin 14.4.0)
	Encode: 2.75
	Unicode::UTF8: 0.60
	PerlIO::encoding: 0.21
	PerlIO::utf8_strict: 0.006


	ar.txt: Size: 25918 Code points: 14308 (U+0000..U+007F: 2698 U+0080..U+07FF: 11610)
	Rate :encoding(UTF-8) Encode :utf8_strict Unicode::UTF8
	:encoding(UTF-8) 3058/s -- -19% -73% -87%
	Encode 3754/s 23% -- -67% -84%
	:utf8_strict 11361/s 272% 203% -- -52%
	Unicode::UTF8 23620/s 672% 529% 108% --


	el.txt: Size: 103974 Code points: 58748 (U+0000..U+007F: 13560 U+0080..U+07FF: 45150 U+0800..U+FFFF: 38)
	Rate :encoding(UTF-8) Encode :utf8_strict Unicode::UTF8
	:encoding(UTF-8) 780/s -- -19% -73% -86%
	Encode 958/s 23% -- -66% -83%
	:utf8_strict 2855/s 266% 198% -- -48%
	Unicode::UTF8 5498/s 605% 474% 93% --


	en.txt: Size: 82171 Code points: 82055 (U+0000..U+007F: 81988 U+0080..U+07FF: 18 U+0800..U+FFFF: 49)
	Rate :encoding(UTF-8) Encode :utf8_strict Unicode::UTF8
	:encoding(UTF-8) 1111/s -- -16% -90% -96%
	Encode 1327/s 19% -- -88% -95%
	:utf8_strict 11446/s 931% 763% -- -60%
	Unicode::UTF8 28635/s 2478% 2058% 150% --


	ja.txt: Size: 180109 Code points: 64655 (U+0000..U+007F: 6913 U+0080..U+07FF: 30 U+0800..U+FFFF: 57712)
	Rate :encoding(UTF-8) Encode :utf8_strict Unicode::UTF8
	:encoding(UTF-8) 553/s -- -27% -72% -91%
	Encode 757/s 37% -- -61% -87%
	:utf8_strict 1960/s 254% 159% -- -67%
	Unicode::UTF8 5915/s 970% 682% 202% --


	lv.txt: Size: 138397 Code points: 127160 (U+0000..U+007F: 117031 U+0080..U+07FF: 9021 U+0800..U+FFFF: 1108)
	Rate :encoding(UTF-8) Encode :utf8_strict Unicode::UTF8
	:encoding(UTF-8) 605/s -- -19% -80% -91%
	Encode 746/s 23% -- -75% -88%
	:utf8_strict 3043/s 403% 308% -- -53%
	Unicode::UTF8 6453/s 967% 765% 112% --


	ru.txt: Size: 151633 Code points: 85266 (U+0000..U+007F: 19263 U+0080..U+07FF: 65639 U+0800..U+FFFF: 364)
	Rate :encoding(UTF-8) Encode :utf8_strict Unicode::UTF8
	:encoding(UTF-8) 542/s -- -19% -73% -86%
	Encode 673/s 24% -- -66% -83%
	:utf8_strict 2001/s 269% 197% -- -50%
	Unicode::UTF8 4010/s 640% 496% 100% --


	sv.txt: Size: 96449 Code points: 92894 (U+0000..U+007F: 89510 U+0080..U+07FF: 3213 U+0800..U+FFFF: 171)
	Rate :encoding(UTF-8) Encode :utf8_strict Unicode::UTF8
	:encoding(UTF-8) 923/s -- -17% -85% -93%
	Encode 1109/s 20% -- -82% -92%
	:utf8_strict 5998/s 550% 441% -- -56%
	Unicode::UTF8 13604/s 1374% 1127% 127% --


	zh.txt: Size: 62891 Code points: 24519 (U+0000..U+007F: 5317 U+0080..U+07FF: 32 U+0800..U+FFFF: 19170)
	Rate :encoding(UTF-8) Encode :utf8_strict Unicode::UTF8
	:encoding(UTF-8) 1630/s -- -23% -75% -87%
	Encode 2104/s 29% -- -68% -83%
	:utf8_strict 6549/s 302% 211% -- -48%
	Unicode::UTF8 12630/s 675% 500% 93% --
	#!/usr/bin/perl

	use strict;
	use warnings;

	use Benchmark qw[];
	use Config qw[%Config];
	use IO::Dir qw[];
	use IO::File qw[SEEK_SET];

	use Encode qw[];
	use Unicode::UTF8 qw[];
	use PerlIO::encoding qw[];
	use PerlIO::utf8_strict qw[];

	# https://github.com/chansen/p5-unicode-utf8/tree/master/benchmarks/data
	my $dir = 'benchmarks/data';

	# Names of the documents to benchmark: every entry of $dir that is called
	# exactly "<two lowercase letters>.txt" (ar.txt, en.txt, ...), sorted.
	# Dies if the directory cannot be opened.
	my @docs = do {
	    my $d = IO::Dir->new($dir)
	      or die qq/Could not open directory '$dir': $!/;
	    # Anchor both ends of the name so leftovers such as "en.txt.bak" or
	    # "en.txt~" are not picked up as benchmark input.
	    sort grep { /\A[a-z]{2}\.txt\z/ } $d->read;
	};

	# Print the perl build and the version of every module under comparison, so
	# the rates that follow can be tied to a specific setup.
	my @version_lines = (
	    [ "Encode: %s\n",              Encode->VERSION              ],
	    [ "Unicode::UTF8: %s\n",       Unicode::UTF8->VERSION       ],
	    [ "PerlIO::encoding: %s\n",    PerlIO::encoding->VERSION    ],
	    [ "PerlIO::utf8_strict: %s\n", PerlIO::utf8_strict->VERSION ],
	);

	printf "perl: %s (%s %s)\n", $], $Config{osname}, $Config{osvers};
	foreach my $line (@version_lines) {
	    my ($format, $version) = @$line;
	    printf($format, $version);
	}

	# Code point ranges tallied for each document, as [ first, last, matcher ].
	# Built once up front because the table does not depend on the document.
	my @ranges = (
	    [    0x00,     0x7F, qr/[\x{00}-\x{7F}]/        ],
	    [    0x80,    0x7FF, qr/[\x{80}-\x{7FF}]/       ],
	    [   0x800,   0xFFFF, qr/[\x{800}-\x{FFFF}]/     ],
	    [ 0x10000, 0x10FFFF, qr/[\x{10000}-\x{10FFFF}]/ ],
	);

	# For every document: print its size statistics, then compare four ways of
	# slurping and decoding it as UTF-8.
	foreach my $doc (@docs) {
	    my $path = "$dir/$doc";

	    # Read the file once as raw octets. Every benchmarked handle below reads
	    # from this in-memory copy rather than from disk.
	    my $octets = do {
	        open my $fh, '<:raw', $path
	          or die qq/Could not open '$path': $!/;
	        local $/;
	        <$fh>;
	    };

	    my $string = Unicode::UTF8::decode_utf8($octets);

	    # Count the code points falling in each range; empty ranges are omitted
	    # from the report.
	    my @out;
	    foreach my $r (@ranges) {
	        my ($start, $end, $regexp) = @$r;
	        # "= () =" forces list context so the number of matches is counted.
	        my $count = () = $string =~ m/$regexp/g;
	        push @out, sprintf "U+%.4X..U+%.4X: %d", $start, $end, $count
	          if $count;
	    }

	    printf "\n\n%s: Size: %d Code points: %d (%s)\n",
	      $doc, length $octets, length $string, join ' ', @out;

	    # In-memory read handles over the same octets, one per I/O layer.
	    open my $fh_raw, '<:raw', \$octets
	      or die qq/Could not open a :raw fh: '$!'/;
	    open my $fh_encoding, '<:encoding(UTF-8)', \$octets
	      or die qq/Could not open a :encoding fh: '$!'/;
	    open my $fh_utf8_strict, '<:utf8_strict', \$octets
	      or die qq/Could not open a :utf8_strict fh: '$!'/;

	    # A negative count asks Benchmark to run each candidate for at least that
	    # many CPU seconds. Every candidate slurps its handle completely and then
	    # rewinds it so the next run starts from the beginning again.
	    Benchmark::cmpthese( -10, {
	        ':encoding(UTF-8)' => sub {
	            my $data = do { local $/; <$fh_encoding> };
	            seek($fh_encoding, 0, SEEK_SET)
	              or die qq/Could not rewind fh: '$!'/;
	        },
	        ':utf8_strict' => sub {
	            my $data = do { local $/; <$fh_utf8_strict> };
	            seek($fh_utf8_strict, 0, SEEK_SET)
	              or die qq/Could not rewind fh: '$!'/;
	        },
	        'Encode' => sub {
	            # The two Encode check flags are combined with a plain bitwise OR.
	            my $data = Encode::decode('UTF-8', do { local $/; scalar <$fh_raw> }, Encode::FB_CROAK|Encode::LEAVE_SRC);
	            seek($fh_raw, 0, SEEK_SET)
	              or die qq/Could not rewind fh: '$!'/;
	        },
	        'Unicode::UTF8' => sub {
	            my $data = Unicode::UTF8::decode_utf8(do { local $/; scalar <$fh_raw> });
	            seek($fh_raw, 0, SEEK_SET)
	              or die qq/Could not rewind fh: '$!'/;
	        },
	    });
	}