Last active
August 29, 2015 14:25
-
-
Save chansen/254a01d69490d93297f4 to your computer and use it in GitHub Desktop.
Benchmark of slurping a UTF-8 encoded file in Perl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ perl benchmarks/slurp.pl | |
perl: 5.023001 (darwin 14.4.0) | |
Encode: 2.75 | |
Unicode::UTF8: 0.60 | |
PerlIO::encoding: 0.21 | |
PerlIO::utf8_strict: 0.006 | |
ar.txt: Size: 25918 Code points: 14308 (U+0000..U+007F: 2698 U+0080..U+07FF: 11610) | |
Rate :encoding(UTF-8) Encode :utf8_strict Unicode::UTF8 | |
:encoding(UTF-8) 3058/s -- -19% -73% -87% | |
Encode 3754/s 23% -- -67% -84% | |
:utf8_strict 11361/s 272% 203% -- -52% | |
Unicode::UTF8 23620/s 672% 529% 108% -- | |
el.txt: Size: 103974 Code points: 58748 (U+0000..U+007F: 13560 U+0080..U+07FF: 45150 U+0800..U+FFFF: 38) | |
Rate :encoding(UTF-8) Encode :utf8_strict Unicode::UTF8 | |
:encoding(UTF-8) 780/s -- -19% -73% -86% | |
Encode 958/s 23% -- -66% -83% | |
:utf8_strict 2855/s 266% 198% -- -48% | |
Unicode::UTF8 5498/s 605% 474% 93% -- | |
en.txt: Size: 82171 Code points: 82055 (U+0000..U+007F: 81988 U+0080..U+07FF: 18 U+0800..U+FFFF: 49) | |
Rate :encoding(UTF-8) Encode :utf8_strict Unicode::UTF8 | |
:encoding(UTF-8) 1111/s -- -16% -90% -96% | |
Encode 1327/s 19% -- -88% -95% | |
:utf8_strict 11446/s 931% 763% -- -60% | |
Unicode::UTF8 28635/s 2478% 2058% 150% -- | |
ja.txt: Size: 180109 Code points: 64655 (U+0000..U+007F: 6913 U+0080..U+07FF: 30 U+0800..U+FFFF: 57712) | |
Rate :encoding(UTF-8) Encode :utf8_strict Unicode::UTF8 | |
:encoding(UTF-8) 553/s -- -27% -72% -91% | |
Encode 757/s 37% -- -61% -87% | |
:utf8_strict 1960/s 254% 159% -- -67% | |
Unicode::UTF8 5915/s 970% 682% 202% -- | |
lv.txt: Size: 138397 Code points: 127160 (U+0000..U+007F: 117031 U+0080..U+07FF: 9021 U+0800..U+FFFF: 1108) | |
Rate :encoding(UTF-8) Encode :utf8_strict Unicode::UTF8 | |
:encoding(UTF-8) 605/s -- -19% -80% -91% | |
Encode 746/s 23% -- -75% -88% | |
:utf8_strict 3043/s 403% 308% -- -53% | |
Unicode::UTF8 6453/s 967% 765% 112% -- | |
ru.txt: Size: 151633 Code points: 85266 (U+0000..U+007F: 19263 U+0080..U+07FF: 65639 U+0800..U+FFFF: 364) | |
Rate :encoding(UTF-8) Encode :utf8_strict Unicode::UTF8 | |
:encoding(UTF-8) 542/s -- -19% -73% -86% | |
Encode 673/s 24% -- -66% -83% | |
:utf8_strict 2001/s 269% 197% -- -50% | |
Unicode::UTF8 4010/s 640% 496% 100% -- | |
sv.txt: Size: 96449 Code points: 92894 (U+0000..U+007F: 89510 U+0080..U+07FF: 3213 U+0800..U+FFFF: 171) | |
Rate :encoding(UTF-8) Encode :utf8_strict Unicode::UTF8 | |
:encoding(UTF-8) 923/s -- -17% -85% -93% | |
Encode 1109/s 20% -- -82% -92% | |
:utf8_strict 5998/s 550% 441% -- -56% | |
Unicode::UTF8 13604/s 1374% 1127% 127% -- | |
zh.txt: Size: 62891 Code points: 24519 (U+0000..U+007F: 5317 U+0080..U+07FF: 32 U+0800..U+FFFF: 19170) | |
Rate :encoding(UTF-8) Encode :utf8_strict Unicode::UTF8 | |
:encoding(UTF-8) 1630/s -- -23% -75% -87% | |
Encode 2104/s 29% -- -68% -83% | |
:utf8_strict 6549/s 302% 211% -- -48% | |
Unicode::UTF8 12630/s 675% 500% 93% -- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use Benchmark qw[]; | |
use Config qw[%Config]; | |
use IO::Dir qw[]; | |
use IO::File qw[SEEK_SET]; | |
use Encode qw[]; | |
use Unicode::UTF8 qw[]; | |
use PerlIO::encoding qw[]; | |
use PerlIO::utf8_strict qw[]; | |
# https://github.com/chansen/p5-unicode-utf8/tree/master/benchmarks/data | |
# Directory that holds the sample documents, relative to the current
# working directory.
my $dir = 'benchmarks/data';

# Sorted names of the sample documents: directory entries named with exactly
# two lowercase ASCII letters followed by ".txt" (ar.txt, en.txt, ...).
# Dies if the directory cannot be opened.
my @docs = do {
    my $d = IO::Dir->new($dir)
      or die qq/Could not open directory '$dir': $!/;

    # \A and \z anchor both ends of the name, so stray entries such as
    # "en.txt.bak" or "en.txt~" are not picked up as documents.
    sort grep { /\A[a-z]{2}\.txt\z/ } $d->read;
};
# Print the perl version and platform, followed by the version of each
# module that takes part in the comparison.
printf "perl: %s (%s %s)\n", $], $Config{osname}, $Config{osvers};

for my $module (qw[Encode Unicode::UTF8 PerlIO::encoding PerlIO::utf8_strict]) {
    # Produces one "Name: version" line per module.
    printf "%s: %s\n", $module, $module->VERSION;
}
# Code point ranges reported for every document, one per UTF-8 encoded
# length (1 to 4 octets). Each entry is
#   [ first code point, last code point, pattern matching one code point ].
# The table does not depend on the document, so it is built only once.
my @ranges = (
    [ 0x00,    0x7F,     qr/[\x{00}-\x{7F}]/ ],
    [ 0x80,    0x7FF,    qr/[\x{80}-\x{7FF}]/ ],
    [ 0x800,   0xFFFF,   qr/[\x{800}-\x{FFFF}]/ ],
    [ 0x10000, 0x10FFFF, qr/[\x{10000}-\x{10FFFF}]/ ],
);

# For each sample document: print its size and code point distribution, then
# compare four ways of slurping and decoding it.
foreach my $doc (@docs) {
    my $path = "$dir/$doc";

    # Raw, undecoded octets of the whole document.
    my $octets = do {
        open my $fh, '<:raw', $path
          or die qq/Could not open '$path': '$!'/;
        local $/;    # slurp mode: read the whole file in one go
        <$fh>;
    };

    # In slurp mode even an empty file yields '', so undef means the read
    # itself failed.
    defined $octets
      or die qq/Could not read '$path': '$!'/;

    my $string = Unicode::UTF8::decode_utf8($octets);

    # Per-range statistics; ranges without any code point are omitted.
    my @out;
    foreach my $r (@ranges) {
        my ($start, $end, $regexp) = @$r;

        # Assigning the match to an empty list forces list context; the
        # outer scalar assignment then yields the number of matches.
        my $count = () = $string =~ m/$regexp/g;
        push @out, sprintf "U+%.4X..U+%.4X: %d", $start, $end, $count
          if $count;
    }
    printf "\n\n%s: Size: %d Code points: %d (%s)\n",
      $doc, length $octets, length $string, join ' ', @out;

    # In-memory handles over the same octets, one per I/O layer under test.
    # The raw handle is shared by the two decode-after-read variants.
    open my $fh_raw, '<:raw', \$octets
      or die qq/Could not open a :raw fh: '$!'/;
    open my $fh_encoding, '<:encoding(UTF-8)', \$octets
      or die qq/Could not open a :encoding fh: '$!'/;
    open my $fh_utf8_strict, '<:utf8_strict', \$octets
      or die qq/Could not open a :utf8_strict fh: '$!'/;

    # A negative count asks Benchmark to run each sub for at least that many
    # CPU seconds. Every sub slurps its handle and then rewinds it so the
    # next iteration starts from the beginning again.
    Benchmark::cmpthese( -10, {
        ':encoding(UTF-8)' => sub {
            my $data = do { local $/; <$fh_encoding> };
            seek($fh_encoding, 0, SEEK_SET)
              or die qq/Could not rewind fh: '$!'/;
        },
        ':utf8_strict' => sub {
            my $data = do { local $/; <$fh_utf8_strict> };
            seek($fh_utf8_strict, 0, SEEK_SET)
              or die qq/Could not rewind fh: '$!'/;
        },
        'Encode' => sub {
            my $data = Encode::decode('UTF-8', do { local $/; scalar <$fh_raw> }, Encode::FB_CROAK|Encode::LEAVE_SRC);
            seek($fh_raw, 0, SEEK_SET)
              or die qq/Could not rewind fh: '$!'/;
        },
        'Unicode::UTF8' => sub {
            my $data = Unicode::UTF8::decode_utf8(do { local $/; scalar <$fh_raw> });
            seek($fh_raw, 0, SEEK_SET)
              or die qq/Could not rewind fh: '$!'/;
        },
    });
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment