Created
April 28, 2010 08:55
-
-
Save makamaka/381890 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!perl | |
# Benchmark: ways of detecting the Unicode encoding (UTF-8/16/32, BE/LE) of JSON text from its leading bytes
use strict; | |
use Modern::Perl; | |
use Benchmark qw(:all); | |
use utf8; | |
use Encode; | |
# >>> copied from Mojo::JSON | |
# Lookup table: leading-byte pattern => encoding name.
# The keys are double-quoted strings, so every \0 is a literal NUL byte by
# the time the key is stored.  test_mojo_json interpolates each key into a
# start-anchored regex.  The four patterns are mutually exclusive, so the
# (unordered) hash iteration order does not affect the result.
my $UTF_PATTERNS = {
    # little-endian: the non-NUL octet comes first
    "[^\0]\0[^\0]\0" => 'UTF-16LE',
    "[^\0]\0\0\0"    => 'UTF-32LE',

    # big-endian: the text starts with NUL octet(s)
    "\0[^\0]\0[^\0]" => 'UTF-16BE',
    "\0\0\0[^\0]"    => 'UTF-32BE',
};
# Matches one Unicode byte-order mark, spelled as octal byte escapes.
# The pattern itself is unanchored; callers prepend their own anchor.
# Only the relative order of the two marks that begin with \377\376 matters:
# the four-byte UTF-32LE mark has to be tried before the two-byte UTF-16LE
# mark, which is a prefix of it.
my $BOM_RE = qr{
    (?:
        \377\376\0\0    # UTF-32LE, ahead of its UTF-16LE prefix
      |
        \377\376        # UTF-16LE
      |
        \0\0\376\377    # UTF-32BE
      |
        \376\377        # UTF-16BE
      |
        \357\273\277    # UTF-8
    )
}x;
# <<< copied from Mojo::JSON::XS | |
# Sample JSON document used as benchmark input, as UTF-8 octets.
my $utf8 = q|{"foo":"bar"}|;

# Decode to a character string once; every encoded variant is derived from it.
my $json_chars = Encode::decode( 'utf8', $utf8 );

# The same document in each UTF-16 / UTF-32 flavour.
my $utf16   = Encode::encode( 'UTF-16',   $json_chars );
my $utf32   = Encode::encode( 'UTF-32',   $json_chars );
my $utf16be = Encode::encode( 'UTF-16BE', $json_chars );
my $utf32be = Encode::encode( 'UTF-32BE', $json_chars );
my $utf16le = Encode::encode( 'UTF-16LE', $json_chars );
my $utf32le = Encode::encode( 'UTF-32LE', $json_chars );

# The endianness-less 'UTF-16' / 'UTF-32' encoders emit a leading BOM (see
# the Encode::Unicode documentation); strip it so only the payload remains.
# A single start-anchored substitution suffices: /g cannot match twice with
# an anchor, and /o is unnecessary for a precompiled qr// object.
$utf16 =~ s/\A$BOM_RE//;
$utf32 =~ s/\A$BOM_RE//;
# Number of iterations handed to Benchmark for every candidate.
my $count = 300000;

# One comparison table per input encoding.  Each entry pairs the label
# suffix shown in the report with the octet string fed to the detectors.
# The UTF-16 and UTF-32 runs use the little-endian samples.
for my $case ( [ utf8 => $utf8 ], [ utf16 => $utf16le ], [ utf32 => $utf32le ] ) {
    my ( $label, $octets ) = @$case;

    cmpthese( $count, {
        "unpack $label"    => sub { test_unpack( $octets ); },
        "regexp $label"    => sub { test_regexp( $octets ); },
        "mojo_json $label" => sub { test_mojo_json( $octets ); },
        "hybride $label"   => sub { test_hybride( $octets ); },
    } );
}
# | |
# | |
# | |
# Detect the encoding of $text by trying every entry of $UTF_PATTERNS.
# (Copied and modified from Mojo::JSON.)
#
# Each key of the table is interpolated into a start-anchored regex; the
# encoding name of the first pattern that matches is returned.  When none
# matches the text is reported as 'UTF-8'.
sub test_mojo_json {
    my ( $text ) = @_;

    for my $signature ( keys %$UTF_PATTERNS ) {
        return $UTF_PATTERNS->{$signature} if $text =~ /^$signature/;
    }

    # No UTF-16/UTF-32 signature found.
    return 'UTF-8';
}
# Detect the encoding of $text from which of its first three octets are
# zero.  (Copied from JSON::PP.)
#
#   nonzero nonzero     -> UTF-8
#   zero    nonzero     -> UTF-16BE
#   zero    zero        -> UTF-32BE
#   nonzero zero  nonzero -> UTF-16LE
#   nonzero zero  zero    -> UTF-32LE
#
# Octets missing from a short string unpack as undef and count as zero.
sub test_unpack {
    my ( $text ) = @_;
    my ( $first, $second, $third ) = unpack( 'C3', $text );

    # A nonzero second octet: only the first octet is left to decide.
    if ( $second ) {
        return $first ? 'UTF-8' : 'UTF-16BE';
    }

    # Second octet is zero from here on.
    return 'UTF-32BE' unless $first;
    return $third ? 'UTF-16LE' : 'UTF-32LE';
}
# Regexp version of test_unpack: detect the encoding of $text from which of
# its leading octets are NUL.
#
# The alternation captures into exactly one group:
#   $1  two non-NUL octets         -> UTF-8
#   $2  NUL then non-NUL           -> UTF-16BE
#   $3  two NUL octets             -> UTF-32BE
#   $4  third octet, reached only when the first is non-NUL and the second
#       is NUL; NUL means UTF-32LE, anything else UTF-16LE
#
# Returns 'unknown' when nothing matches (e.g. a non-NUL, NUL text shorter
# than three octets).
sub test_regexp {
    my ( $text ) = @_;

    # /s is required: the octets are arbitrary bytes, and without it "."
    # refuses to match 0x0A, so UTF-16LE/UTF-32LE text containing a newline
    # byte in its first three octets was wrongly reported as 'unknown'.
    return 'unknown'
        unless $text =~ /^(?:([^\0][^\0])|(\0[^\0])|(\0\0)|..(.))/s;

    return defined $1 ? 'UTF-8'
         : defined $2 ? 'UTF-16BE'
         : defined $3 ? 'UTF-32BE'
         : $4 eq "\0" ? 'UTF-32LE'
         :              'UTF-16LE';
}
# Hybrid detector: a regexp classifies the first two octets, and unpack is
# consulted only when that regexp does not match.
#
# A match means UTF-8 (two non-NUL octets), UTF-16BE (NUL then non-NUL) or
# UTF-32BE (two NULs).  Otherwise the third octet decides between
# UTF-16LE (nonzero) and UTF-32LE (zero or absent).
sub test_hybride {
    my ( $text ) = @_;

    # No match on the two-octet prefix: fall back to the third octet.
    if ( $text !~ /^(?:([^\0][^\0])|(\0[^\0])|(\0\0))/ ) {
        my ( undef, undef, $third ) = unpack( 'C3', $text );
        return $third ? 'UTF-16LE' : 'UTF-32LE';
    }

    # The match succeeded, so exactly one capture group is set.
    return 'UTF-8'    if $1;
    return 'UTF-16BE' if $2;
    return 'UTF-32BE';
}
__END__ | |
Rate mojo_json utf8 regexp utf8 hybride utf8 unpack utf8 | |
mojo_json utf8 31780/s -- -94% -94% -94% | |
regexp utf8 526316/s 1556% -- -4% -7% | |
hybride utf8 545455/s 1616% 4% -- -4% | |
unpack utf8 566038/s 1681% 8% 4% -- | |
Rate mojo_json utf16 regexp utf16 hybride utf16 unpack utf16 | |
mojo_json utf16 315789/s -- -7% -32% -39% | |
regexp utf16 340909/s 8% -- -26% -34% | |
hybride utf16 461538/s 46% 35% -- -11% | |
unpack utf16 517241/s 64% 52% 12% -- | |
Rate mojo_json utf32 regexp utf32 hybride utf32 unpack utf32 | |
mojo_json utf32 37313/s -- -89% -91% -93% | |
regexp utf32 329670/s 784% -- -24% -34% | |
hybride utf32 434783/s 1065% 32% -- -13% | |
unpack utf32 500000/s 1240% 52% 15% -- |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment