Last active
January 21, 2020 09:06
-
-
Save azumakuniyuki/9e9be0c025c58f288ea52890a69c1875 to your computer and use it in GitHub Desktop.
split-and-loop-vs-regexp-and-make-hash.pl
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
# Try to improve p5-Sisimai/lib/Sisimai/Message.pm | |
use strict; | |
use warnings; | |
use feature ':5.10'; | |
use Benchmark ':all'; | |
use Test::More 'no_plan'; | |
use Data::Dumper; | |
use Encode; | |
use Sisimai::MIME; | |
use Sisimai::String; | |
use Sisimai::RFC5322; | |
my $RFC822Head = Sisimai::RFC5322->HEADERFIELDS; | |
my $FieldTable = qr/( | |
apparently-to | |
|date | |
|delivered-to | |
|envelope-from | |
|envelope-to | |
|errors-to | |
|forward-path | |
|from | |
|list-id | |
|message-id | |
|posted | |
|posted-date | |
|reply-to | |
|resent-date | |
|resent-to | |
|reverse-path | |
|return-path | |
|subject | |
|to | |
|x-envelope-from | |
|x-envelope-to | |
|x-postfix-sender | |
)/imx; | |
my $Head = << 'EOH'; | |
Return-Path: <[email protected]> | |
Received: from [192.0.2.25] (p0000-ipbfpfx00kyoto.kyoto.example.co.jp [192.0.2.25]) | |
(authenticated bits=0) | |
by smtpgw.example.jp (V8/cf) with ESMTP id r9G5FXh9018568 | |
for <[email protected]>; Wed, 16 Oct 2013 14:15:34 +0900 | |
From: "Kijitora Cat" <[email protected]> | |
Content-Type: text/plain; charset=utf-8 | |
Content-Transfer-Encoding: base64 | |
Subject: =?utf-8?B?44OQ44Km44Oz44K544Oh44O844Or44Gu44OG44K544OIKOaXpQ==?= | |
=?utf-8?B?5pys6KqeKQ==?= | |
Date: Wed, 16 Oct 2013 14:15:35 +0900 | |
Message-Id: <[email protected]> | |
To: [email protected] | |
Mime-Version: 1.0 (Apple Message framework v1283) | |
X-Mailer: Apple Mail (2.1283) | |
EOH | |
sub takeapart { | |
# Take each email header in the original message apart | |
# @param [String] heads The original message header | |
# @return [Hash] Structured message headers | |
my $heads = shift || return {}; | |
state $borderline = '__MIME_ENCODED_BOUNDARY__'; | |
$$heads =~ s/^[>]+[ ]//mg; # Remove '>' indent symbol of forwarded message | |
$$heads =~ s/=[ ]+=/=\n =/mg; # Replace ' ' with "\n" at unfolded values | |
my $previousfn = ''; | |
my $asciiarmor = {}; # Header names which has MIME encoded value | |
my $headerpart = {}; # Required headers in the original message part | |
for my $e ( split("\n", $$heads) ) { | |
# Header name as a key, The value of header as a value | |
if( $e =~ /\A[ \t]+/ ) { | |
# Continued (foled) header value from the previous line | |
next unless $previousfn; | |
# Concatenate the line if it is the value of required header | |
if( Sisimai::MIME->is_mimeencoded(\$e) ) { | |
# The line is MIME-Encoded test | |
if( $previousfn eq 'subject' ) { | |
# Subject: header | |
$headerpart->{ $previousfn } .= $borderline.$e; | |
} else { | |
# Is not Subject header | |
$headerpart->{ $previousfn } .= $e; | |
} | |
$asciiarmor->{ $previousfn } = 1; | |
} else { | |
# ASCII Characters only: Not MIME-Encoded | |
$e =~ s/\A[ \t]+//; # unfolding | |
$headerpart->{ $previousfn } .= $e; | |
$asciiarmor->{ $previousfn } //= 0; | |
} | |
} else { | |
# Header name as a key, The value of header as a value | |
my($lhs, $rhs) = split(/:[ ]*/, $e, 2); | |
next unless $lhs = lc($lhs || ''); | |
$previousfn = ''; | |
next unless exists $RFC822Head->{ $lhs }; | |
$previousfn = $lhs; | |
$headerpart->{ $previousfn } //= $rhs; | |
} | |
} | |
return $headerpart unless $headerpart->{'subject'}; | |
# Convert MIME-Encoded subject | |
if( Sisimai::String->is_8bit(\$headerpart->{'subject'}) ) { | |
# The value of ``Subject'' header is including multibyte character, | |
# is not MIME-Encoded text. | |
eval { | |
# Remove invalid byte sequence | |
Encode::decode_utf8($headerpart->{'subject'}); | |
Encode::encode_utf8($headerpart->{'subject'}); | |
}; | |
$headerpart->{'subject'} = 'MULTIBYTE CHARACTERS HAVE BEEN REMOVED' if $@; | |
} else { | |
# MIME-Encoded subject field or ASCII characters only | |
my $r = []; | |
if( $asciiarmor->{'subject'} ) { | |
# split the value of Subject by $borderline | |
for my $v ( split($borderline, $headerpart->{'subject'}) ) { | |
# Insert value to the array if the string is MIME encoded text | |
push @$r, $v if Sisimai::MIME->is_mimeencoded(\$v); | |
} | |
} else { | |
# Subject line is not MIME encoded | |
$r = [$headerpart->{'subject'}]; | |
} | |
$headerpart->{'subject'} = Sisimai::MIME->mimedecode($r); | |
} | |
return $headerpart; | |
} | |
sub useregex1 { | |
my $heads = shift || return {}; | |
my $table = {}; | |
$$heads =~ s/^[>]+[ ]//mg; # Remove '>' indent symbol of forwarded message | |
$$heads =~ s/=[ ]+=/=\n =/mg; # Replace ' ' with "\n" at unfolded values | |
my %q = $$heads =~ /^([\w-]+):[ ]*(.*?)\n(?![\s\t])/gms; | |
map { $table->{ lc $_ } = $q{ $_ } } keys %q; | |
map { $_ =~ s/\n\s+/ /; $_ =~ y/\t/ /s } values %$table; | |
return $table unless $table->{'subject'}; | |
# Convert MIME-Encoded subject | |
if( Sisimai::String->is_8bit(\$table->{'subject'}) ) { | |
# The value of ``Subject'' header is including multibyte character, | |
# is not MIME-Encoded text. | |
eval { | |
# Remove invalid byte sequence | |
Encode::decode_utf8($table->{'subject'}); | |
Encode::encode_utf8($table->{'subject'}); | |
}; | |
$table->{'subject'} = 'MULTIBYTE CHARACTERS HAVE BEEN REMOVED' if $@; | |
} else { | |
# MIME-Encoded subject field or ASCII characters only | |
my $r = []; | |
if( Sisimai::MIME->is_mimeencoded(\$table->{'subject'}) ) { | |
# split the value of Subject by $borderline | |
for my $v ( split(/ /, $table->{'subject'}) ) { | |
# Insert value to the array if the string is MIME encoded text | |
push @$r, $v if Sisimai::MIME->is_mimeencoded(\$v); | |
} | |
} else { | |
# Subject line is not MIME encoded | |
$r = [$table->{'subject'}]; | |
} | |
$table->{'subject'} = Sisimai::MIME->mimedecode($r); | |
} | |
return $table; | |
} | |
sub useregex2 { | |
my $heads = shift || return {}; | |
my $table = {}; | |
$$heads =~ s/^[>]+[ ]//mg; # Remove '>' indent symbol of forwarded message | |
$$heads =~ s/=[ ]+=/=\n =/mg; # Replace ' ' with "\n" at unfolded values | |
my %q = $$heads =~ /^$FieldTable:[ ]*(.*?)\n(?![\s\t])/gms; | |
map { $table->{ lc $_ } = $q{ $_ } } keys %q; | |
map { $_ =~ s/\n\s+/ /; $_ =~ y/\t/ /s } values %$table; | |
return $table unless $table->{'subject'}; | |
# Convert MIME-Encoded subject | |
if( Sisimai::String->is_8bit(\$table->{'subject'}) ) { | |
# The value of ``Subject'' header is including multibyte character, | |
# is not MIME-Encoded text. | |
eval { | |
# Remove invalid byte sequence | |
Encode::decode_utf8($table->{'subject'}); | |
Encode::encode_utf8($table->{'subject'}); | |
}; | |
$table->{'subject'} = 'MULTIBYTE CHARACTERS HAVE BEEN REMOVED' if $@; | |
} else { | |
# MIME-Encoded subject field or ASCII characters only | |
my $r = []; | |
if( Sisimai::MIME->is_mimeencoded(\$table->{'subject'}) ) { | |
# split the value of Subject by $borderline | |
for my $v ( split(/ /, $table->{'subject'}) ) { | |
# Insert value to the array if the string is MIME encoded text | |
push @$r, $v if Sisimai::MIME->is_mimeencoded(\$v); | |
} | |
} else { | |
# Subject line is not MIME encoded | |
$r = [$table->{'subject'}]; | |
} | |
$table->{'subject'} = Sisimai::MIME->mimedecode($r); | |
} | |
return $table; | |
} | |
sub useregex3 { | |
my $heads = shift || return {}; | |
my $table = {}; | |
$$heads =~ s/^[>]+[ ]//mg; # Remove '>' indent symbol of forwarded message | |
$$heads =~ s/=[ ]+=/=\n =/mg; # Replace ' ' with "\n" at unfolded values | |
my %q = $$heads =~ /^$FieldTable:[ ]*(.*?)\n(?![\s\t])/cgms; | |
map { $table->{ lc $_ } = $q{ $_ } } keys %q; | |
map { $_ =~ s/\n\s+/ /; $_ =~ y/\t/ /s } values %$table; | |
return $table unless $table->{'subject'}; | |
# Convert MIME-Encoded subject | |
if( Sisimai::String->is_8bit(\$table->{'subject'}) ) { | |
# The value of ``Subject'' header is including multibyte character, | |
# is not MIME-Encoded text. | |
eval { | |
# Remove invalid byte sequence | |
Encode::decode_utf8($table->{'subject'}); | |
Encode::encode_utf8($table->{'subject'}); | |
}; | |
$table->{'subject'} = 'MULTIBYTE CHARACTERS HAVE BEEN REMOVED' if $@; | |
} else { | |
# MIME-Encoded subject field or ASCII characters only | |
my $r = []; | |
if( Sisimai::MIME->is_mimeencoded(\$table->{'subject'}) ) { | |
# split the value of Subject by $borderline | |
for my $v ( split(/ /, $table->{'subject'}) ) { | |
# Insert value to the array if the string is MIME encoded text | |
push @$r, $v if Sisimai::MIME->is_mimeencoded(\$v); | |
} | |
} else { | |
# Subject line is not MIME encoded | |
$r = [$table->{'subject'}]; | |
} | |
$table->{'subject'} = Sisimai::MIME->mimedecode($r); | |
} | |
return $table; | |
} | |
my $p = [takeapart(\$Head), useregex1(\$Head), useregex2(\$Head), useregex3(\$Head)]; | |
for my $e ( @$p ) { | |
isa_ok $e, 'HASH'; | |
for my $ee ( qw|from subject date to return-path message-id| ) { | |
ok length $e->{ $ee }; | |
} | |
like $e->{'from'}, qr/kijitora/; | |
like $e->{'subject'}, qr/繝舌え繝ウ繧ケ/; | |
like $e->{'date'}, qr/Wed, 16 Oct/; | |
like $e->{'to'}, qr/bouncehammer[.]jp/; | |
like $e->{'return-path'}, qr/example[.]org/; | |
like $e->{'message-id'}, qr/BC36/; | |
} | |
printf("Running with Perl %s on %s\n%s\n", $^V, $^O, '-' x 80); | |
cmpthese(6e5, { | |
'takeapart' => sub { takeapart(\$Head) }, | |
'useregex1' => sub { useregex1(\$Head) }, | |
'useregex2' => sub { useregex2(\$Head) }, | |
'useregex3' => sub { useregex3(\$Head) }, | |
} | |
); | |
__END__ | |
Running with Perl v5.30.0 on darwin | |
-------------------------------------------------------------------------------- | |
Rate useregex1 takeapart useregex2 useregex3 | |
useregex1 19756/s -- -2% -18% -94% | |
takeapart 20080/s 2% -- -17% -94% | |
useregex2 24145/s 22% 20% -- -93% | |
useregex3 355030/s 1697% 1668% 1370% -- |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment