Skip to content

Instantly share code, notes, and snippets.

@azumakuniyuki
Last active January 21, 2020 09:06
Show Gist options
  • Save azumakuniyuki/9e9be0c025c58f288ea52890a69c1875 to your computer and use it in GitHub Desktop.
Save azumakuniyuki/9e9be0c025c58f288ea52890a69c1875 to your computer and use it in GitHub Desktop.
split-and-loop-vs-regexp-and-make-hash.pl
#!/usr/bin/env perl
# Try to improve p5-Sisimai/lib/Sisimai/Message.pm
use strict;
use warnings;
use feature ':5.10';
use Benchmark ':all';
use Test::More 'no_plan';
use Data::Dumper;
use Encode;
use Sisimai::MIME;
use Sisimai::String;
use Sisimai::RFC5322;
my $RFC822Head = Sisimai::RFC5322->HEADERFIELDS;
my $FieldTable = qr/(
apparently-to
|date
|delivered-to
|envelope-from
|envelope-to
|errors-to
|forward-path
|from
|list-id
|message-id
|posted
|posted-date
|reply-to
|resent-date
|resent-to
|reverse-path
|return-path
|subject
|to
|x-envelope-from
|x-envelope-to
|x-postfix-sender
)/imx;
my $Head = << 'EOH';
Return-Path: <[email protected]>
Received: from [192.0.2.25] (p0000-ipbfpfx00kyoto.kyoto.example.co.jp [192.0.2.25])
(authenticated bits=0)
by smtpgw.example.jp (V8/cf) with ESMTP id r9G5FXh9018568
for <[email protected]>; Wed, 16 Oct 2013 14:15:34 +0900
From: "Kijitora Cat" <[email protected]>
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: base64
Subject: =?utf-8?B?44OQ44Km44Oz44K544Oh44O844Or44Gu44OG44K544OIKOaXpQ==?=
=?utf-8?B?5pys6KqeKQ==?=
Date: Wed, 16 Oct 2013 14:15:35 +0900
Message-Id: <[email protected]>
To: [email protected]
Mime-Version: 1.0 (Apple Message framework v1283)
X-Mailer: Apple Mail (2.1283)
EOH
sub takeapart {
# Take each email header in the original message apart
# @param [String] heads The original message header
# @return [Hash] Structured message headers
my $heads = shift || return {};
state $borderline = '__MIME_ENCODED_BOUNDARY__';
$$heads =~ s/^[>]+[ ]//mg; # Remove '>' indent symbol of forwarded message
$$heads =~ s/=[ ]+=/=\n =/mg; # Replace ' ' with "\n" at unfolded values
my $previousfn = '';
my $asciiarmor = {}; # Header names which has MIME encoded value
my $headerpart = {}; # Required headers in the original message part
for my $e ( split("\n", $$heads) ) {
# Header name as a key, The value of header as a value
if( $e =~ /\A[ \t]+/ ) {
# Continued (foled) header value from the previous line
next unless $previousfn;
# Concatenate the line if it is the value of required header
if( Sisimai::MIME->is_mimeencoded(\$e) ) {
# The line is MIME-Encoded test
if( $previousfn eq 'subject' ) {
# Subject: header
$headerpart->{ $previousfn } .= $borderline.$e;
} else {
# Is not Subject header
$headerpart->{ $previousfn } .= $e;
}
$asciiarmor->{ $previousfn } = 1;
} else {
# ASCII Characters only: Not MIME-Encoded
$e =~ s/\A[ \t]+//; # unfolding
$headerpart->{ $previousfn } .= $e;
$asciiarmor->{ $previousfn } //= 0;
}
} else {
# Header name as a key, The value of header as a value
my($lhs, $rhs) = split(/:[ ]*/, $e, 2);
next unless $lhs = lc($lhs || '');
$previousfn = '';
next unless exists $RFC822Head->{ $lhs };
$previousfn = $lhs;
$headerpart->{ $previousfn } //= $rhs;
}
}
return $headerpart unless $headerpart->{'subject'};
# Convert MIME-Encoded subject
if( Sisimai::String->is_8bit(\$headerpart->{'subject'}) ) {
# The value of ``Subject'' header is including multibyte character,
# is not MIME-Encoded text.
eval {
# Remove invalid byte sequence
Encode::decode_utf8($headerpart->{'subject'});
Encode::encode_utf8($headerpart->{'subject'});
};
$headerpart->{'subject'} = 'MULTIBYTE CHARACTERS HAVE BEEN REMOVED' if $@;
} else {
# MIME-Encoded subject field or ASCII characters only
my $r = [];
if( $asciiarmor->{'subject'} ) {
# split the value of Subject by $borderline
for my $v ( split($borderline, $headerpart->{'subject'}) ) {
# Insert value to the array if the string is MIME encoded text
push @$r, $v if Sisimai::MIME->is_mimeencoded(\$v);
}
} else {
# Subject line is not MIME encoded
$r = [$headerpart->{'subject'}];
}
$headerpart->{'subject'} = Sisimai::MIME->mimedecode($r);
}
return $headerpart;
}
sub useregex1 {
my $heads = shift || return {};
my $table = {};
$$heads =~ s/^[>]+[ ]//mg; # Remove '>' indent symbol of forwarded message
$$heads =~ s/=[ ]+=/=\n =/mg; # Replace ' ' with "\n" at unfolded values
my %q = $$heads =~ /^([\w-]+):[ ]*(.*?)\n(?![\s\t])/gms;
map { $table->{ lc $_ } = $q{ $_ } } keys %q;
map { $_ =~ s/\n\s+/ /; $_ =~ y/\t/ /s } values %$table;
return $table unless $table->{'subject'};
# Convert MIME-Encoded subject
if( Sisimai::String->is_8bit(\$table->{'subject'}) ) {
# The value of ``Subject'' header is including multibyte character,
# is not MIME-Encoded text.
eval {
# Remove invalid byte sequence
Encode::decode_utf8($table->{'subject'});
Encode::encode_utf8($table->{'subject'});
};
$table->{'subject'} = 'MULTIBYTE CHARACTERS HAVE BEEN REMOVED' if $@;
} else {
# MIME-Encoded subject field or ASCII characters only
my $r = [];
if( Sisimai::MIME->is_mimeencoded(\$table->{'subject'}) ) {
# split the value of Subject by $borderline
for my $v ( split(/ /, $table->{'subject'}) ) {
# Insert value to the array if the string is MIME encoded text
push @$r, $v if Sisimai::MIME->is_mimeencoded(\$v);
}
} else {
# Subject line is not MIME encoded
$r = [$table->{'subject'}];
}
$table->{'subject'} = Sisimai::MIME->mimedecode($r);
}
return $table;
}
sub useregex2 {
my $heads = shift || return {};
my $table = {};
$$heads =~ s/^[>]+[ ]//mg; # Remove '>' indent symbol of forwarded message
$$heads =~ s/=[ ]+=/=\n =/mg; # Replace ' ' with "\n" at unfolded values
my %q = $$heads =~ /^$FieldTable:[ ]*(.*?)\n(?![\s\t])/gms;
map { $table->{ lc $_ } = $q{ $_ } } keys %q;
map { $_ =~ s/\n\s+/ /; $_ =~ y/\t/ /s } values %$table;
return $table unless $table->{'subject'};
# Convert MIME-Encoded subject
if( Sisimai::String->is_8bit(\$table->{'subject'}) ) {
# The value of ``Subject'' header is including multibyte character,
# is not MIME-Encoded text.
eval {
# Remove invalid byte sequence
Encode::decode_utf8($table->{'subject'});
Encode::encode_utf8($table->{'subject'});
};
$table->{'subject'} = 'MULTIBYTE CHARACTERS HAVE BEEN REMOVED' if $@;
} else {
# MIME-Encoded subject field or ASCII characters only
my $r = [];
if( Sisimai::MIME->is_mimeencoded(\$table->{'subject'}) ) {
# split the value of Subject by $borderline
for my $v ( split(/ /, $table->{'subject'}) ) {
# Insert value to the array if the string is MIME encoded text
push @$r, $v if Sisimai::MIME->is_mimeencoded(\$v);
}
} else {
# Subject line is not MIME encoded
$r = [$table->{'subject'}];
}
$table->{'subject'} = Sisimai::MIME->mimedecode($r);
}
return $table;
}
sub useregex3 {
my $heads = shift || return {};
my $table = {};
$$heads =~ s/^[>]+[ ]//mg; # Remove '>' indent symbol of forwarded message
$$heads =~ s/=[ ]+=/=\n =/mg; # Replace ' ' with "\n" at unfolded values
my %q = $$heads =~ /^$FieldTable:[ ]*(.*?)\n(?![\s\t])/cgms;
map { $table->{ lc $_ } = $q{ $_ } } keys %q;
map { $_ =~ s/\n\s+/ /; $_ =~ y/\t/ /s } values %$table;
return $table unless $table->{'subject'};
# Convert MIME-Encoded subject
if( Sisimai::String->is_8bit(\$table->{'subject'}) ) {
# The value of ``Subject'' header is including multibyte character,
# is not MIME-Encoded text.
eval {
# Remove invalid byte sequence
Encode::decode_utf8($table->{'subject'});
Encode::encode_utf8($table->{'subject'});
};
$table->{'subject'} = 'MULTIBYTE CHARACTERS HAVE BEEN REMOVED' if $@;
} else {
# MIME-Encoded subject field or ASCII characters only
my $r = [];
if( Sisimai::MIME->is_mimeencoded(\$table->{'subject'}) ) {
# split the value of Subject by $borderline
for my $v ( split(/ /, $table->{'subject'}) ) {
# Insert value to the array if the string is MIME encoded text
push @$r, $v if Sisimai::MIME->is_mimeencoded(\$v);
}
} else {
# Subject line is not MIME encoded
$r = [$table->{'subject'}];
}
$table->{'subject'} = Sisimai::MIME->mimedecode($r);
}
return $table;
}
my $p = [takeapart(\$Head), useregex1(\$Head), useregex2(\$Head), useregex3(\$Head)];
for my $e ( @$p ) {
isa_ok $e, 'HASH';
for my $ee ( qw|from subject date to return-path message-id| ) {
ok length $e->{ $ee };
}
like $e->{'from'}, qr/kijitora/;
like $e->{'subject'}, qr/繝舌え繝ウ繧ケ/;
like $e->{'date'}, qr/Wed, 16 Oct/;
like $e->{'to'}, qr/bouncehammer[.]jp/;
like $e->{'return-path'}, qr/example[.]org/;
like $e->{'message-id'}, qr/BC36/;
}
printf("Running with Perl %s on %s\n%s\n", $^V, $^O, '-' x 80);
cmpthese(6e5, {
'takeapart' => sub { takeapart(\$Head) },
'useregex1' => sub { useregex1(\$Head) },
'useregex2' => sub { useregex2(\$Head) },
'useregex3' => sub { useregex3(\$Head) },
}
);
__END__
Running with Perl v5.30.0 on darwin
--------------------------------------------------------------------------------
Rate useregex1 takeapart useregex2 useregex3
useregex1 19756/s -- -2% -18% -94%
takeapart 20080/s 2% -- -17% -94%
useregex2 24145/s 22% 20% -- -93%
useregex3 355030/s 1697% 1668% 1370% --
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment