Last active
January 21, 2020 12:08
-
-
Save azumakuniyuki/cf9a4f0340ee5548629482e4b0711ec8 to your computer and use it in GitHub Desktop.
split-and-loop-vs-regexp-and-make-hash.rb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sisimai::Message improvement: benchmark split-and-loop header parsing against regexp-and-make-hash parsing
require 'benchmark' | |
require 'sisimai/string' | |
require 'sisimai/rfc5322' | |
require 'sisimai/mime' | |
# Number of iterations each benchmark report runs.
HowMany = 30_000

# Table of header field names that takeapart keeps; it is indexed with the
# lower-cased field name.
RFC822Head = Sisimai::RFC5322.HEADERFIELDS

# Marker inserted by takeapart in front of each MIME-encoded continuation line
# of the Subject header, so the pieces can be split apart again before decoding.
BorderLine = '__MIME_ENCODED_BOUNDARY__'
# Alternation of the header field names that useregex2/useregex3 extract.
#
# The single capturing group is what String#scan hands back as the field name,
# so it must stay a capturing group. With the /x flag whitespace inside the
# pattern is ignored, which means every line must contain exactly one
# alternative: a stray "|" would add an empty alternative and let the group
# match an empty field name.
FieldTable = %r/(
    apparently-to
    |date
    |delivered-to
    |envelope-from
    |envelope-to
    |errors-to
    |forward-path
    |from
    |list-id
    |message-id
    |posted
    |posted-date
    |reply-to
    |resent-date
    |resent-to
    |reverse-path
    |return-path
    |subject
    |to
    |x-envelope-from
    |x-envelope-to
    |x-postfix-sender
    )/imx
# Sample header block fed to every parser in this benchmark.
#
# Folded fields (Received:, Subject:) keep their continuation lines indented:
# the parsers recognise a continuation only by its leading space or tab, and
# the Subject pieces are separated by a single space.
#
# NOTE(review): the addresses read "[email protected]" in this copy, which looks
# like redaction applied by the hosting page rather than the original fixture.
# Restore the real addresses before relying on the address-based sanity checks.
Head = <<'EOH'
Return-Path: <[email protected]>
Received: from [192.0.2.25] (p0000-ipbfpfx00kyoto.kyoto.example.co.jp [192.0.2.25])
        (authenticated bits=0)
        by smtpgw.example.jp (V8/cf) with ESMTP id r9G5FXh9018568
        for <[email protected]>; Wed, 16 Oct 2013 14:15:34 +0900
From: "Kijitora Cat" <[email protected]>
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: base64
Subject: =?utf-8?B?44OQ44Km44Oz44K544Oh44O844Or44Gu44OG44K544OIKOaXpQ==?=
 =?utf-8?B?5pys6KqeKQ==?=
Date: Wed, 16 Oct 2013 14:15:35 +0900
Message-Id: <[email protected]>
To: [email protected]
Mime-Version: 1.0 (Apple Message framework v1283)
X-Mailer: Apple Mail (2.1283)
EOH
# Parse a raw header block line by line (split-and-loop strategy).
#
# Only fields listed in RFC822Head are kept; the first occurrence of a field
# wins. Continuation lines (starting with a space or tab) are appended to the
# most recently accepted field. The Subject value is MIME-decoded at the end.
#
# @param heads [String, nil] raw header text
# @return [Hash] lower-cased field name => value; {} when heads is nil/false
def takeapart(heads)
  return {} unless heads

  # 1. Scrub to avoid "invalid byte sequence in UTF-8" exception (#82)
  # 2. Drop leading ">" quote markers and rewrite "=<spaces>=" as a folded
  #    line ("=\n =") so each piece sits on its own continuation line
  heads = heads.scrub('?').gsub(/^[>]+[ ]/m, '').gsub(/=[ ]+=/, "=\n =")

  previousfn = '' # Previous (accepted) field name
  asciiarmor = {} # Field names whose value has MIME-encoded continuation lines
  headerpart = {} # Required headers found in the message

  heads.split("\n").each do |e|
    if e.start_with?(' ', "\t")
      # Continued (folded) header value from the previous line
      next if previousfn.empty?

      if Sisimai::MIME.is_mimeencoded(e)
        # MIME-encoded piece: Subject pieces are prefixed with BorderLine so
        # they can be split apart again before decoding
        headerpart[previousfn] << (previousfn == 'subject' ? BorderLine + e : e)
        asciiarmor[previousfn] = true
      else
        # ASCII characters only: not MIME-encoded
        headerpart[previousfn] << e.lstrip
        asciiarmor[previousfn] ||= false
      end
    else
      # Header name as a key, the value of the header as a value
      lhs, rhs = e.split(/:[ ]*/, 2)
      next unless lhs

      lhs.downcase!
      previousfn = ''
      # A line without ":" has no value; storing nil here would make a
      # following continuation line fail on `nil << ...`, so skip it.
      next unless rhs
      next unless RFC822Head[lhs]

      previousfn = lhs
      headerpart[previousfn] = rhs unless headerpart[previousfn]
    end
  end

  subject = headerpart['subject']
  return headerpart unless subject

  # Convert MIME-encoded subject
  if Sisimai::String.is_8bit(subject)
    # The Subject value includes multibyte characters and is not
    # MIME-encoded text: only scrub it (in place).
    subject.scrub!('?')
  else
    # MIME-encoded subject field or ASCII characters only
    r = if asciiarmor['subject']
          # Split by the border line and keep only the MIME-encoded pieces
          subject.split(BorderLine).select { |v| Sisimai::MIME.is_mimeencoded(v) }
        else
          # No MIME-encoded continuation line was seen for Subject
          [subject]
        end
    headerpart['subject'] = Sisimai::MIME.mimedecode(r)
  end
  headerpart
end
# Parse a raw header block with one generic regexp scan (any field name).
#
# Every "Name: value" field is stored; a repeated field keeps its last
# occurrence. The Subject value, when present and non-empty, is MIME-decoded.
#
# @param heads [String, nil] raw header text
# @return [Hash] lower-cased field name => value; {} when heads is nil/false
def useregex1(heads)
  return {} unless heads

  # 1. Scrub to avoid "invalid byte sequence in UTF-8" exception (#82)
  # 2. Drop leading ">" quote markers and rewrite "=<spaces>=" as "=\n ="
  heads = heads.scrub('?').gsub(/^[>]+[ ]/m, '').gsub(/=[ ]+=/, "=\n =")

  table = {}
  # With /m the lazy (.*?) may span newlines; the negative lookahead ends a
  # value at the first newline that is not followed by whitespace, so folded
  # lines stay part of the value.
  heads.scan(/^([\w-]+):[ ]*(.*?)\n(?![\s\t])/m) { |e| table[e[0].downcase] = e[1] }

  # No Subject header at all (nil) or an empty one: nothing to decode.
  subject = table['subject']
  return table if subject.nil? || subject.empty?

  # Convert MIME-encoded subject
  if Sisimai::String.is_8bit(subject)
    # The Subject value includes multibyte characters and is not
    # MIME-encoded text: only scrub it (in place).
    subject.scrub!('?')
  else
    # MIME-encoded subject field or ASCII characters only
    r = if Sisimai::MIME.is_mimeencoded(subject)
          # Split on single spaces and keep only the MIME-encoded pieces
          subject.split(/ /).select { |v| Sisimai::MIME.is_mimeencoded(v) }
        else
          # Subject line is not MIME-encoded
          [subject]
        end
    table['subject'] = Sisimai::MIME.mimedecode(r)
  end
  table
end
# Parse a raw header block with a regexp restricted to the FieldTable names.
#
# The pattern interpolates FieldTable without the /o flag, so the
# interpolation is evaluated again on every call. A repeated field keeps its
# last occurrence. The Subject value, when present and non-empty, is
# MIME-decoded.
#
# @param heads [String, nil] raw header text
# @return [Hash] lower-cased field name => value; {} when heads is nil/false
def useregex2(heads)
  return {} unless heads

  # 1. Scrub to avoid "invalid byte sequence in UTF-8" exception (#82)
  # 2. Drop leading ">" quote markers and rewrite "=<spaces>=" as "=\n ="
  heads = heads.scrub('?').gsub(/^[>]+[ ]/m, '').gsub(/=[ ]+=/, "=\n =")

  table = {}
  # Capture 1 is FieldTable's own group (the field name), capture 2 the value;
  # the negative lookahead keeps folded lines inside the value.
  heads.scan(/^#{FieldTable}:[ ]*(.*?)\n(?![\s\t])/m) { |e| table[e[0].downcase] = e[1] }

  # No Subject header at all (nil) or an empty one: nothing to decode.
  subject = table['subject']
  return table if subject.nil? || subject.empty?

  # Convert MIME-encoded subject
  if Sisimai::String.is_8bit(subject)
    # The Subject value includes multibyte characters and is not
    # MIME-encoded text: only scrub it (in place).
    subject.scrub!('?')
  else
    # MIME-encoded subject field or ASCII characters only
    r = if Sisimai::MIME.is_mimeencoded(subject)
          # Split on single spaces and keep only the MIME-encoded pieces
          subject.split(/ /).select { |v| Sisimai::MIME.is_mimeencoded(v) }
        else
          # Subject line is not MIME-encoded
          [subject]
        end
    table['subject'] = Sisimai::MIME.mimedecode(r)
  end
  table
end
# Parse a raw header block with a regexp restricted to the FieldTable names,
# interpolated only once.
#
# Same as the non-/o variant except for the /o flag, which makes Ruby perform
# the FieldTable interpolation a single time and reuse the resulting pattern.
# A repeated field keeps its last occurrence. The Subject value, when present
# and non-empty, is MIME-decoded.
#
# @param heads [String, nil] raw header text
# @return [Hash] lower-cased field name => value; {} when heads is nil/false
def useregex3(heads)
  return {} unless heads

  # 1. Scrub to avoid "invalid byte sequence in UTF-8" exception (#82)
  # 2. Drop leading ">" quote markers and rewrite "=<spaces>=" as "=\n ="
  heads = heads.scrub('?').gsub(/^[>]+[ ]/m, '').gsub(/=[ ]+=/, "=\n =")

  table = {}
  # Capture 1 is FieldTable's own group (the field name), capture 2 the value;
  # the negative lookahead keeps folded lines inside the value.
  heads.scan(/^#{FieldTable}:[ ]*(.*?)\n(?![\s\t])/mo) { |e| table[e[0].downcase] = e[1] }

  # No Subject header at all (nil) or an empty one: nothing to decode.
  subject = table['subject']
  return table if subject.nil? || subject.empty?

  # Convert MIME-encoded subject
  if Sisimai::String.is_8bit(subject)
    # The Subject value includes multibyte characters and is not
    # MIME-encoded text: only scrub it (in place).
    subject.scrub!('?')
  else
    # MIME-encoded subject field or ASCII characters only
    r = if Sisimai::MIME.is_mimeencoded(subject)
          # Split on single spaces and keep only the MIME-encoded pieces
          subject.split(/ /).select { |v| Sisimai::MIME.is_mimeencoded(v) }
        else
          # Subject line is not MIME-encoded
          [subject]
        end
    table['subject'] = Sisimai::MIME.mimedecode(r)
  end
  table
end
# Sanity check: run every parser once on the fixture and print one "ok" per
# passing expectation, so the implementations can be compared by eye before
# benchmarking. (The list is not named `p`, to avoid shadowing Kernel#p.)
results = [takeapart(Head), useregex1(Head), useregex2(Head), useregex3(Head)]
results.each do |e|
  puts 'ok' if e.is_a? Hash
  %w[from subject date to return-path message-id].each do |ee|
    # to_s keeps a missing field (nil) from raising; it simply prints no "ok"
    puts 'ok' unless e[ee].to_s.empty?
  end
  puts 'ok' if e['from'] =~ /kijitora/
  # The Base64 Subject in the fixture begins with the UTF-8 bytes of this
  # katakana word ("bounce")
  puts 'ok' if e['subject'] =~ /バウンス/
  puts 'ok' if e['date'] =~ /Wed, 16 Oct/
  puts 'ok' if e['to'] =~ /bouncehammer[.]jp/
  puts 'ok' if e['return-path'] =~ /example[.]org/
  puts 'ok' if e['message-id'] =~ /BC36/
end
# Time HowMany runs of each parser on the same fixture. bmbm performs a
# rehearsal pass before the measured pass.
Benchmark.bmbm do |bench|
  bench.report('takeapart') { HowMany.times { takeapart(Head) } }
  bench.report('useregex1') { HowMany.times { useregex1(Head) } }
  bench.report('useregex2') { HowMany.times { useregex2(Head) } }
  bench.report('useregex3') { HowMany.times { useregex3(Head) } }
end
__END__ | |
Rehearsal --------------------------------------------- | |
takeapart 1.690759 0.004814 1.695573 ( 1.713718) | |
useregex1 1.632277 0.002730 1.635007 ( 1.641662) | |
useregex2 3.566479 0.004850 3.571329 ( 3.578703) | |
useregex3 1.497352 0.002813 1.500165 ( 1.504279) | |
------------------------------------ total: 8.402074sec | |
user system total real | |
takeapart 1.659426 0.002044 1.661470 ( 1.668322) | |
useregex1 1.686137 0.002063 1.688200 ( 1.691882) | |
useregex2 3.609357 0.003275 3.612632 ( 3.617397) | |
useregex3 1.552407 0.002338 1.554745 ( 1.558926) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment