Skip to content

Instantly share code, notes, and snippets.

@azumakuniyuki
Last active January 21, 2020 12:08
Show Gist options
  • Save azumakuniyuki/cf9a4f0340ee5548629482e4b0711ec8 to your computer and use it in GitHub Desktop.
Save azumakuniyuki/cf9a4f0340ee5548629482e4b0711ec8 to your computer and use it in GitHub Desktop.
split-and-loop-vs-regexp-and-make-hash.rb
# Sisimai::Message improvement
require 'benchmark'
require 'sisimai/string'
require 'sisimai/rfc5322'
require 'sisimai/mime'
# Number of times each implementation is called inside one benchmark report.
HowMany = 30000
# Lookup table obtained from Sisimai::RFC5322.HEADERFIELDS. takeapart indexes
# it with a lower-cased field name and keeps the header only when the lookup
# is truthy.
RFC822Head = Sisimai::RFC5322.HEADERFIELDS
# Marker that takeapart puts in front of every folded, MIME-encoded Subject
# line so the pieces can be split apart again before decoding.
BorderLine = '__MIME_ENCODED_BOUNDARY__'
# Field names collected by useregex2 and useregex3, which interpolate this
# pattern into their scan regexp. The single capture group yields the field
# name. Flags: case-insensitive (i), multiline (m), extended (x, so the line
# breaks inside the literal are not part of the pattern).
FieldTable = %r/(
apparently-to
|date
|delivered-to
|envelope-from
|envelope-to
|errors-to
|forward-path
|from
|list-id
|message-id
|posted
|posted-date
|reply-to
|resent-date
|resent-to
|reverse-path
|return-path
|subject
|to
|x-envelope-from
|x-envelope-to
|x-postfix-sender
)/imx
# Sample header block fed to every implementation.
#
# Continuation lines of a folded header must begin with whitespace (RFC 5322
# folding); takeapart detects them with start_with?(' ', "\t") and the regexp
# variants with their (?![\s\t]) lookahead. The Received: continuation lines
# and the second encoded-word of Subject: therefore carry a leading space.
#
# NOTE(review): the addresses below read "[email protected]", which the
# smoke checks for bouncehammer.jp, example.org and BC36 further down cannot
# match -- confirm the fixture against the original sample message.
Head = <<'EOH'
Return-Path: <[email protected]>
Received: from [192.0.2.25] (p0000-ipbfpfx00kyoto.kyoto.example.co.jp [192.0.2.25])
 (authenticated bits=0)
 by smtpgw.example.jp (V8/cf) with ESMTP id r9G5FXh9018568
 for <[email protected]>; Wed, 16 Oct 2013 14:15:34 +0900
From: "Kijitora Cat" <[email protected]>
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: base64
Subject: =?utf-8?B?44OQ44Km44Oz44K544Oh44O844Or44Gu44OG44K544OIKOaXpQ==?=
 =?utf-8?B?5pys6KqeKQ==?=
Date: Wed, 16 Oct 2013 14:15:35 +0900
Message-Id: <[email protected]>
To: [email protected]
Mime-Version: 1.0 (Apple Message framework v1283)
X-Mailer: Apple Mail (2.1283)
EOH
# Parse a raw header block with the split-and-loop strategy.
#
# Only fields whose lower-cased name is found in RFC822Head are kept; the
# first occurrence of a field wins. Lines without a colon are ignored.
#
# @param heads [String, nil] raw header text; folded continuation lines start
#   with a space or a tab
# @return [Hash] lower-cased field name => value ({} when heads is nil). When
#   a Subject is present it is replaced by the result of
#   Sisimai::MIME.mimedecode, unless Sisimai::String.is_8bit reports it as
#   8-bit text.
def takeapart(heads)
  return {} unless heads

  # 1. Scrub to avoid "invalid byte sequence in UTF-8" exception (#82)
  # 2. Drop leading "> " quote markers and move an encoded-word that follows
  #    another one on the same line ("?= =?") onto its own folded line
  heads = heads.scrub('?').gsub(/^[>]+[ ]/m, '').gsub(/=[ ]+=/, "=\n =")

  previousfn = '' # Field name the following folded lines belong to
  asciiarmor = {} # Field name => true once a folded line was MIME-encoded
  headerpart = {} # Collected header fields

  heads.split("\n").each do |e|
    if e.start_with?(' ', "\t")
      # Continued (folded) header value from the previous line
      next if previousfn.empty?

      if Sisimai::MIME.is_mimeencoded(e)
        # Folded Subject pieces are prefixed with BorderLine so that they can
        # be separated again before decoding; other fields are appended as is
        headerpart[previousfn] << (previousfn == 'subject' ? BorderLine + e : e)
        asciiarmor[previousfn] = true
      else
        # Not MIME-encoded: append without the folding whitespace
        headerpart[previousfn] << e.lstrip
        asciiarmor[previousfn] ||= false
      end
    else
      # Header name as a key, the value of the header as a value
      lhs, rhs = e.split(/:[ ]*/, 2)
      next unless lhs # empty line: keep the current field for folded lines

      previousfn = ''
      # A line without a colon has no value; storing nil here would make a
      # following folded line fail on `nil << ...`, so skip it entirely
      next unless rhs

      lhs = lhs.downcase
      next unless RFC822Head[lhs]

      previousfn = lhs
      headerpart[previousfn] ||= rhs
    end
  end
  return headerpart unless headerpart['subject']

  if Sisimai::String.is_8bit(headerpart['subject'])
    # The Subject is treated as raw multibyte text, not MIME-encoded text:
    # only invalid byte sequences are replaced
    headerpart['subject'].scrub!('?')
  else
    # MIME-encoded subject field or ASCII characters only
    pieces = if asciiarmor['subject']
               # Split at the markers inserted above and keep the encoded parts
               headerpart['subject'].split(BorderLine).select { |v| Sisimai::MIME.is_mimeencoded(v) }
             else
               # No folded MIME-encoded line was seen: decode the value as one piece
               [headerpart['subject']]
             end
    headerpart['subject'] = Sisimai::MIME.mimedecode(pieces)
  end
  headerpart
end
# Parse a raw header block with a single generic regexp scan.
#
# Every "Name: value" field is collected, whatever its name; a later
# occurrence of a field overwrites an earlier one.
#
# @param heads [String, nil] raw header text
# @return [Hash] lower-cased field name => value ({} when heads is nil). A
#   non-empty Subject is replaced by the result of Sisimai::MIME.mimedecode,
#   unless Sisimai::String.is_8bit reports it as 8-bit text.
def useregex1(heads)
  return {} unless heads

  # 1. Scrub to avoid "invalid byte sequence in UTF-8" exception (#82)
  # 2. Drop leading "> " quote markers and move an encoded-word that follows
  #    another one on the same line ("?= =?") onto its own folded line
  heads = heads.scrub('?').gsub(/^[>]+[ ]/m, '').gsub(/=[ ]+=/, "=\n =")
  table = {}

  # A field ends at the first newline that is not followed by whitespace, so
  # folded continuation lines stay inside the captured value
  heads.scan(/^([\w-]+):[ ]*(.*?)\n(?![\s\t])/m) { |name, value| table[name.downcase] = value }

  # No Subject header at all (nil) or an empty one: nothing to decode
  return table if table['subject'].to_s.empty?

  if Sisimai::String.is_8bit(table['subject'])
    # The Subject is treated as raw multibyte text, not MIME-encoded text:
    # only invalid byte sequences are replaced
    table['subject'].scrub!('?')
  else
    # MIME-encoded subject field or ASCII characters only
    pieces = if Sisimai::MIME.is_mimeencoded(table['subject'])
               # Split the value at single spaces and keep the encoded parts
               table['subject'].split(/ /).select { |v| Sisimai::MIME.is_mimeencoded(v) }
             else
               # Subject line is not MIME encoded
               [table['subject']]
             end
    table['subject'] = Sisimai::MIME.mimedecode(pieces)
  end
  table
end
# Parse a raw header block with a regexp scan limited to the field names in
# FieldTable. The pattern is rebuilt from FieldTable on every call.
#
# A later occurrence of a field overwrites an earlier one.
#
# @param heads [String, nil] raw header text
# @return [Hash] lower-cased field name => value ({} when heads is nil). A
#   non-empty Subject is replaced by the result of Sisimai::MIME.mimedecode,
#   unless Sisimai::String.is_8bit reports it as 8-bit text.
def useregex2(heads)
  return {} unless heads

  # 1. Scrub to avoid "invalid byte sequence in UTF-8" exception (#82)
  # 2. Drop leading "> " quote markers and move an encoded-word that follows
  #    another one on the same line ("?= =?") onto its own folded line
  heads = heads.scrub('?').gsub(/^[>]+[ ]/m, '').gsub(/=[ ]+=/, "=\n =")
  table = {}

  # FieldTable's capture group gives the name; a field ends at the first
  # newline that is not followed by whitespace, so folded continuation lines
  # stay inside the captured value
  heads.scan(/^#{FieldTable}:[ ]*(.*?)\n(?![\s\t])/m) { |name, value| table[name.downcase] = value }

  # No Subject header at all (nil) or an empty one: nothing to decode
  return table if table['subject'].to_s.empty?

  if Sisimai::String.is_8bit(table['subject'])
    # The Subject is treated as raw multibyte text, not MIME-encoded text:
    # only invalid byte sequences are replaced
    table['subject'].scrub!('?')
  else
    # MIME-encoded subject field or ASCII characters only
    pieces = if Sisimai::MIME.is_mimeencoded(table['subject'])
               # Split the value at single spaces and keep the encoded parts
               table['subject'].split(/ /).select { |v| Sisimai::MIME.is_mimeencoded(v) }
             else
               # Subject line is not MIME encoded
               [table['subject']]
             end
    table['subject'] = Sisimai::MIME.mimedecode(pieces)
  end
  table
end
# Parse a raw header block with a regexp scan limited to the field names in
# FieldTable. Same as useregex2 except for the /o flag, which makes Ruby
# interpolate FieldTable into the pattern only once.
#
# A later occurrence of a field overwrites an earlier one.
#
# @param heads [String, nil] raw header text
# @return [Hash] lower-cased field name => value ({} when heads is nil). A
#   non-empty Subject is replaced by the result of Sisimai::MIME.mimedecode,
#   unless Sisimai::String.is_8bit reports it as 8-bit text.
def useregex3(heads)
  return {} unless heads

  # 1. Scrub to avoid "invalid byte sequence in UTF-8" exception (#82)
  # 2. Drop leading "> " quote markers and move an encoded-word that follows
  #    another one on the same line ("?= =?") onto its own folded line
  heads = heads.scrub('?').gsub(/^[>]+[ ]/m, '').gsub(/=[ ]+=/, "=\n =")
  table = {}

  # FieldTable's capture group gives the name; a field ends at the first
  # newline that is not followed by whitespace, so folded continuation lines
  # stay inside the captured value
  heads.scan(/^#{FieldTable}:[ ]*(.*?)\n(?![\s\t])/mo) { |name, value| table[name.downcase] = value }

  # No Subject header at all (nil) or an empty one: nothing to decode
  return table if table['subject'].to_s.empty?

  if Sisimai::String.is_8bit(table['subject'])
    # The Subject is treated as raw multibyte text, not MIME-encoded text:
    # only invalid byte sequences are replaced
    table['subject'].scrub!('?')
  else
    # MIME-encoded subject field or ASCII characters only
    pieces = if Sisimai::MIME.is_mimeencoded(table['subject'])
               # Split the value at single spaces and keep the encoded parts
               table['subject'].split(/ /).select { |v| Sisimai::MIME.is_mimeencoded(v) }
             else
               # Subject line is not MIME encoded
               [table['subject']]
             end
    table['subject'] = Sisimai::MIME.mimedecode(pieces)
  end
  table
end
# Smoke check: run every implementation once on the sample header and print
# one 'ok' per passing check before the benchmark starts.
results = [takeapart(Head), useregex1(Head), useregex2(Head), useregex3(Head)]
results.each do |e|
  puts 'ok' if e.is_a? Hash
  %w|from subject date to return-path message-id|.each do |ee|
    # to_s keeps a missing field (nil) from raising; it just prints no 'ok'
    puts 'ok' unless e[ee].to_s.empty?
  end
  puts 'ok' if e['from'] =~ /kijitora/
  # The sample Subject decodes to Japanese text that starts with these
  # characters (the first encoded-word begins with their UTF-8 bytes)
  puts 'ok' if e['subject'] =~ /バウンス/
  puts 'ok' if e['date'] =~ /Wed, 16 Oct/
  puts 'ok' if e['to'] =~ /bouncehammer[.]jp/
  puts 'ok' if e['return-path'] =~ /example[.]org/
  puts 'ok' if e['message-id'] =~ /BC36/
end
# Time HowMany calls of each implementation on the sample header; bmbm runs a
# rehearsal pass before the measured pass.
Benchmark.bmbm do |bench|
  bench.report('takeapart') { HowMany.times { takeapart(Head) } }
  bench.report('useregex1') { HowMany.times { useregex1(Head) } }
  bench.report('useregex2') { HowMany.times { useregex2(Head) } }
  bench.report('useregex3') { HowMany.times { useregex3(Head) } }
end
__END__
Rehearsal ---------------------------------------------
takeapart 1.690759 0.004814 1.695573 ( 1.713718)
useregex1 1.632277 0.002730 1.635007 ( 1.641662)
useregex2 3.566479 0.004850 3.571329 ( 3.578703)
useregex3 1.497352 0.002813 1.500165 ( 1.504279)
------------------------------------ total: 8.402074sec
user system total real
takeapart 1.659426 0.002044 1.661470 ( 1.668322)
useregex1 1.686137 0.002063 1.688200 ( 1.691882)
useregex2 3.609357 0.003275 3.612632 ( 3.617397)
useregex3 1.552407 0.002338 1.554745 ( 1.558926)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment