azumakuniyuki · January 21, 2020 12:08
diff --git a/split-and-loop-vs-regexp-and-make-hash.rb b/split-and-loop-vs-regexp-and-make-hash.rb
 # Sisimai::Message improvement
 require 'benchmark'
 require 'sisimai/string'
 require 'sisimai/rfc5322'
 require 'sisimai/mime'

 # Number of times each candidate parser is invoked per benchmark report.
 HowMany = 30000
 # Lookup indexed by lower-cased header name; takeapart keeps a field only
 # when its name has a truthy entry here.
 RFC822Head = Sisimai::RFC5322.HEADERFIELDS
 # Marker takeapart puts in front of MIME-encoded continuation lines of the
 # Subject field so the pieces can be split apart again before decoding.
 BorderLine = '__MIME_ENCODED_BOUNDARY__'
 # Alternation of header field names, interpolated into the scan patterns of
 # useregex2 and useregex3. Its single capture group holds the matched name.
 FieldTable = %r/(
     apparently-to
    |date
    |delivered-to
    |envelope-from
    |envelope-to
    |errors-to
    |forward-path
    |from
    |list-id
    |message-id
    |posted
    |posted-date
    |reply-to
    |resent-date
    |resent-to
    |reverse-path
    |return-path
    |subject
    |to
    |x-envelope-from
    |x-envelope-to
    |x-postfix-sender
 )/imx

 # Sample header block fed to every candidate parser and to the benchmark.
 # NOTE(review): the addresses read '[email protected]', which looks like
 # e-mail obfuscation from the page this was copied from; the sanity checks
 # further down match on address fragments, so confirm the original values.
 Head = <<'EOH'
 Return-Path: <[email protected]>
 Received: from [192.0.2.25] (p0000-ipbfpfx00kyoto.kyoto.example.co.jp [192.0.2.25])
 	(authenticated bits=0)
 	by smtpgw.example.jp (V8/cf) with ESMTP id r9G5FXh9018568
 	for <[email protected]>; Wed, 16 Oct 2013 14:15:34 +0900
 From: "Kijitora Cat" <[email protected]>
 Content-Type: text/plain; charset=utf-8
 Content-Transfer-Encoding: base64
 Subject: =?utf-8?B?44OQ44Km44Oz44K544Oh44O844Or44Gu44OG44K544OIKOaXpQ==?=
 =?utf-8?B?5pys6KqeKQ==?=
 Date: Wed, 16 Oct 2013 14:15:35 +0900
 Message-Id: <[email protected]>
 To: [email protected]
 Mime-Version: 1.0 (Apple Message framework v1283)
 X-Mailer: Apple Mail (2.1283)
 EOH

 # Split a raw header block into a hash by walking it line by line.
 #
 # Only fields whose lower-cased name has an entry in RFC822Head are kept.
 # The first occurrence of a field supplies its value; continuation lines
 # (starting with a space or tab) are appended to the stored value. The
 # Subject value is finally handed to Sisimai::MIME.mimedecode unless
 # Sisimai::String.is_8bit reports true for it.
 #
 # @param heads [String, nil] raw header text
 # @return [Hash] lower-cased field name => value; {} when heads is nil
 def takeapart(heads)
   return {} unless heads

   # 1. Scrub to avoid "invalid byte sequence in UTF-8" exception (#82)
   # 2. Drop leading "> " quoting and re-fold "= =" into two lines
   cleaned = heads.scrub('?').gsub(/^[>]+[ ]/m, '').gsub(/=[ ]+=/, "=\n =")

   current = '' # Name of the field that continuation lines belong to
   encoded = {} # Field name => true when a MIME-encoded continuation was seen
   fields  = {} # Collected header fields

   cleaned.split("\n").each do |line|
     unless line.start_with?(' ', "\t")
       # "Name: value" line: remember the field only if it is a wanted one
       name, value = line.split(/:[ ]*/, 2)
       next unless name
       name.downcase!
       current = ''

       next unless RFC822Head[name]
       current = name
       fields[current] ||= value
       next
     end

     # Continued (folded) value; ignore it when the owning field was skipped
     next if current.empty?

     if Sisimai::MIME.is_mimeencoded(line)
       # Subject pieces get the marker so they can be split again below
       fields[current] << (current == 'subject' ? BorderLine + line : line)
       encoded[current] = true
     else
       # Plain continuation: append without its leading whitespace
       fields[current] << line.lstrip
       encoded[current] ||= false
     end
   end

   subject = fields['subject']
   return fields unless subject

   if Sisimai::String.is_8bit(subject)
     # Flagged as 8-bit: keep the text as is, only replace invalid bytes
     subject.scrub!('?')
   else
     # MIME-encoded pieces, or the whole value when nothing was encoded
     pieces = if encoded['subject']
                subject.split(BorderLine).select { |v| Sisimai::MIME.is_mimeencoded(v) }
              else
                [subject]
              end
     fields['subject'] = Sisimai::MIME.mimedecode(pieces)
   end
   fields
 end

 # Parse a raw header block into a hash with one generic regular expression.
 #
 # Every "Name: value" field is captured, folded continuation lines included,
 # and stored under its lower-cased name; a later occurrence of a field
 # overwrites an earlier one. The Subject value is then handed to
 # Sisimai::MIME.mimedecode unless Sisimai::String.is_8bit reports true.
 #
 # @param heads [String, nil] raw header text
 # @return [Hash] lower-cased field name => value; {} when heads is nil
 def useregex1(heads)
   return {} unless heads

   # 1. Scrub to avoid "invalid byte sequence in UTF-8" exception (#82)
   # 2. Drop leading "> " quoting and re-fold "= =" into two lines
   text  = heads.scrub('?').gsub(/^[>]+[ ]/m, '').gsub(/=[ ]+=/, "=\n =")
   table = {}

   # A value ends at the first newline that is not followed by whitespace
   text.scan(/^([\w-]+):[ ]*(.*?)\n(?![\s\t])/m) { |name, value| table[name.downcase] = value }

   # Input without a Subject field leaves the entry nil; return early as for
   # an empty Subject instead of calling empty? on nil.
   subject = table['subject']
   return table if subject.nil? || subject.empty?

   if Sisimai::String.is_8bit(subject)
     # Flagged as 8-bit: keep the text as is, only replace invalid bytes
     subject.scrub!('?')
   else
     # MIME-encoded words split on spaces, or the whole value when plain
     pieces = if Sisimai::MIME.is_mimeencoded(subject)
                subject.split(/ /).select { |v| Sisimai::MIME.is_mimeencoded(v) }
              else
                [subject]
              end
     table['subject'] = Sisimai::MIME.mimedecode(pieces)
   end
   table
 end

 # Parse a raw header block into a hash, capturing only FieldTable fields.
 #
 # The scan pattern interpolates FieldTable without the `o` flag, so it is
 # rebuilt on every call. Captured values (folded continuation lines
 # included) are stored under the lower-cased field name; a later occurrence
 # overwrites an earlier one. The Subject value is then handed to
 # Sisimai::MIME.mimedecode unless Sisimai::String.is_8bit reports true.
 #
 # @param heads [String, nil] raw header text
 # @return [Hash] lower-cased field name => value; {} when heads is nil
 def useregex2(heads)
   return {} unless heads

   # 1. Scrub to avoid "invalid byte sequence in UTF-8" exception (#82)
   # 2. Drop leading "> " quoting and re-fold "= =" into two lines
   text  = heads.scrub('?').gsub(/^[>]+[ ]/m, '').gsub(/=[ ]+=/, "=\n =")
   table = {}

   # FieldTable contributes the first capture group (the field name)
   text.scan(/^#{FieldTable}:[ ]*(.*?)\n(?![\s\t])/m) { |name, value| table[name.downcase] = value }

   # Input without a Subject field leaves the entry nil; return early as for
   # an empty Subject instead of calling empty? on nil.
   subject = table['subject']
   return table if subject.nil? || subject.empty?

   if Sisimai::String.is_8bit(subject)
     # Flagged as 8-bit: keep the text as is, only replace invalid bytes
     subject.scrub!('?')
   else
     # MIME-encoded words split on spaces, or the whole value when plain
     pieces = if Sisimai::MIME.is_mimeencoded(subject)
                subject.split(/ /).select { |v| Sisimai::MIME.is_mimeencoded(v) }
              else
                [subject]
              end
     table['subject'] = Sisimai::MIME.mimedecode(pieces)
   end
   table
 end

 # Parse a raw header block into a hash, capturing only FieldTable fields.
 #
 # Same approach as interpolating FieldTable into the scan pattern on each
 # call, except that the `o` flag makes Ruby interpolate only once: the
 # pattern built on the first call is reused afterwards. Captured values are
 # stored under the lower-cased field name; a later occurrence overwrites an
 # earlier one. The Subject value is then handed to Sisimai::MIME.mimedecode
 # unless Sisimai::String.is_8bit reports true.
 #
 # @param heads [String, nil] raw header text
 # @return [Hash] lower-cased field name => value; {} when heads is nil
 def useregex3(heads)
   return {} unless heads

   # 1. Scrub to avoid "invalid byte sequence in UTF-8" exception (#82)
   # 2. Drop leading "> " quoting and re-fold "= =" into two lines
   text  = heads.scrub('?').gsub(/^[>]+[ ]/m, '').gsub(/=[ ]+=/, "=\n =")
   table = {}

   # FieldTable contributes the first capture group (the field name)
   text.scan(/^#{FieldTable}:[ ]*(.*?)\n(?![\s\t])/mo) { |name, value| table[name.downcase] = value }

   # Input without a Subject field leaves the entry nil; return early as for
   # an empty Subject instead of calling empty? on nil.
   subject = table['subject']
   return table if subject.nil? || subject.empty?

   if Sisimai::String.is_8bit(subject)
     # Flagged as 8-bit: keep the text as is, only replace invalid bytes
     subject.scrub!('?')
   else
     # MIME-encoded words split on spaces, or the whole value when plain
     pieces = if Sisimai::MIME.is_mimeencoded(subject)
                subject.split(/ /).select { |v| Sisimai::MIME.is_mimeencoded(v) }
              else
                [subject]
              end
     table['subject'] = Sisimai::MIME.mimedecode(pieces)
   end
   table
 end

 # Sanity-check every candidate against the sample header before timing it.
 # One 'ok' is printed per passing check; a failing check prints nothing.
 results = [takeapart(Head), useregex1(Head), useregex2(Head), useregex3(Head)]
 results.each do |e|
   puts 'ok' if e.is_a? Hash
   %w|from subject date to return-path message-id|.each do |ee|
     # to_s: a missing field is a failed check, not a NoMethodError on nil
     puts 'ok' if e[ee].to_s.size > 0
   end

   puts 'ok' if e['from'] =~ /kijitora/
   # The sample Subject is base64 for UTF-8 text starting with this word;
   # the earlier literal was the same bytes mis-decoded and could not match.
   puts 'ok' if e['subject'] =~ /バウンス/
   puts 'ok' if e['date'] =~ /Wed, 16 Oct/
   puts 'ok' if e['to'] =~ /bouncehammer[.]jp/
   puts 'ok' if e['return-path'] =~ /example[.]org/
   puts 'ok' if e['message-id'] =~ /BC36/
 end

 # Time each candidate over HowMany parses of the sample header.
 Benchmark.bmbm do |bench|
   bench.report('takeapart') { HowMany.times { takeapart(Head) } }
   bench.report('useregex1') { HowMany.times { useregex1(Head) } }
   bench.report('useregex2') { HowMany.times { useregex2(Head) } }
   bench.report('useregex3') { HowMany.times { useregex3(Head) } }
 end

 __END__
 Rehearsal ---------------------------------------------
 takeapart   1.690759   0.004814   1.695573 (  1.713718)
 useregex1   1.632277   0.002730   1.635007 (  1.641662)
 useregex2   3.566479   0.004850   3.571329 (  3.578703)
 useregex3   1.497352   0.002813   1.500165 (  1.504279)
 ------------------------------------ total: 8.402074sec

                user     system      total        real
 takeapart   1.659426   0.002044   1.661470 (  1.668322)
 useregex1   1.686137   0.002063   1.688200 (  1.691882)
 useregex2   3.609357   0.003275   3.612632 (  3.617397)
 useregex3   1.552407   0.002338   1.554745 (  1.558926)
	# Sisimai::Message improvement
	require 'benchmark'
	require 'sisimai/string'
	require 'sisimai/rfc5322'
	require 'sisimai/mime'

	# Number of times each candidate parser is invoked per benchmark report.
	HowMany = 30000
	# Lookup indexed by lower-cased header name; takeapart keeps a field only
	# when its name has a truthy entry here.
	RFC822Head = Sisimai::RFC5322.HEADERFIELDS
	# Marker takeapart puts in front of MIME-encoded continuation lines of the
	# Subject field so the pieces can be split apart again before decoding.
	BorderLine = '__MIME_ENCODED_BOUNDARY__'
	# Alternation of header field names, interpolated into the scan patterns of
	# useregex2 and useregex3. Its single capture group holds the matched name.
	# The alternatives must be joined with a bare `|`: an escaped `\|` is a
	# literal pipe character and would fuse the list into one long literal.
	FieldTable = %r/(
	     apparently-to
	    |date
	    |delivered-to
	    |envelope-from
	    |envelope-to
	    |errors-to
	    |forward-path
	    |from
	    |list-id
	    |message-id
	    |posted
	    |posted-date
	    |reply-to
	    |resent-date
	    |resent-to
	    |reverse-path
	    |return-path
	    |subject
	    |to
	    |x-envelope-from
	    |x-envelope-to
	    |x-postfix-sender
	)/imx

	# Sample header block fed to every candidate parser and to the benchmark.
	# NOTE(review): the addresses read '[email protected]', which looks like
	# e-mail obfuscation from the page this was copied from; the sanity checks
	# further down match on address fragments, so confirm the original values.
	# NOTE(review): the continuation lines of Received and Subject carry no
	# indentation of their own here; folding whitespace may have been lost in
	# copying. Verify against the original sample.
	Head = <<'EOH'
	Return-Path: <[email protected]>
	Received: from [192.0.2.25] (p0000-ipbfpfx00kyoto.kyoto.example.co.jp [192.0.2.25])
	(authenticated bits=0)
	by smtpgw.example.jp (V8/cf) with ESMTP id r9G5FXh9018568
	for <[email protected]>; Wed, 16 Oct 2013 14:15:34 +0900
	From: "Kijitora Cat" <[email protected]>
	Content-Type: text/plain; charset=utf-8
	Content-Transfer-Encoding: base64
	Subject: =?utf-8?B?44OQ44Km44Oz44K544Oh44O844Or44Gu44OG44K544OIKOaXpQ==?=
	=?utf-8?B?5pys6KqeKQ==?=
	Date: Wed, 16 Oct 2013 14:15:35 +0900
	Message-Id: <[email protected]>
	To: [email protected]
	Mime-Version: 1.0 (Apple Message framework v1283)
	X-Mailer: Apple Mail (2.1283)
	EOH

	# Split a raw header block into a hash by walking it line by line.
	#
	# Only fields whose lower-cased name has an entry in RFC822Head are kept.
	# The first occurrence of a field supplies its value; continuation lines
	# (starting with a space or tab) are appended to the stored value. The
	# Subject value is finally handed to Sisimai::MIME.mimedecode unless
	# Sisimai::String.is_8bit reports true for it.
	#
	# @param heads [String, nil] raw header text
	# @return [Hash] lower-cased field name => value; {} when heads is nil
	def takeapart(heads)
	  return {} unless heads

	  # 1. Scrub to avoid "invalid byte sequence in UTF-8" exception (#82)
	  # 2. Drop leading "> " quoting and re-fold "= =" into two lines
	  cleaned = heads.scrub('?').gsub(/^[>]+[ ]/m, '').gsub(/=[ ]+=/, "=\n =")

	  current = '' # Name of the field that continuation lines belong to
	  encoded = {} # Field name => true when a MIME-encoded continuation was seen
	  fields  = {} # Collected header fields

	  cleaned.split("\n").each do |line|
	    unless line.start_with?(' ', "\t")
	      # "Name: value" line: remember the field only if it is a wanted one
	      name, value = line.split(/:[ ]*/, 2)
	      next unless name
	      name.downcase!
	      current = ''

	      next unless RFC822Head[name]
	      current = name
	      fields[current] ||= value
	      next
	    end

	    # Continued (folded) value; ignore it when the owning field was skipped
	    next if current.empty?

	    if Sisimai::MIME.is_mimeencoded(line)
	      # Subject pieces get the marker so they can be split again below
	      fields[current] << (current == 'subject' ? BorderLine + line : line)
	      encoded[current] = true
	    else
	      # Plain continuation: append without its leading whitespace
	      fields[current] << line.lstrip
	      encoded[current] ||= false
	    end
	  end

	  subject = fields['subject']
	  return fields unless subject

	  if Sisimai::String.is_8bit(subject)
	    # Flagged as 8-bit: keep the text as is, only replace invalid bytes
	    subject.scrub!('?')
	  else
	    # MIME-encoded pieces, or the whole value when nothing was encoded
	    pieces = if encoded['subject']
	               subject.split(BorderLine).select { |v| Sisimai::MIME.is_mimeencoded(v) }
	             else
	               [subject]
	             end
	    fields['subject'] = Sisimai::MIME.mimedecode(pieces)
	  end
	  fields
	end

	# Parse a raw header block into a hash with one generic regular expression.
	#
	# Every "Name: value" field is captured, folded continuation lines included,
	# and stored under its lower-cased name; a later occurrence of a field
	# overwrites an earlier one. The Subject value is then handed to
	# Sisimai::MIME.mimedecode unless Sisimai::String.is_8bit reports true.
	#
	# @param heads [String, nil] raw header text
	# @return [Hash] lower-cased field name => value; {} when heads is nil
	def useregex1(heads)
	  return {} unless heads

	  # 1. Scrub to avoid "invalid byte sequence in UTF-8" exception (#82)
	  # 2. Drop leading "> " quoting and re-fold "= =" into two lines
	  text  = heads.scrub('?').gsub(/^[>]+[ ]/m, '').gsub(/=[ ]+=/, "=\n =")
	  table = {}

	  # The value group needs the lazy `.*?` (and `[ ]*` before it): a value
	  # ends at the first newline that is not followed by whitespace.
	  text.scan(/^([\w-]+):[ ]*(.*?)\n(?![\s\t])/m) { |name, value| table[name.downcase] = value }

	  # Input without a Subject field leaves the entry nil; return early as for
	  # an empty Subject instead of calling empty? on nil.
	  subject = table['subject']
	  return table if subject.nil? || subject.empty?

	  if Sisimai::String.is_8bit(subject)
	    # Flagged as 8-bit: keep the text as is, only replace invalid bytes
	    subject.scrub!('?')
	  else
	    # MIME-encoded words split on spaces, or the whole value when plain
	    pieces = if Sisimai::MIME.is_mimeencoded(subject)
	               subject.split(/ /).select { |v| Sisimai::MIME.is_mimeencoded(v) }
	             else
	               [subject]
	             end
	    table['subject'] = Sisimai::MIME.mimedecode(pieces)
	  end
	  table
	end

	# Parse a raw header block into a hash, capturing only FieldTable fields.
	#
	# The scan pattern interpolates FieldTable without the `o` flag, so it is
	# rebuilt on every call. Captured values (folded continuation lines
	# included) are stored under the lower-cased field name; a later occurrence
	# overwrites an earlier one. The Subject value is then handed to
	# Sisimai::MIME.mimedecode unless Sisimai::String.is_8bit reports true.
	#
	# @param heads [String, nil] raw header text
	# @return [Hash] lower-cased field name => value; {} when heads is nil
	def useregex2(heads)
	  return {} unless heads

	  # 1. Scrub to avoid "invalid byte sequence in UTF-8" exception (#82)
	  # 2. Drop leading "> " quoting and re-fold "= =" into two lines
	  text  = heads.scrub('?').gsub(/^[>]+[ ]/m, '').gsub(/=[ ]+=/, "=\n =")
	  table = {}

	  # FieldTable contributes the first capture group (the field name); the
	  # value group needs the lazy `.*?` to span folded lines.
	  text.scan(/^#{FieldTable}:[ ]*(.*?)\n(?![\s\t])/m) { |name, value| table[name.downcase] = value }

	  # Input without a Subject field leaves the entry nil; return early as for
	  # an empty Subject instead of calling empty? on nil.
	  subject = table['subject']
	  return table if subject.nil? || subject.empty?

	  if Sisimai::String.is_8bit(subject)
	    # Flagged as 8-bit: keep the text as is, only replace invalid bytes
	    subject.scrub!('?')
	  else
	    # MIME-encoded words split on spaces, or the whole value when plain
	    pieces = if Sisimai::MIME.is_mimeencoded(subject)
	               subject.split(/ /).select { |v| Sisimai::MIME.is_mimeencoded(v) }
	             else
	               [subject]
	             end
	    table['subject'] = Sisimai::MIME.mimedecode(pieces)
	  end
	  table
	end

	# Parse a raw header block into a hash, capturing only FieldTable fields.
	#
	# The scan pattern interpolates FieldTable with the `o` flag, so Ruby
	# interpolates only once: the pattern built on the first call is reused
	# afterwards. Captured values are stored under the lower-cased field name;
	# a later occurrence overwrites an earlier one. The Subject value is then
	# handed to Sisimai::MIME.mimedecode unless Sisimai::String.is_8bit
	# reports true.
	#
	# @param heads [String, nil] raw header text
	# @return [Hash] lower-cased field name => value; {} when heads is nil
	def useregex3(heads)
	  return {} unless heads

	  # 1. Scrub to avoid "invalid byte sequence in UTF-8" exception (#82)
	  # 2. Drop leading "> " quoting and re-fold "= =" into two lines
	  text  = heads.scrub('?').gsub(/^[>]+[ ]/m, '').gsub(/=[ ]+=/, "=\n =")
	  table = {}

	  # FieldTable contributes the first capture group (the field name); the
	  # value group needs the lazy `.*?` to span folded lines.
	  text.scan(/^#{FieldTable}:[ ]*(.*?)\n(?![\s\t])/mo) { |name, value| table[name.downcase] = value }

	  # Input without a Subject field leaves the entry nil; return early as for
	  # an empty Subject instead of calling empty? on nil.
	  subject = table['subject']
	  return table if subject.nil? || subject.empty?

	  if Sisimai::String.is_8bit(subject)
	    # Flagged as 8-bit: keep the text as is, only replace invalid bytes
	    subject.scrub!('?')
	  else
	    # MIME-encoded words split on spaces, or the whole value when plain
	    pieces = if Sisimai::MIME.is_mimeencoded(subject)
	               subject.split(/ /).select { |v| Sisimai::MIME.is_mimeencoded(v) }
	             else
	               [subject]
	             end
	    table['subject'] = Sisimai::MIME.mimedecode(pieces)
	  end
	  table
	end

	# Sanity-check every candidate against the sample header before timing it.
	# One 'ok' is printed per passing check; a failing check prints nothing.
	results = [takeapart(Head), useregex1(Head), useregex2(Head), useregex3(Head)]
	results.each do |e|
	  puts 'ok' if e.is_a? Hash
	  %w|from subject date to return-path message-id|.each do |ee|
	    # to_s: a missing field is a failed check, not a NoMethodError on nil
	    puts 'ok' if e[ee].to_s.size > 0
	  end

	  puts 'ok' if e['from'] =~ /kijitora/
	  # The sample Subject is base64 for UTF-8 text starting with this word;
	  # the earlier literal was the same bytes mis-decoded and could not match.
	  puts 'ok' if e['subject'] =~ /バウンス/
	  puts 'ok' if e['date'] =~ /Wed, 16 Oct/
	  puts 'ok' if e['to'] =~ /bouncehammer[.]jp/
	  puts 'ok' if e['return-path'] =~ /example[.]org/
	  puts 'ok' if e['message-id'] =~ /BC36/
	end

	# Time each candidate over HowMany parses of the sample header.
	Benchmark.bmbm do |bench|
	  bench.report('takeapart') { HowMany.times { takeapart(Head) } }
	  bench.report('useregex1') { HowMany.times { useregex1(Head) } }
	  bench.report('useregex2') { HowMany.times { useregex2(Head) } }
	  bench.report('useregex3') { HowMany.times { useregex3(Head) } }
	end

	__END__
	Rehearsal ---------------------------------------------
	takeapart 1.690759 0.004814 1.695573 ( 1.713718)
	useregex1 1.632277 0.002730 1.635007 ( 1.641662)
	useregex2 3.566479 0.004850 3.571329 ( 3.578703)
	useregex3 1.497352 0.002813 1.500165 ( 1.504279)
	------------------------------------ total: 8.402074sec

	user system total real
	takeapart 1.659426 0.002044 1.661470 ( 1.668322)
	useregex1 1.686137 0.002063 1.688200 ( 1.691882)
	useregex2 3.609357 0.003275 3.612632 ( 3.617397)
	useregex3 1.552407 0.002338 1.554745 ( 1.558926)