Last active
October 18, 2020 07:50
-
-
Save tonytonyjan/58bcc97fdb1940391eb01b4e4fa1ef2c to your computer and use it in GitHub Desktop.
Ruby implementation for RFC2047
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright (c) 2020 Jian Weihang <[email protected]>
# frozen_string_literal: true
# Encoder/decoder for RFC 2047 "encoded-words" (=?charset?encoding?text?=),
# the mechanism used to carry non-ASCII text in mail header fields.
module Rfc2047
  TOKEN = /[\041\043-\047\052\053\055\060-\071\101-\132\134\136\137\141-\176]+/.freeze
  ENCODED_TEXT = /[\041-\076\100-\176]*/.freeze
  ENCODED_WORD = /=\?(?<charset>#{TOKEN})\?(?<encoding>[QBqb])\?(?<encoded_text>#{ENCODED_TEXT})\?=/.freeze
  ENCODED_WORD_SEQUENCE = /#{ENCODED_WORD}(?:\s*#{ENCODED_WORD})*/.freeze

  # Any byte outside this conservative set is escaped when Q-encoding. The set
  # is the one RFC 2047 section 5 rule (3) allows inside a 'phrase', minus "="
  # and "_", which carry special meaning in the Q encoding itself.
  Q_UNSAFE_BYTE = %r{[^A-Za-z0-9!*+\-/]}.freeze

  class << self
    # Decodes every encoded-word found in a header value and returns the value
    # with those words replaced by UTF-8 text. Text outside encoded-words is
    # left untouched, and whitespace between adjacent encoded-words is dropped.
    #
    # example
    #
    #   Rfc2047.decode_value '=?UTF-8?B?5Yu/5Lul5oOh5bCP6ICM54K65LmL77yM5Yu/5Lul5ZaE5bCP6ICM5LiN54K6?= =?UTF-8?B?44CC?='
    #   # => "勿以惡小而為之,勿以善小而不為。"
    #
    # @param input [String] raw header value
    # @return [String] +input+ itself when it contains no encoded-word,
    #   otherwise a new string with the encoded-words decoded
    # @raise [ArgumentError] when an encoded-word names a charset Ruby does not know
    def decode_value(input)
      return input unless input.match?(ENCODED_WORD)

      input.gsub(ENCODED_WORD_SEQUENCE) do |sequence|
        words = sequence.scan(ENCODED_WORD).map { |captures| decode_word(*captures) }
        # Empty words carry no bytes; dropping them keeps an empty word with a
        # different charset label from splitting a run.
        words.reject!(&:empty?)
        # Adjacent words sharing a charset are joined BEFORE transcoding because
        # senders split multi-byte characters and ISO-2022-JP escape sequences
        # across words. Words in different charsets are transcoded separately,
        # so a mixed-charset sequence no longer raises CompatibilityError.
        words.chunk_while { |left, right| left.encoding == right.encoding }
             .map { |run| transcode_run(run) }
             .join
      end
    end

    # Wraps +input+ in a single encoded-word labelled with the string's own
    # encoding name.
    #
    # example:
    #
    #   Rfc2047.encode('己所不欲,勿施於人。')
    #   # => "=?UTF-8?B?5bex5omA5LiN5qyy77yM5Yu/5pa95pa85Lq644CC?="
    #
    # @param input [String] text to encode
    # @param encoding [Symbol] +:B+ (base64) or +:Q+ (quoted-printable-like)
    # @return [String] the encoded-word
    # @raise [RuntimeError] when +encoding+ is neither +:B+ nor +:Q+
    def encode(input, encoding: :B)
      payload =
        case encoding
        when :B then [input].pack('m0')
        when :Q then q_encode(input)
        else raise ":encoding should be either :B or :Q, got #{encoding}"
        end
      "=?#{input.encoding}?#{encoding}?#{payload}?="
    end

    # Decodes the first encoded-word found in +input+. The result is tagged
    # with the word's declared charset; it is NOT transcoded to UTF-8.
    #
    # example:
    #
    #   Rfc2047.decode '=?UTF-8?B?5bex5omA5LiN5qyy77yM5Yu/5pa95pa85Lq644CC?='
    #   # => "己所不欲,勿施於人。"
    #
    # @param input [String] text containing an encoded-word
    # @return [String] decoded bytes tagged with the declared charset
    # @raise [ArgumentError] when +input+ contains no encoded-word or the
    #   charset is unknown
    def decode(input)
      match_data = ENCODED_WORD.match(input)
      raise ArgumentError, "not an RFC 2047 encoded-word: #{input.inspect}" if match_data.nil?

      decode_word(*match_data.captures)
    end

    private

    # Decodes the three captured parts of one encoded-word into a string tagged
    # with the declared charset.
    def decode_word(charset, encoding, encoded_text)
      decoded =
        case encoding
        # In the Q encoding "_" stands for a space; rewrite it as =20 so the
        # quoted-printable unpacker restores it.
        when 'Q', 'q' then encoded_text.gsub('_', '=20').unpack1('M')
        when 'B', 'b' then encoded_text.unpack1('m')
        end
      found_encoding = find_encoding(charset)
      # Binary-labelled text ("8bit") is treated as UTF-8.
      found_encoding = Encoding::UTF_8 if found_encoding == Encoding::ASCII_8BIT
      decoded.force_encoding(found_encoding)
    end

    # Joins the raw bytes of a run of same-charset words and converts the
    # result to UTF-8.
    def transcode_run(words)
      source_encoding = words.first.encoding
      bytes = words.map(&:b).join
      if source_encoding == Encoding::UTF_7
        # UTF-7 is a dummy encoding that String#encode cannot convert from.
        # NOTE(review): Net::IMAP.decode_utf7 implements IMAP's *modified*
        # UTF-7, not RFC 2152 UTF-7 - confirm this is the intended decoder.
        require 'net/imap'
        Net::IMAP.decode_utf7(bytes).force_encoding(Encoding::UTF_8)
      else
        bytes.force_encoding(source_encoding).encode!(Encoding::UTF_8)
      end
    end

    # Q-encodes +input+ byte by byte: space becomes "_", every other byte
    # outside the safe set becomes "=XX" (uppercase hex). Array#pack('M') is
    # not usable here because it emits soft line breaks and leaves " ", "?"
    # and "_" unescaped, which yields an invalid encoded-word.
    def q_encode(input)
      input.b.gsub(Q_UNSAFE_BYTE) { |byte| byte == ' ' ? '_' : format('=%02X', byte.ord) }
    end

    # Maps a charset label to a Ruby Encoding, covering labels seen in mail
    # that Encoding.find does not recognise or that need a superset encoding.
    def find_encoding(charset)
      case charset.downcase
      when 'utf-16' then Encoding::UTF_16BE
      when 'utf-32' then Encoding::UTF_32BE
      when 'ks_c_5601-1987' then Encoding::CP949
      when 'shift-jis' then Encoding::Shift_JIS
      when 'gb2312' then Encoding::GB18030
      when 'ms950' then Encoding::CP950
      when '8bit' then Encoding::ASCII_8BIT
      when 'latin2' then Encoding::ISO_8859_2
      else Encoding.find(charset)
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright (c) 2020 Jian Weihang <[email protected]>
# frozen_string_literal: true
require 'minitest/autorun' | |
require 'rfc_2047' | |
# Base class for the RFC 2047 test cases; adds a decoding assertion helper.
class Test < Minitest::Test
  # Passes +actual+ (a raw, possibly encoded header value) through
  # Rfc2047.decode_value and asserts the outcome equals +expected+.
  def assert_decode_value(expected, actual)
    decoded = Rfc2047.decode_value(actual)
    assert_equal(expected, decoded)
  end
end
# Exercises Rfc2047.encode, Rfc2047.decode and Rfc2047.decode_value.
# The nested classes group the decode_value cases by transfer encoding.
class Rfc2047Test < Test
  def test_encode
    assert_equal '=?UTF-8?B?5ris6Kmm?=', Rfc2047.encode('測試')
  end

  def test_decode
    assert_equal 'this is some text', Rfc2047.decode('=?iso-8859-1?q?this=20is=20some=20text?=')
    assert_equal '測試', Rfc2047.decode('=?UTF-8?B?5ris6Kmm?=')
  end

  # decode_value cases whose encoded-words use the "B" (base64) encoding.
  class Base64 < Test
    def test_it_should_decode_an_encoded_string
      assert_decode_value(
        'This is あ string',
        '=?UTF-8?B?VGhpcyBpcyDjgYIgc3RyaW5n?='
      )
    end

    def test_it_should_decode_a_long_encoded_string
      assert_decode_value(
        'This is あ really long string This is あ really long string This is あ really long string This is あ really long string This is あ really long string',
        '=?UTF-8?B?VGhpcyBpcyDjgYIgcmVhbGx5IGxvbmcgc3RyaW5nIFRoaXMgaXMg44GCIHJl?= =?UTF-8?B?YWxseSBsb25nIHN0cmluZyBUaGlzIGlzIOOBgiByZWFsbHkgbG9uZyBzdHJp?= =?UTF-8?B?bmcgVGhpcyBpcyDjgYIgcmVhbGx5IGxvbmcgc3RyaW5nIFRoaXMgaXMg44GC?= =?UTF-8?B?IHJlYWxseSBsb25nIHN0cmluZw==?='
      )
    end

    def test_it_should_decode_utf_16_encoded_string
      assert_decode_value(
        'あいうえお',
        '=?UTF-16?B?MEIwRDBGMEgwSg==?='
      )
    end

    def test_it_should_decode_utf_32_encoded_string
      assert_decode_value(
        'あいうえお',
        '=?UTF-32?B?AAAwQgAAMEQAADBGAAAwSAAAMEo=?='
      )
    end

    # ISO-2022-JP escape sequences are split across encoded-word boundaries
    # here, so the words must be joined before transcoding.
    def test_it_should_decoded
      assert_decode_value(
        '案件情報[-01 大手資産運用会社 - 資産運用にかかるDWHの二次開発業務]',
        "=?iso-2022-jp?Q?=1B=24B0F7o=3EpJs=1B=28B=5B=2D01_=1B=24?=\n =?iso-2022-jp?Q?BBg=3Cj=3Bq=3B=3A1=3FMQ2q=3CR=1B=28B_=2D_=1B=24B=3B?=\n =?iso-2022-jp?Q?q=3B=3A1=3FMQ=24K=24=2B=24=2B=24k=1B=28BDWH=1B=24B=24?=\n =?iso-2022-jp?Q?NFs=3C=213=2BH=2F6HL3=1B=28B=5D?="
      )
    end

    def test_it_should_decode_a_string_that_looks_similar_to_an_encoded_string
      assert_decode_value('1+1=?', '1+1=?')
    end

    def test_it_should_parse_adjacent_encoded_words_separated_by_linear_white_space
      assert_decode_value(
        'новый сотрудник — дорофеев',
        "=?utf-8?B?0L3QvtCy0YvQuSDRgdC+0YLRgNGD0LTQvdC40Log4oCUINC00L7RgNC+0YQ=?=\n =?utf-8?B?0LXQtdCy?="
      )
    end

    def test_it_should_parse_adjacent_words_with_no_space
      assert_decode_value(
        'новый сотрудник — дорофеев',
        '=?utf-8?B?0L3QvtCy0YvQuSDRgdC+0YLRgNGD0LTQvdC40Log4oCUINC00L7RgNC+0YQ=?==?utf-8?B?0LXQtdCy?='
      )
    end

    def test_it_should_collapse_adjacent_words_with_multiple_encodings_on_one_line_seperated_by_non_spaces
      assert_decode_value(
        "Re:[グルーポン・ジャパン株式会社] 返信:【グルーポン】お問い合わせの件について(リクエスト#1056273\n )",
        "Re:[=?iso-2022-jp?B?GyRCJTAlayE8JV0lcyEmJTglYyVRJXMzdDwwMnEbKEI=?=\n =?iso-2022-jp?B?GyRCPFIbKEI=?=] =?iso-2022-jp?B?GyRCSlY/LiEnGyhC?=\n =?iso-2022-jp?B?GyRCIVolMCVrITwlXSVzIVskKkxkJCQ5ZyRvJDsbKEI=?=\n =?iso-2022-jp?B?GyRCJE43byRLJEQkJCRGIUolaiUvJSglOSVIGyhC?=#1056273\n =?iso-2022-jp?B?GyRCIUsbKEI=?="
      )
    end

    def test_it_should_decode_a_blank_string
      assert_decode_value('', '=?utf-8?B??=')
    end

    def test_it_should_decode_ks_c_5601_1987_encoded_string
      assert_decode_value(
        '김 현진 <[email protected]>',
        '=?ks_c_5601-1987?B?seggx/bB+A==?= <[email protected]>'
      )
    end

    def test_it_should_decode_shift_jis_encoded_string
      assert_decode_value('日本語', '=?shift-jis?Q?=93=FA=96{=8C=EA?=')
    end

    def test_it_should_decode_gb18030_encoded_string_misidentified_as_gb2312
      assert_decode_value('開', '=?GB2312?B?6V8=?=')
    end

    def test_it_should_decode_a_utf_7_encoded_unstructured_field
      assert_decode_value(
        '勿以惡小而為之,勿以善小而不為。',
        '=?utf-7?B?5Yu/5Lul5oOh5bCP6ICM54K65LmL77yM5Yu/5Lul5ZaE5bCP6ICM5LiN54K6?= =?utf-7?B?44CC?='
      )
    end
  end

  # decode_value cases whose encoded-words use the "Q" encoding.
  class QuotedPrintable < Test
    def test_it_should_decode_an_encoded_string
      assert_decode_value(
        'This is あ string',
        '=?UTF-8?Q?This_is_=E3=81=82_string?='
      )
    end

    def test_it_should_decode_q_encoded_5F_as_underscore
      # =C2=AD is U+00AD (soft hyphen). It is invisible in most editors, so the
      # expectation spells it as an escape instead of a literal character.
      assert_decode_value(
        "This \u00AD and_that",
        '=?UTF-8?Q?This_=C2=AD_and=5Fthat?='
      )
    end

    def test_it_should_decode_a_blank_string
      assert_decode_value('', '=?utf-8?Q??=')
    end

    def test_it_should_decode_8bit_encoded_string
      assert_decode_value("ALPH\xC3\x89E", '=?8bit?Q?ALPH=C3=89E?=')
    end
  end

  # A single sequence mixing a B-encoded word with a Q-encoded word.
  class Mixed < Test
    def test_it_should_decode_an_encoded_string2
      assert_decode_value(
        'This is あ string This was あ string',
        '=?UTF-8?B?VGhpcyBpcyDjgYIgc3RyaW5n?= =?UTF-8?Q?_This_was_=E3=81=82_string?='
      )
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment