Skip to content

Instantly share code, notes, and snippets.

@tmm1
Created December 16, 2012 13:06
Show Gist options
  • Save tmm1/4307047 to your computer and use it in GitHub Desktop.
Save tmm1/4307047 to your computer and use it in GitHub Desktop.
UTF-8 and ASCII-8BIT (BINARY) compatibility

The patch below fixes the following test failures on Ruby 1.9:

$ ruby -v binary_test.rb 
ruby 1.9.3p194 (2012-04-20 revision 35410) [x86_64-darwin11.4.2]
Run options: 

# Running tests:

E...E....FF

Finished tests in 0.007315s, 1503.7594 tests/s, 3691.0458 assertions/s.

  1) Error:
test_add_8bit_plus_utf8(BinaryUTF8CompatTest):
Encoding::CompatibilityError: incompatible character encodings: ASCII-8BIT and UTF-8
    binary_test.rb:62:in `test_add_8bit_plus_utf8'

  2) Error:
test_concat_8bit_and_utf8(BinaryUTF8CompatTest):
Encoding::CompatibilityError: incompatible character encodings: ASCII-8BIT and UTF-8
    binary_test.rb:106:in `test_concat_8bit_and_utf8'

  3) Failure:
test_encode_utf8_to_binary(BinaryUTF8CompatTest) [binary_test.rb:13]:
Exception raised:
<#<Encoding::UndefinedConversionError: U+00E9 from UTF-8 to ASCII-8BIT>>.

  4) Failure:
test_equal_contents(BinaryUTF8CompatTest) [binary_test.rb:25]:
<"h\xC3\xA9ll\xC3\xB8"> expected but was
<"héllø">.

11 tests, 27 assertions, 2 failures, 2 errors, 0 skips
diff --git a/enc/trans/single_byte.trans b/enc/trans/single_byte.trans
index 1bf1001..dfff0be 100644
--- a/enc/trans/single_byte.trans
+++ b/enc/trans/single_byte.trans
@@ -6,7 +6,7 @@
transcode_tblgen "US-ASCII", "UTF-8", us_ascii_map
transcode_tblgen "UTF-8", "US-ASCII", us_ascii_map
transcode_tblgen "ASCII-8BIT", "UTF-8", us_ascii_map
- transcode_tblgen "UTF-8", "ASCII-8BIT", us_ascii_map
+ transcode_tblgen "UTF-8", "ASCII-8BIT", [["{00-ff}", :nomap]], '{00-ff}'
CONTROL1_TO_UCS_TBL = (0x80..0x9f).map {|c| ["%02X" % c, c] }
diff --git a/encoding.c b/encoding.c
index b8c5f6d..1b8d4f7 100644
--- a/encoding.c
+++ b/encoding.c
@@ -806,6 +806,12 @@ rb_enc_compatible(VALUE str1, VALUE str2)
if (cr2 == ENC_CODERANGE_7BIT) {
return enc1;
}
+ if (idx1 == ENCINDEX_UTF_8 && idx2 == ENCINDEX_ASCII) {
+ return enc2;
+ }
+ else if (idx1 == ENCINDEX_ASCII && idx2 == ENCINDEX_UTF_8) {
+ return enc1;
+ }
}
if (cr1 == ENC_CODERANGE_7BIT)
return enc2;
diff --git a/string.c b/string.c
index 134d65b..01311ab 100644
--- a/string.c
+++ b/string.c
@@ -1947,7 +1947,11 @@ rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
if (ptr_cr_ret)
*ptr_cr_ret = ptr_cr;
- if (str_encindex != ptr_encindex &&
+ if ((str_encindex == rb_utf8_encindex() && ptr_encindex == rb_ascii8bit_encindex()) ||
+ (str_encindex == rb_ascii8bit_encindex() && ptr_encindex == rb_utf8_encindex())) {
+ /* pass through */
+ }
+ else if (str_encindex != ptr_encindex &&
str_cr != ENC_CODERANGE_7BIT &&
ptr_cr != ENC_CODERANGE_7BIT) {
incompatible:
@@ -1956,7 +1960,13 @@ rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
rb_enc_name(rb_enc_from_index(ptr_encindex)));
}
- if (str_cr == ENC_CODERANGE_UNKNOWN) {
+ if (str_encindex != ptr_encindex &&
+ str_cr != ENC_CODERANGE_7BIT &&
+ ptr_cr != ENC_CODERANGE_7BIT) {
+ res_encindex = rb_ascii8bit_encindex();
+ res_cr = ENC_CODERANGE_VALID;
+ }
+ else if (str_cr == ENC_CODERANGE_UNKNOWN) {
res_encindex = str_encindex;
res_cr = ENC_CODERANGE_UNKNOWN;
}
@@ -2227,6 +2237,10 @@ rb_str_comparable(VALUE str1, VALUE str2)
if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
return TRUE;
}
+ if ((idx1 == rb_utf8_encindex() && idx2 == rb_ascii8bit_encindex()) ||
+ (idx1 == rb_ascii8bit_encindex() && idx2 == rb_utf8_encindex())) {
+ return TRUE;
+ }
return FALSE;
}
# coding: utf-8
require 'test/unit'
# Specifies how UTF-8 and ASCII-8BIT (binary) strings are expected to
# interact: transcoding between the two, equality of identical bytes, and
# the encoding of the result of String#+ and String#<<.
#
# Fixture naming: "_mb" strings contain multibyte (non-ASCII) characters,
# "_as" strings contain ASCII characters only.
class BinaryUTF8CompatTest < Test::Unit::TestCase
  # Builds the four fixtures: the same multibyte and ASCII-only text, each
  # tagged once as binary and once as UTF-8. The literal is re-evaluated for
  # every encoding, so each fixture is a distinct String object.
  def setup
    @binary_mb, @utf8_mb = %w[binary utf-8].map { |enc| 'héllø'.force_encoding(enc) }
    @binary_as, @utf8_as = %w[binary utf-8].map { |enc| 'hello'.force_encoding(enc) }
  end

  # Transcoding multibyte UTF-8 to binary must not raise.
  def test_encode_utf8_to_binary
    assert_nothing_raised { @utf8_mb.encode('binary') }
  end

  # Transcoding high-bit binary data to UTF-8 must raise.
  def test_encode_binary_to_utf8
    assert_raises(Encoding::UndefinedConversionError) { @binary_mb.encode('utf-8') }
  end

  # Identical bytes compare equal regardless of the binary/UTF-8 tag.
  def test_equal_contents
    assert_equal(@binary_mb, @utf8_mb)
  end

  # binary + binary is always binary.
  def test_add_binary
    bin = Encoding::ASCII_8BIT
    assert_plus_encodings(
      [@binary_mb, @binary_mb, bin],
      [@binary_mb, @binary_as, bin],
      [@binary_as, @binary_mb, bin],
      [@binary_as, @binary_as, bin]
    )
  end

  # UTF-8 + UTF-8 is always UTF-8.
  def test_add_utf8
    utf = Encoding::UTF_8
    assert_plus_encodings(
      [@utf8_mb, @utf8_mb, utf],
      [@utf8_mb, @utf8_as, utf],
      [@utf8_as, @utf8_mb, utf],
      [@utf8_as, @utf8_as, utf]
    )
  end

  # Mixing in an ASCII-only binary string: the result is UTF-8 except when
  # the ASCII-only binary string is the receiver and the argument is also
  # ASCII-only.
  def test_add_utf8_plus_7bit
    assert_plus_encodings(
      [@binary_as, @utf8_as, Encoding::ASCII_8BIT],
      [@binary_as, @utf8_mb, Encoding::UTF_8],
      [@utf8_as, @binary_as, Encoding::UTF_8],
      [@utf8_mb, @binary_as, Encoding::UTF_8]
    )
  end

  # Mixing high-bit binary data with UTF-8 yields binary, in either order.
  def test_add_8bit_plus_utf8
    bin = Encoding::ASCII_8BIT
    assert_plus_encodings(
      [@binary_mb, @utf8_mb, bin],
      [@binary_mb, @utf8_as, bin],
      [@utf8_mb, @binary_mb, bin],
      [@utf8_as, @binary_mb, bin]
    )
  end

  # binary << binary is always binary.
  def test_concat_binary
    bin = Encoding::ASCII_8BIT
    assert_concat_encodings(
      [@binary_mb, @binary_mb, bin],
      [@binary_mb, @binary_as, bin],
      [@binary_as, @binary_mb, bin],
      [@binary_as, @binary_as, bin]
    )
  end

  # UTF-8 << UTF-8 is always UTF-8.
  def test_concat_utf8
    utf = Encoding::UTF_8
    assert_concat_encodings(
      [@utf8_mb, @utf8_mb, utf],
      [@utf8_mb, @utf8_as, utf],
      [@utf8_as, @utf8_mb, utf],
      [@utf8_as, @utf8_as, utf]
    )
  end

  # Same expectations as test_add_utf8_plus_7bit, but for String#<<.
  def test_concat_utf8_and_7bit
    assert_concat_encodings(
      [@binary_as, @utf8_as, Encoding::ASCII_8BIT],
      [@binary_as, @utf8_mb, Encoding::UTF_8],
      [@utf8_as, @binary_as, Encoding::UTF_8],
      [@utf8_mb, @binary_as, Encoding::UTF_8]
    )
  end

  # Same expectations as test_add_8bit_plus_utf8, but for String#<<.
  def test_concat_8bit_and_utf8
    bin = Encoding::ASCII_8BIT
    assert_concat_encodings(
      [@binary_mb, @utf8_mb, bin],
      [@binary_mb, @utf8_as, bin],
      [@utf8_mb, @binary_mb, bin],
      [@utf8_as, @binary_mb, bin]
    )
  end

  private

  # For each [left, right, expected] triple, in order, asserts that
  # left + right is tagged with the expected encoding.
  def assert_plus_encodings(*cases)
    cases.each do |left, right, expected|
      assert_equal expected, (left + right).encoding
    end
  end

  # For each [left, right, expected] triple, in order, asserts that appending
  # right to a copy of left is tagged with the expected encoding. The copy
  # keeps the shared fixture unmodified.
  def assert_concat_encodings(*cases)
    cases.each do |left, right, expected|
      assert_equal expected, (left.dup << right).encoding
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment