Skip to content

Instantly share code, notes, and snippets.

@patrickt
Created May 20, 2010 23:32
Show Gist options
  • Save patrickt/408270 to your computer and use it in GitHub Desktop.
Save patrickt/408270 to your computer and use it in GitHub Desktop.
Index: encoding.c
===================================================================
--- encoding.c (revision 4140)
+++ encoding.c (working copy)
@@ -146,6 +146,32 @@ mr_enc_dummy_p(VALUE self, SEL sel)
return Qfalse;
}
+// Builds the default replacement string for `destination`.
+// For UTF-8, UTF-16 (BE/LE) and UTF-32 (BE/LE) this is U+FFFD ("\uFFFD")
+// spelled out in that encoding's byte layout; for every other encoding it is
+// an ASCII '?' transcoded into `destination`.
+rb_str_t *
+replacement_string_for_encoding(rb_encoding_t *destination)
+{
+    if (destination == rb_encodings[ENCODING_UTF16BE]) {
+        return RSTR(rb_enc_str_new("\xFF\xFD", 2, destination));
+    }
+    if (destination == rb_encodings[ENCODING_UTF32BE]) {
+        return RSTR(rb_enc_str_new("\0\0\xFF\xFD", 4, destination));
+    }
+    if (destination == rb_encodings[ENCODING_UTF16LE]) {
+        return RSTR(rb_enc_str_new("\xFD\xFF", 2, destination));
+    }
+    if (destination == rb_encodings[ENCODING_UTF32LE]) {
+        return RSTR(rb_enc_str_new("\xFD\xFF\0\0", 4, destination));
+    }
+    if (destination == rb_encodings[ENCODING_UTF8]) {
+        return RSTR(rb_enc_str_new("\xEF\xBF\xBD", 3, destination));
+    }
+
+    // Not a Unicode encoding form: start from an ASCII question mark and
+    // convert it to whatever the destination encoding is.
+    rb_str_t *question_mark = RSTR(rb_enc_str_new("?", 1, rb_encodings[ENCODING_ASCII]));
+    return str_simple_transcode(question_mark, destination);
+}
+
static void
define_encoding_constant(const char *name, rb_encoding_t *encoding)
{
@@ -291,6 +317,7 @@ Init_PreEncoding(void)
add_encoding(ENCODING_BIG5, ENCODING_TYPE_UCNV, "Big5", 1, false, true, "CP950", NULL);
// FIXME: the ICU conversion tables do not seem to match Ruby's Japanese conversion tables
add_encoding(ENCODING_EUCJP, ENCODING_TYPE_UCNV, "EUC-JP", 1, false, true, "eucJP", NULL);
+ add_encoding(ENCODING_SJIS, ENCODING_TYPE_UCNV, "Shift_JIS", 1, false, true, "SJIS", NULL);
//add_encoding(ENCODING_EUCJP, ENCODING_TYPE_RUBY, "EUC-JP", 1, false, true, "eucJP", NULL);
//add_encoding(ENCODING_SJIS, ENCODING_TYPE_RUBY, "Shift_JIS", 1, false, true, "SJIS", NULL);
//add_encoding(ENCODING_CP932, ENCODING_TYPE_RUBY, "Windows-31J", 1, false, true, "CP932", "csWindows31J", NULL);
Index: encoding.h
===================================================================
--- encoding.h (revision 4140)
+++ encoding.h (working copy)
@@ -148,7 +148,7 @@ enum {
ENCODING_MACCYRILLIC,
ENCODING_BIG5,
ENCODING_EUCJP,
- //ENCODING_SJIS,
+ ENCODING_SJIS,
//ENCODING_CP932,
ENCODINGS_COUNT
@@ -293,6 +293,40 @@ str_set_valid_encoding(rb_str_t *self, bool status)
STRING_VALID_ENCODING);
}
+typedef enum { // how str_transcode is asked to react to invalid or undefined input
+ TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, // the behavior str_simple_transcode passes for both cases
+ TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING, // use the replacement string handed to str_transcode
+ TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT, // presumably the String#encode :xml => :text mode — confirm against str_transcode
+ TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR // presumably the String#encode :xml => :attr mode — confirm against str_transcode
+} transcode_behavior_t;
+
+typedef enum { // bit flags named after the Encoding::Converter::* constants; each value is one distinct bit
+ ECONV_INVALID_MASK = 1, // NOTE(review): a single bit here, so (ECONV_INVALID_REPLACE & ECONV_INVALID_MASK) == 0; MRI reportedly uses multi-bit masks — confirm
+ ECONV_INVALID_REPLACE = 1 << 1,
+ ECONV_UNDEF_MASK = 1 << 2, // NOTE(review): same single-bit caveat as ECONV_INVALID_MASK
+ ECONV_UNDEF_REPLACE = 1 << 3,
+ ECONV_UNDEF_HEX_CHARREF = 1 << 4,
+ ECONV_PARTIAL_INPUT = 1 << 5,
+ ECONV_AFTER_OUTPUT = 1 << 6,
+ ECONV_UNIVERSAL_NEWLINE_DECORATOR = 1 << 7, // newline decorators are listed as not yet supported in transcode.c's notes
+ ECONV_CRLF_NEWLINE_DECORATOR = 1 << 8,
+ ECONV_CR_NEWLINE_DECORATOR = 1 << 9,
+ ECONV_XML_TEXT_DECORATOR = 1 << 10,
+ ECONV_XML_ATTR_CONTENT_DECORATOR = 1 << 11,
+ ECONV_XML_ATTR_QUOTE_DECORATOR = 1 << 12
+} transcode_flags_t;
+
+rb_str_t *str_transcode(rb_str_t *self, rb_encoding_t *src_encoding, rb_encoding_t *dst_encoding, // defined in string.c; transcodes self from src_encoding to dst_encoding
+ int behavior_for_invalid, int behavior_for_undefined, rb_str_t *replacement_str); // behaviors take transcode_behavior_t values; str_simple_transcode passes NULL for replacement_str when both are RAISE_EXCEPTION
+
+// Convenience wrapper around str_transcode: converts `self` from its current
+// encoding to `dst_encoding`, requesting an exception for both invalid and
+// undefined sequences, and therefore passing no replacement string.
+static inline rb_str_t *
+str_simple_transcode(rb_str_t *self, rb_encoding_t *dst_encoding)
+{
+    rb_encoding_t *const from = self->encoding;
+    const int on_error = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
+
+    return str_transcode(self, from, dst_encoding, on_error, on_error, NULL);
+}
+
+
void rb_str_NSCoder_encode(void *coder, VALUE str, const char *key);
VALUE rb_str_NSCoder_decode(void *coder, const char *key);
@@ -319,6 +353,10 @@ unsigned long rb_str_hash_uchars(const UChar *chars, long chars_len);
long rb_uchar_strtol(UniChar *chars, long chars_len, long pos,
long *end_offset);
void rb_str_force_encoding(VALUE str, rb_encoding_t *encoding);
+rb_str_t *str_need_string(VALUE str); // defined in string.c; no longer static so transcode.c can call it
+rb_str_t *replacement_string_for_encoding(rb_encoding_t* enc); // defined in encoding.c; U+FFFD for UTF-8/16/32, otherwise '?' transcoded to enc
+void str_replace_with_string(rb_str_t *self, rb_str_t *source); // defined in string.c; no longer static
+
#if defined(__cplusplus)
} // extern "C"
Index: inits.c
===================================================================
--- inits.c (revision 4140)
+++ inits.c (working copy)
@@ -58,6 +58,7 @@ void Init_ObjC(void);
void Init_BridgeSupport(void);
void Init_FFI(void);
void Init_Dispatch(void);
+void Init_Transcode(void);
void Init_PostVM(void);
void
@@ -110,5 +111,6 @@ rb_call_inits()
Init_BridgeSupport();
Init_FFI();
Init_Dispatch();
+ Init_Transcode();
Init_PostVM();
}
Index: rakelib/builder/builder.rb
===================================================================
--- rakelib/builder/builder.rb (revision 4140)
+++ rakelib/builder/builder.rb (working copy)
@@ -6,7 +6,7 @@ OBJS = %w{
random range rational re ruby signal sprintf st string struct time
util variable version thread id objc bs ucnv encoding main dln dmyext marshal
gcd vm_eval gc-stub bridgesupport compiler dispatcher vm symbol debugger MacRuby
- MacRubyDebuggerConnector NSArray NSDictionary NSString
+ MacRubyDebuggerConnector NSArray NSDictionary NSString transcode
}
EXTENSIONS = %w{
Index: spec/frozen/tags/macruby/core/encoding/converter/asciicompat_encoding_tags.txt
===================================================================
--- spec/frozen/tags/macruby/core/encoding/converter/asciicompat_encoding_tags.txt (revision 4140)
+++ spec/frozen/tags/macruby/core/encoding/converter/asciicompat_encoding_tags.txt (working copy)
@@ -1,7 +1,4 @@
-fails:Encoding::Converter.asciicompat_encoding accepts an encoding name as a String argument
fails:Encoding::Converter.asciicompat_encoding coerces non-String/Encoding objects with #to_str
fails:Encoding::Converter.asciicompat_encoding accepts an Encoding object as an argument
fails:Encoding::Converter.asciicompat_encoding returns a corresponding ASCII compatible encoding for ASCII-incompatible encodings
-fails:Encoding::Converter.asciicompat_encoding returns nil when the given encoding is ASCII compatible
fails:Encoding::Converter.asciicompat_encoding handles encoding names who resolve to nil encodings
-fails:Encoding::Converter.asciicompat_encoding returns nil if called with an encoding it returned previously
Index: spec/frozen/tags/macruby/core/encoding/converter/constants_tags.txt
deleted file mode 100644
===================================================================
--- spec/frozen/tags/macruby/core/encoding/converter/constants_tags.txt (revision 4140)
+++ /dev/null (working copy)
@@ -1,26 +0,0 @@
-fails:Encoding::Converter::INVALID_MASK exists
-fails:Encoding::Converter::INVALID_MASK has a Fixnum value
-fails:Encoding::Converter::INVALID_REPLACE exists
-fails:Encoding::Converter::INVALID_REPLACE has a Fixnum value
-fails:Encoding::Converter::UNDEF_MASK exists
-fails:Encoding::Converter::UNDEF_MASK has a Fixnum value
-fails:Encoding::Converter::UNDEF_REPLACE exists
-fails:Encoding::Converter::UNDEF_REPLACE has a Fixnum value
-fails:Encoding::Converter::UNDEF_HEX_CHARREF exists
-fails:Encoding::Converter::UNDEF_HEX_CHARREF has a Fixnum value
-fails:Encoding::Converter::PARTIAL_INPUT exists
-fails:Encoding::Converter::PARTIAL_INPUT has a Fixnum value
-fails:Encoding::Converter::AFTER_OUTPUT exists
-fails:Encoding::Converter::AFTER_OUTPUT has a Fixnum value
-fails:Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR exists
-fails:Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR has a Fixnum value
-fails:Encoding::Converter::CRLF_NEWLINE_DECORATOR exists
-fails:Encoding::Converter::CRLF_NEWLINE_DECORATOR has a Fixnum value
-fails:Encoding::Converter::CR_NEWLINE_DECORATOR exists
-fails:Encoding::Converter::CR_NEWLINE_DECORATOR has a Fixnum value
-fails:Encoding::Converter::XML_TEXT_DECORATOR exists
-fails:Encoding::Converter::XML_TEXT_DECORATOR has a Fixnum value
-fails:Encoding::Converter::XML_ATTR_CONTENT_DECORATOR exists
-fails:Encoding::Converter::XML_ATTR_CONTENT_DECORATOR has a Fixnum value
-fails:Encoding::Converter::XML_ATTR_QUOTE_DECORATOR exists
-fails:Encoding::Converter::XML_ATTR_QUOTE_DECORATOR has a Fixnum value
Index: spec/frozen/tags/macruby/core/encoding/converter/convert_tags.txt
===================================================================
--- spec/frozen/tags/macruby/core/encoding/converter/convert_tags.txt (revision 4140)
+++ spec/frozen/tags/macruby/core/encoding/converter/convert_tags.txt (working copy)
@@ -1,7 +1,2 @@
-fails:Encoding::Converter#convert returns a String
-fails:Encoding::Converter#convert sets the encoding of the result to the target encoding
-fails:Encoding::Converter#convert transcodes the given String to the target encoding
fails:Encoding::Converter#convert allows Strings of different encodings to the source encoding
-fails:Encoding::Converter#convert reuses the given encoding pair if called multiple times
-fails:Encoding::Converter#convert raises UndefinedConversionError if the String contains characters invalid for the target encoding
-fails:Encoding::Converter#convert raises an ArgumentError if called on a finished stream
+
Index: spec/frozen/tags/macruby/core/encoding/converter/convpath_tags.txt
===================================================================
--- spec/frozen/tags/macruby/core/encoding/converter/convpath_tags.txt (revision 4140)
+++ spec/frozen/tags/macruby/core/encoding/converter/convpath_tags.txt (working copy)
@@ -1,7 +1,2 @@
-fails:Encoding::Converter#convpath returns an Array
-fails:Encoding::Converter#convpath returns each encoding pair as a sub-Array
-fails:Encoding::Converter#convpath returns each encoding as an Encoding object
fails:Encoding::Converter#convpath returns multiple encoding pairs when direct conversion is impossible
-fails:Encoding::Converter#convpath sets the last element of each pair to the first element of the next
-fails:Encoding::Converter#convpath only lists a source encoding once
fails:Encoding::Converter#convpath indicates if crlf_newline conversion would occur
Index: spec/frozen/tags/macruby/core/encoding/converter/destination_encoding_tags.txt
===================================================================
--- spec/frozen/tags/macruby/core/encoding/converter/destination_encoding_tags.txt (revision 4140)
+++ spec/frozen/tags/macruby/core/encoding/converter/destination_encoding_tags.txt (working copy)
@@ -1 +1 @@
-fails:Encoding::Converter#destination_encoding returns the destination encoding as an Encoding object
+
Index: spec/frozen/tags/macruby/core/encoding/converter/replacement_tags.txt
===================================================================
--- spec/frozen/tags/macruby/core/encoding/converter/replacement_tags.txt (revision 4140)
+++ spec/frozen/tags/macruby/core/encoding/converter/replacement_tags.txt (working copy)
@@ -1,8 +1,3 @@
fails:Encoding::Converter#replacement returns '?' in US-ASCII when the destination encoding is not UTF-8
-fails:Encoding::Converter#replacement returns � when the destination encoding is UTF-8
-fails:Encoding::Converter#replacement= accepts a String argument
-fails:Encoding::Converter#replacement= accepts a String argument of arbitrary length
-fails:Encoding::Converter#replacement= raises an TypeError if assigned a non-String argument
-fails:Encoding::Converter#replacement= sets #replacement
fails:Encoding::Converter#replacement= raises an UndefinedConversionError is the argument cannot be converted into the destination encoding
fails:Encoding::Converter#replacement= does not change the replacement character if the argument cannot be converted into the destination encoding
Index: spec/frozen/tags/macruby/core/encoding/converter/search_convpath_tags.txt
===================================================================
--- spec/frozen/tags/macruby/core/encoding/converter/search_convpath_tags.txt (revision 4140)
+++ spec/frozen/tags/macruby/core/encoding/converter/search_convpath_tags.txt (working copy)
@@ -1,8 +1,3 @@
-fails:Encoding::Converter.search_convpath returns an Array
-fails:Encoding::Converter.search_convpath returns each encoding pair as a sub-Array
-fails:Encoding::Converter.search_convpath returns each encoding as an Encoding object
fails:Encoding::Converter.search_convpath returns multiple encoding pairs when direct conversion is impossible
-fails:Encoding::Converter.search_convpath sets the last element of each pair to the first element of the next
-fails:Encoding::Converter.search_convpath only lists a source encoding once
fails:Encoding::Converter.search_convpath indicates if crlf_newline conversion would occur
fails:Encoding::Converter.search_convpath raises an Encoding::ConverterNotFoundError if no conversion path exists
Index: spec/frozen/tags/macruby/core/encoding/converter/source_encoding_tags.txt
===================================================================
--- spec/frozen/tags/macruby/core/encoding/converter/source_encoding_tags.txt (revision 4140)
+++ spec/frozen/tags/macruby/core/encoding/converter/source_encoding_tags.txt (working copy)
@@ -1 +1 @@
-fails:Encoding::Converter#source_encoding returns the source encoding as an Encoding object
+
Index: string.c
===================================================================
--- string.c (revision 4140)
+++ string.c (working copy)
@@ -251,7 +251,7 @@ str_replace_with_bytes(rb_str_t *self, const char *bytes, long len,
}
}
-static void
+void
str_replace_with_string(rb_str_t *self, rb_str_t *source)
{
if (self == source) {
@@ -1118,7 +1118,7 @@ str_include_string(rb_str_t *self, rb_str_t *searched)
self->length_in_bytes, true) != -1;
}
-static rb_str_t *
+rb_str_t *
str_need_string(VALUE str)
{
switch (TYPE(str)) {
@@ -1247,24 +1247,6 @@ rstr_append(VALUE str, VALUE substr)
}
}
-enum {
- TRANSCODE_BEHAVIOR_RAISE_EXCEPTION,
- TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING,
- TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT,
- TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR
-};
-
-
-static rb_str_t *
-str_transcode(rb_str_t *self, rb_encoding_t *src_encoding, rb_encoding_t *dst_encoding,
- int behavior_for_invalid, int behavior_for_undefined, rb_str_t *replacement_str);
-static inline rb_str_t *
-str_simple_transcode(rb_str_t *self, rb_encoding_t *dst_encoding)
-{
- return str_transcode(self, self->encoding, dst_encoding,
- TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, NULL);
-}
-
static void inline
str_concat_ascii_cstr(rb_str_t *self, char *cstr)
{
@@ -1280,7 +1262,7 @@ str_concat_ascii_cstr(rb_str_t *self, char *cstr)
}
}
-static rb_str_t *
+rb_str_t *
str_transcode(rb_str_t *self, rb_encoding_t *src_encoding, rb_encoding_t *dst_encoding,
int behavior_for_invalid, int behavior_for_undefined, rb_str_t *replacement_str)
{
@@ -1844,165 +1826,6 @@ rstr_is_ascii_only(VALUE self, SEL sel)
return str_is_ruby_ascii_only(RSTR(self)) ? Qtrue : Qfalse;
}
-/*
- * call-seq:
- * str.encode(encoding [, options] ) => str
- * str.encode(dst_encoding, src_encoding [, options] ) => str
- * str.encode([options]) => str
- *
- * The first form returns a copy of <i>str</i> transcoded
- * to encoding +encoding+.
- * The second form returns a copy of <i>str</i> transcoded
- * from src_encoding to dst_encoding.
- * The last form returns a copy of <i>str</i> transcoded to
- * <code>Encoding.default_internal</code>.
- * By default, the first and second form raise
- * Encoding::UndefinedConversionError for characters that are
- * undefined in the destination encoding, and
- * Encoding::InvalidByteSequenceError for invalid byte sequences
- * in the source encoding. The last form by default does not raise
- * exceptions but uses replacement strings.
- * The <code>options</code> Hash gives details for conversion.
- *
- * === options
- * The hash <code>options</code> can have the following keys:
- * :invalid ::
- * If the value is <code>:replace</code>, <code>#encode</code> replaces
- * invalid byte sequences in <code>str</code> with the replacement character.
- * The default is to raise the exception
- * :undef ::
- * If the value is <code>:replace</code>, <code>#encode</code> replaces
- * characters which are undefined in the destination encoding with
- * the replacement character.
- * :replace ::
- * Sets the replacement string to the value. The default replacement
- * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
- * :xml ::
- * The value must be <code>:text</code> or <code>:attr</code>.
- * If the value is <code>:text</code> <code>#encode</code> replaces
- * undefined characters with their (upper-case hexadecimal) numeric
- * character references. '&', '<', and '>' are converted to "&amp;",
- * "&lt;", and "&gt;", respectively.
- * If the value is <code>:attr</code>, <code>#encode</code> also quotes
- * the replacement result (using '"'), and replaces '"' with "&quot;".
- */
-extern rb_encoding_t *default_internal;
-static VALUE
-rstr_encode(VALUE str, SEL sel, int argc, VALUE *argv)
-{
- VALUE opt = Qnil;
- if (argc > 0) {
- opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash");
- if (!NIL_P(opt)) {
- argc--;
- }
- }
-
- rb_str_t *self = RSTR(str);
- rb_str_t *replacement_str = NULL;
- rb_encoding_t *src_encoding, *dst_encoding;
- int behavior_for_invalid = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
- int behavior_for_undefined = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
- if (argc == 0) {
- src_encoding = self->encoding;
- dst_encoding = default_internal;
- behavior_for_invalid = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
- behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
- }
- else if (argc == 1) {
- src_encoding = self->encoding;
- dst_encoding = rb_to_encoding(argv[0]);
- }
- else if (argc == 2) {
- dst_encoding = rb_to_encoding(argv[0]);
- src_encoding = rb_to_encoding(argv[1]);
- }
- else {
- rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..2)", argc);
- }
-
- if (!NIL_P(opt)) {
- VALUE invalid_val = rb_hash_aref(opt, ID2SYM(rb_intern("invalid")));
- VALUE replace_sym = ID2SYM(rb_intern("replace"));
- if (invalid_val == replace_sym) {
- behavior_for_invalid = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
- }
- VALUE undefined_val = rb_hash_aref(opt, ID2SYM(rb_intern("undefined")));
- if (undefined_val == replace_sym) {
- behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
- }
- VALUE xml_val = rb_hash_aref(opt, ID2SYM(rb_intern("xml")));
- if (xml_val == ID2SYM(rb_intern("text"))) {
- behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT;
- }
- else if (xml_val == ID2SYM(rb_intern("attr"))) {
- behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR;
- }
-
- VALUE replacement = rb_hash_aref(opt, replace_sym);
- if (!NIL_P(replacement)) {
- replacement_str = str_need_string(replacement);
- if ((replacement_str->encoding != dst_encoding) && (replacement_str->length_in_bytes > 0)) {
- replacement_str = str_simple_transcode(replacement_str, dst_encoding);
- }
- if ((behavior_for_invalid != TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)
- && (behavior_for_undefined == TRANSCODE_BEHAVIOR_RAISE_EXCEPTION)) {
- behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
- }
- }
- }
-
- if ((replacement_str == NULL)
- && ((behavior_for_invalid == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)
- || (behavior_for_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING))) {
- if (dst_encoding == rb_encodings[ENCODING_UTF16BE]) {
- replacement_str = RSTR(rb_enc_str_new("\xFF\xFD", 2, dst_encoding));
- }
- else if (dst_encoding == rb_encodings[ENCODING_UTF32BE]) {
- replacement_str = RSTR(rb_enc_str_new("\0\0\xFF\xFD", 4, dst_encoding));
- }
- else if (dst_encoding == rb_encodings[ENCODING_UTF16LE]) {
- replacement_str = RSTR(rb_enc_str_new("\xFD\xFF", 2, dst_encoding));
- }
- else if (dst_encoding == rb_encodings[ENCODING_UTF32LE]) {
- replacement_str = RSTR(rb_enc_str_new("\xFD\xFF\0\0", 4, dst_encoding));
- }
- else if (dst_encoding == rb_encodings[ENCODING_UTF8]) {
- replacement_str = RSTR(rb_enc_str_new("\xEF\xBF\xBD", 3, dst_encoding));
- }
- else {
- replacement_str = RSTR(rb_enc_str_new("?", 1, rb_encodings[ENCODING_ASCII]));
- replacement_str = str_simple_transcode(replacement_str, dst_encoding);
- }
- }
-
- return (VALUE)str_transcode(self, src_encoding, dst_encoding,
- behavior_for_invalid, behavior_for_undefined, replacement_str);
-}
-
-/*
- * call-seq:
- * str.encode!(encoding [, options] ) => str
- * str.encode!(dst_encoding, src_encoding [, options] ) => str
- *
- * The first form transcodes the contents of <i>str</i> from
- * str.encoding to +encoding+.
- * The second form transcodes the contents of <i>str</i> from
- * src_encoding to dst_encoding.
- * The options Hash gives details for conversion. See String#encode
- * for details.
- * Returns the string even if no changes were made.
- */
-static VALUE
-rstr_encode_bang(VALUE str, SEL sel, int argc, VALUE *argv)
-{
- rstr_modify(str);
-
- VALUE new_str = rstr_encode(str, sel, argc, argv);
- str_replace_with_string(RSTR(str), RSTR(new_str));
- return str;
-}
-
/*
* call-seq:
@@ -5958,8 +5781,6 @@ Init_String(void)
rb_objc_define_method(rb_cRubyString, "partition", rstr_partition, 1);
rb_objc_define_method(rb_cRubyString, "rpartition", rstr_rpartition, 1);
rb_objc_define_method(rb_cRubyString, "crypt", rstr_crypt, 1);
- rb_objc_define_method(rb_cRubyString, "encode", rstr_encode, -1);
- rb_objc_define_method(rb_cRubyString, "encode!", rstr_encode_bang, -1);
// MacRuby extensions.
rb_objc_define_method(rb_cRubyString, "transform", rstr_transform, 1);
Index: transcode.c
new file mode 100644
===================================================================
--- /dev/null (revision 4140)
+++ transcode.c (working copy)
@@ -0,0 +1,450 @@
+/*
+ * MacRuby implementation of transcode.c.
+ *
+ * This file is covered by the Ruby license. See COPYING for more details.
+ *
+ * Copyright (C) 2007-2010, Apple Inc. All rights reserved.
+ * Copyright (C) 1993-2007 Yukihiro Matsumoto
+ * Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
+ * Copyright (C) 2000 Information-technology Promotion Agency, Japan
+ */
+
+// Notes:
+// AFAICT, we need to add support for newline decorators.
+
+#include "ruby.h"
+#include "ruby/encoding.h"
+#include "encoding.h"
+
+static VALUE sym_invalid;
+static VALUE sym_undef;
+static VALUE sym_replace;
+static VALUE sym_xml;
+static VALUE sym_text;
+static VALUE sym_attr;
+
+typedef struct rb_econv_s { // per-object state wrapped by rb_econv_alloc
+ rb_encoding_t *source; // encoding convert() transcodes from
+ rb_encoding_t *destination; // encoding convert() transcodes to
+ transcode_behavior_t invalid_sequence_behavior; // passed to str_transcode as behavior_for_invalid
+ transcode_behavior_t undefined_conversion_behavior; // passed to str_transcode as behavior_for_undefined
+ transcode_flags_t special_flags; // zeroed in rb_econv_alloc; not read anywhere in the visible code
+ rb_str_t *replacement; // replacement string handed to str_transcode; stored with GC_WB
+ bool finished; // set by finish; convert raises ArgumentError once it is true
+} rb_econv_t;
+
+VALUE rb_cEncodingConverter;
+
+// Unwraps the rb_econv_t stored inside a converter object.
+static rb_econv_t *
+RConverter(VALUE self)
+{
+    rb_econv_t *unwrapped;
+
+    Data_Get_Struct(self, rb_econv_t, unwrapped);
+    return unwrapped;
+}
+
+// Allocator for converter objects. Every field of the new rb_econv_t gets a
+// known default, so an instance whose #initialize has not run yet never
+// carries indeterminate behavior values.
+static VALUE
+rb_econv_alloc(VALUE klass, SEL sel)
+{
+    rb_econv_t *conv = ALLOC(rb_econv_t);
+    conv->source = NULL;
+    conv->destination = NULL;
+    // Raising is also the default that rb_econv_initialize starts from.
+    conv->invalid_sequence_behavior = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
+    conv->undefined_conversion_behavior = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
+    conv->special_flags = 0;
+    conv->replacement = NULL;
+    conv->finished = false;
+    // No mark or free callbacks are registered for the wrapped struct.
+    return Data_Wrap_Struct(klass, 0, 0, conv);
+}
+
+// Encoding::Converter.asciicompat_encoding(arg)
+// `arg` is either an Encoding object or something StringValue can turn into
+// a String naming an encoding.
+// Returns nil when the encoding cannot be found or is already
+// ASCII-compatible, and the UTF-8 encoding for UTF-16/UTF-32 encodings.
+// Any other ASCII-incompatible encoding raises ConverterNotFoundError.
+static VALUE
+rb_econv_asciicompat_encoding(VALUE klass, SEL sel, VALUE arg)
+{
+    rb_encoding_t *enc = NULL;
+
+    if (CLASS_OF(arg) != rb_cEncoding) {
+        // Treat the argument as an encoding name.
+        StringValue(arg);
+        enc = rb_enc_find(RSTRING_PTR(arg));
+    }
+    else {
+        enc = rb_to_encoding(arg);
+    }
+
+    if (enc == NULL || enc->ascii_compatible) {
+        return Qnil;
+    }
+    if (UTF16_ENC(enc) || UTF32_ENC(enc)) {
+        return (VALUE)rb_utf8_encoding();
+    }
+
+    // TODO: Port MRI's table that maps ASCII-incompatible encodings to compatible ones.
+    rb_raise(rb_eConverterNotFoundError, "could not find ASCII-compatible encoding for %s", enc->public_name);
+}
+
+static VALUE rb_econv_convpath(VALUE self, SEL sel);
+
+// Encoding::Converter.search_convpath(*args)
+// Instantiates a converter from the given arguments and returns the
+// conversion path that rb_econv_convpath reports for it.
+static VALUE
+rb_econv_search_convpath(VALUE klass, SEL sel, int argc, VALUE *argv)
+{
+    VALUE converter = rb_class_new_instance(argc, argv, klass);
+
+    return rb_econv_convpath(converter, sel);
+}
+
+// Translates one option value into a transcode behavior.
+//   sym_replace -> TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING
+//   sym_attr    -> TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR
+//   sym_text    -> TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT
+//   nil         -> `otherwise`
+// Any other value raises ArgumentError; `name` identifies the option in the
+// error message.
+static transcode_behavior_t
+symbol_option_with_default(VALUE given_symbol, transcode_behavior_t otherwise, const char *name)
+{
+    if (given_symbol == sym_replace) {
+        return TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
+    }
+    else if (given_symbol == sym_attr) {
+        return TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR;
+    }
+    else if (given_symbol == sym_text) {
+        return TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT;
+    }
+    else if (!NIL_P(given_symbol)) {
+        // The rejected value is normally a Symbol. StringValuePtr performs an
+        // implicit String conversion, which a Symbol (or any other object)
+        // is not guaranteed to support, so the conversion itself could fail
+        // before the ArgumentError is raised. Convert with to_s semantics
+        // instead; a String argument is reported unchanged.
+        VALUE shown = rb_obj_as_string(given_symbol);
+        rb_raise(rb_eArgError, "unknown value '%s' for option %s", RSTRING_PTR(shown), name);
+    }
+    return otherwise;
+}
+
+// Reads the conversion settings out of the `options` hash.
+//   *behavior_for_invalid   <- value stored under sym_invalid
+//   *behavior_for_undefined <- value stored under sym_undef, then sym_xml
+//   *replacement_str        <- value stored under sym_replace, transcoded to
+//                              `destination`; left untouched when absent
+// The values the caller placed in the two behavior out-parameters before the
+// call act as the defaults for options that are missing from the hash, so
+// both must be initialized by the caller. Unknown option values raise
+// ArgumentError (see symbol_option_with_default).
+static void
+parse_conversion_options(VALUE options, transcode_behavior_t *behavior_for_invalid,
+    transcode_behavior_t *behavior_for_undefined, rb_str_t **replacement_str, rb_encoding_t *destination)
+{
+    // Fall back to the caller's preset instead of unconditionally resetting
+    // to RAISE_EXCEPTION: String#encode without an encoding argument starts
+    // from "replace" and must keep that when the hash does not say otherwise.
+    *behavior_for_invalid = symbol_option_with_default(rb_hash_aref(options, sym_invalid),
+        *behavior_for_invalid, "invalid-character");
+
+    *behavior_for_undefined = symbol_option_with_default(rb_hash_aref(options, sym_undef),
+        *behavior_for_undefined, "undefined-conversion");
+
+    // Because the API conflates the :xml and :undef options, we pass in the previous setting
+    *behavior_for_undefined = symbol_option_with_default(rb_hash_aref(options, sym_xml),
+        *behavior_for_undefined, "xml-replacement");
+
+    VALUE replacement = rb_hash_aref(options, sym_replace);
+    if (!NIL_P(replacement)) {
+        *replacement_str = str_simple_transcode(str_need_string(replacement), destination);
+    }
+}
+
+// Encoding::Converter#initialize(source, destination [, options])
+// Stores the source and destination encodings, reads the error-handling
+// behaviors and replacement string from an options Hash when one is given,
+// and falls back to the destination encoding's default replacement string.
+// Raises NotImplementedError for a Fixnum `options` (flag form, not supported
+// yet) and ArgumentError for any other non-Hash, non-nil `options`.
+static VALUE
+rb_econv_initialize(VALUE self, SEL sel, int argc, VALUE *argv)
+{
+    rb_econv_t *conv = RConverter(self);
+    VALUE sourceobj, destobj, options;
+    rb_scan_args(argc, argv, "21", &sourceobj, &destobj, &options);
+
+    rb_encoding_t *source = rb_to_encoding(sourceobj);
+    rb_encoding_t *destination = rb_to_encoding(destobj);
+    rb_str_t *replacement_str = NULL;
+
+    conv->source = source;
+    conv->destination = destination;
+
+    conv->invalid_sequence_behavior = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
+    conv->undefined_conversion_behavior = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
+
+    // Extract the options. This is a hateful, hateful API.
+    if (!NIL_P(options)) {
+        if (FIXNUM_P(options)) {
+            // A value supplied by Ruby code must not abort the interpreter:
+            // report the missing feature as an exception, in line with the
+            // rb_f_notimplement stubs used for the other unimplemented methods.
+            rb_raise(rb_eNotImpError, "fixnum arguments are not supported yet.");
+        }
+        else if (TYPE(options) == T_HASH) {
+            parse_conversion_options(options, &conv->invalid_sequence_behavior,
+                &conv->undefined_conversion_behavior, &replacement_str, destination);
+        }
+        else {
+            rb_raise(rb_eArgError, "expected either a hash or a fixnum as the last parameter");
+        }
+    }
+
+    // Get the default replacement string. For UTF-[8, 16, 32] it's \uFFFD, and for others it's '?'
+    if (replacement_str == NULL) {
+        replacement_str = replacement_string_for_encoding(destination);
+    }
+    GC_WB(&conv->replacement, replacement_str);
+
+    return self;
+}
+
+// Encoding::Converter#inspect
+// Formats the converter as "#<ClassName: SOURCE to DESTINATION>" using the
+// public names of both encodings.
+static VALUE
+rb_econv_inspect(VALUE self, SEL sel)
+{
+    // TODO: make this comply with the MRI output when we add newline decorators
+    rb_econv_t *conv = RConverter(self);
+    const char *class_name = rb_obj_classname(self);
+    const char *from_name = conv->source->public_name;
+    const char *to_name = conv->destination->public_name;
+
+    return rb_sprintf("#<%s: %s to %s>", class_name, from_name, to_name);
+}
+
+// Encoding::Converter#convpath
+// MacRuby always converts through native UTF-16, so the path is
+//   [[source, native UTF-16], [native UTF-16, destination]]
+// with the first pair left out when the source already is native UTF-16.
+static VALUE
+rb_econv_convpath(VALUE self, SEL sel)
+{
+    rb_econv_t *conv = RConverter(self);
+    VALUE path = rb_ary_new2(2);
+    rb_encoding_t *pivot = rb_encodings[ENCODING_UTF16_NATIVE];
+
+    if (conv->source != pivot) {
+        VALUE into_pivot = rb_assoc_new((VALUE)conv->source, (VALUE)pivot);
+        rb_ary_push(path, into_pivot);
+    }
+
+    VALUE out_of_pivot = rb_assoc_new((VALUE)pivot, (VALUE)conv->destination);
+    rb_ary_push(path, out_of_pivot);
+
+    return path;
+}
+
+// Encoding::Converter#source_encoding: the encoding this converter reads.
+static VALUE
+rb_econv_source_encoding(VALUE self, SEL sel)
+{
+    rb_econv_t *conv = RConverter(self);
+
+    return (VALUE)conv->source;
+}
+
+// Encoding::Converter#destination_encoding: the encoding this converter produces.
+static VALUE
+rb_econv_destination_encoding(VALUE self, SEL sel)
+{
+    rb_econv_t *conv = RConverter(self);
+
+    return (VALUE)conv->destination;
+}
+
+// Since our converter is basically a black box at this point, we'll leave
+// the lower-level methods unimplemented.
+#define rb_econv_primitive_convert rb_f_notimplement
+
+// Encoding::Converter#convert(str)
+// Transcodes `str` from the converter's source encoding to its destination
+// encoding with the configured behaviors and replacement string.
+// Raises ArgumentError once finish has been called on this converter.
+static VALUE
+rb_econv_convert(VALUE self, SEL sel, VALUE str)
+{
+    rb_econv_t *conv = RConverter(self);
+
+    if (conv->finished) {
+        rb_raise(rb_eArgError, "convert() called on a finished stream");
+    }
+
+    // The stored replacement is expected to already be in the destination encoding.
+    assert(conv->replacement->encoding == conv->destination);
+
+    rb_str_t *input = str_need_string(str);
+    rb_str_t *converted = str_transcode(input, conv->source, conv->destination,
+        conv->invalid_sequence_behavior, conv->undefined_conversion_behavior,
+        conv->replacement);
+    return (VALUE)converted;
+}
+
+// Encoding::Converter#finish
+// Marks the converter as finished (later convert calls raise) and returns an
+// empty string.
+static VALUE
+rb_econv_finish(VALUE self, SEL sel)
+{
+    // TODO: Flesh this out later.
+    rb_econv_t *conv = RConverter(self);
+
+    conv->finished = true;
+    return rb_str_new2("");
+}
+
+#define rb_econv_primitive_errinfo rb_f_notimplement
+
+#define rb_econv_insert_output rb_f_notimplement
+
+#define rb_econv_putback rb_f_notimplement
+
+#define rb_econv_last_error rb_f_notimplement
+
+// Encoding::Converter#replacement: the replacement string currently stored
+// on the converter.
+static VALUE
+rb_econv_replacement(VALUE self, SEL sel)
+{
+    rb_econv_t *conv = RConverter(self);
+
+    return (VALUE)conv->replacement;
+}
+
+// Encoding::Converter#replacement=(str)
+// Stores `str`, converted to the destination encoding, as the replacement
+// string. Raises TypeError when `str` is not a String. Any error raised by
+// the conversion propagates before the stored replacement is touched.
+// Returns `str`.
+static VALUE
+rb_econv_set_replacement(VALUE self, SEL sel, VALUE str)
+{
+    rb_econv_t *conv = RConverter(self);
+    if (TYPE(str) != T_STRING) {
+        rb_raise(rb_eTypeError, "wrong argument type %s (expected String)", rb_obj_classname(str));
+    }
+
+    // Transcode into the destination encoding, exactly as the :replace
+    // option is handled in parse_conversion_options, instead of relabelling
+    // the caller's string in place with rb_str_force_encoding: that mutated
+    // the argument and could leave bytes that are not valid in the
+    // destination encoding.
+    // NOTE(review): assumes str_transcode returns a fresh string; if it can
+    // return its input unchanged, the caller's string is still shared here.
+    rb_str_t *converted = str_simple_transcode(str_need_string(str), conv->destination);
+    GC_WB(&conv->replacement, converted);
+    return str;
+}
+
+/*
+ * call-seq:
+ * str.encode(encoding [, options] ) => str
+ * str.encode(dst_encoding, src_encoding [, options] ) => str
+ * str.encode([options]) => str
+ *
+ * The first form returns a copy of <i>str</i> transcoded
+ * to encoding +encoding+.
+ * The second form returns a copy of <i>str</i> transcoded
+ * from src_encoding to dst_encoding.
+ * The last form returns a copy of <i>str</i> transcoded to
+ * <code>Encoding.default_internal</code>.
+ * By default, the first and second form raise
+ * Encoding::UndefinedConversionError for characters that are
+ * undefined in the destination encoding, and
+ * Encoding::InvalidByteSequenceError for invalid byte sequences
+ * in the source encoding. The last form by default does not raise
+ * exceptions but uses replacement strings.
+ * The <code>options</code> Hash gives details for conversion.
+ *
+ * === options
+ * The hash <code>options</code> can have the following keys:
+ * :invalid ::
+ * If the value is <code>:replace</code>, <code>#encode</code> replaces
+ * invalid byte sequences in <code>str</code> with the replacement character.
+ * The default is to raise the exception
+ * :undef ::
+ * If the value is <code>:replace</code>, <code>#encode</code> replaces
+ * characters which are undefined in the destination encoding with
+ * the replacement character.
+ * :replace ::
+ * Sets the replacement string to the value. The default replacement
+ * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
+ * :xml ::
+ * The value must be <code>:text</code> or <code>:attr</code>.
+ * If the value is <code>:text</code> <code>#encode</code> replaces
+ * undefined characters with their (upper-case hexadecimal) numeric
+ * character references. '&', '<', and '>' are converted to "&amp;",
+ * "&lt;", and "&gt;", respectively.
+ * If the value is <code>:attr</code>, <code>#encode</code> also quotes
+ * the replacement result (using '"'), and replaces '"' with "&quot;".
+ */
+extern rb_encoding_t *default_internal;
+static VALUE
+rstr_encode(VALUE str, SEL sel, int argc, VALUE *argv)
+{
+    // A trailing argument that converts to a Hash is the options hash and
+    // does not count as an encoding argument.
+    VALUE opt = Qnil;
+    if (argc > 0) {
+        opt = rb_check_convert_type(argv[argc - 1], T_HASH, "Hash", "to_hash");
+        if (!NIL_P(opt)) {
+            argc--;
+        }
+    }
+
+    rb_str_t *self = RSTR(str);
+    rb_str_t *replacement_str = NULL;
+    rb_encoding_t *src_encoding, *dst_encoding;
+    transcode_behavior_t behavior_for_invalid = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
+    transcode_behavior_t behavior_for_undefined = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
+
+    switch (argc) {
+    case 0:
+        // No encoding given: target default_internal and replace rather than raise.
+        src_encoding = self->encoding;
+        dst_encoding = default_internal;
+        behavior_for_invalid = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
+        behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
+        break;
+    case 1:
+        src_encoding = self->encoding;
+        dst_encoding = rb_to_encoding(argv[0]);
+        break;
+    case 2:
+        dst_encoding = rb_to_encoding(argv[0]);
+        src_encoding = rb_to_encoding(argv[1]);
+        break;
+    default:
+        rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..2)", argc);
+    }
+
+    if (!NIL_P(opt)) {
+        parse_conversion_options(opt, &behavior_for_invalid, &behavior_for_undefined,
+            &replacement_str, dst_encoding);
+
+        // An explicit replacement string with nothing set up to use it
+        // switches the undefined-conversion behavior to replacement.
+        const bool replacement_unused =
+            (behavior_for_invalid != TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)
+            && (behavior_for_undefined == TRANSCODE_BEHAVIOR_RAISE_EXCEPTION);
+        if ((replacement_str != NULL) && replacement_unused) {
+            behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
+        }
+    }
+
+    // Replacement was requested but no string supplied: use the default for
+    // the destination encoding.
+    const bool wants_replacement =
+        (behavior_for_invalid == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)
+        || (behavior_for_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING);
+    if ((replacement_str == NULL) && wants_replacement) {
+        replacement_str = replacement_string_for_encoding(dst_encoding);
+    }
+
+    rb_str_t *transcoded = str_transcode(self, src_encoding, dst_encoding,
+        behavior_for_invalid, behavior_for_undefined, replacement_str);
+    return (VALUE)transcoded;
+}
+
+/*
+ * call-seq:
+ * str.encode!(encoding [, options] ) => str
+ * str.encode!(dst_encoding, src_encoding [, options] ) => str
+ *
+ * In-place counterpart of String#encode: the contents of <i>str</i>
+ * are replaced by the transcoded result.
+ * With a single +encoding+, the source encoding is str.encoding;
+ * with two arguments, text is transcoded from src_encoding to
+ * dst_encoding. See String#encode for the meaning of the options Hash.
+ * The receiver is returned even if no changes were made.
+ */
+static VALUE
+rstr_encode_bang(VALUE str, SEL sel, int argc, VALUE *argv)
+{
+    // NOTE(review): presumably rejects an unmodifiable receiver -- confirm.
+    rstr_modify(str);
+    VALUE transcoded = rstr_encode(str, sel, argc, argv);
+    // Replace the receiver's contents with the transcoded copy.
+    str_replace_with_string(RSTR(str), RSTR(transcoded));
+    return str;
+}
+// Registers the transcoding error classes, String#encode/#encode!, and Encoding::Converter.
+void
+Init_Transcode(void)
+{
+ rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
+ rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
+ rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
+ // String#encode and String#encode!; arity -1 matches their (argc, argv) signatures.
+ rb_objc_define_method(rb_cRubyString, "encode", rstr_encode, -1);
+ rb_objc_define_method(rb_cRubyString, "encode!", rstr_encode_bang, -1);
+ // NOTE(review): *(VALUE *)rb_cEncodingConverter is presumably the class's metaclass (class-level methods) -- confirm.
+ rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cObject);
+ rb_objc_define_method(*(VALUE *)rb_cEncodingConverter, "alloc", rb_econv_alloc, 0);
+ rb_objc_define_method(*(VALUE *)rb_cEncodingConverter, "asciicompat_encoding", rb_econv_asciicompat_encoding, 1);
+ rb_objc_define_method(*(VALUE *)rb_cEncodingConverter, "search_convpath", rb_econv_search_convpath, -1);
+ // Instance methods of Encoding::Converter.
+ rb_objc_define_method(rb_cEncodingConverter, "initialize", rb_econv_initialize, -1);
+ rb_objc_define_method(rb_cEncodingConverter, "inspect", rb_econv_inspect, 0);
+ rb_objc_define_method(rb_cEncodingConverter, "convpath", rb_econv_convpath, 0);
+ rb_objc_define_method(rb_cEncodingConverter, "source_encoding", rb_econv_source_encoding, 0);
+ rb_objc_define_method(rb_cEncodingConverter, "destination_encoding", rb_econv_destination_encoding, 0);
+ rb_objc_define_method(rb_cEncodingConverter, "primitive_convert", rb_econv_primitive_convert, -1);
+ rb_objc_define_method(rb_cEncodingConverter, "convert", rb_econv_convert, 1);
+ rb_objc_define_method(rb_cEncodingConverter, "finish", rb_econv_finish, 0);
+ rb_objc_define_method(rb_cEncodingConverter, "primitive_errinfo", rb_econv_primitive_errinfo, 0);
+ rb_objc_define_method(rb_cEncodingConverter, "insert_output", rb_econv_insert_output, 1);
+ rb_objc_define_method(rb_cEncodingConverter, "putback", rb_econv_putback, -1);
+ rb_objc_define_method(rb_cEncodingConverter, "last_error", rb_econv_last_error, 0);
+ rb_objc_define_method(rb_cEncodingConverter, "replacement", rb_econv_replacement, 0);
+ rb_objc_define_method(rb_cEncodingConverter, "replacement=", rb_econv_set_replacement, 1);
+ // Cache the Symbols :invalid, :undef, :replace, :attr, :text and :xml (the option names documented for String#encode).
+ sym_invalid = ID2SYM(rb_intern("invalid"));
+ sym_undef = ID2SYM(rb_intern("undef"));
+ sym_replace = ID2SYM(rb_intern("replace"));
+ sym_attr = ID2SYM(rb_intern("attr"));
+ sym_text = ID2SYM(rb_intern("text"));
+ sym_xml = ID2SYM(rb_intern("xml"));
+
+ // Flag constants exported on Encoding::Converter. If only these mapped to the internal enums...
+ rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
+ rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
+ rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
+ rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
+ rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
+ rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
+ rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
+ rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
+ rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
+ rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
+ rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
+ rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
+ rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
+ // Compiled out by #if 0: the error-class accessors and Init_newline() below are not registered.
+#if 0
+ rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
+ rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
+ rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
+ rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
+ rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
+
+ rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
+ rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
+ rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
+ rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
+ rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
+ rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
+ rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
+
+ Init_newline();
+#endif
+}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment