ruby-changes:7125

akr	2008-08-15 18:12:56 +0900 (Fri, 15 Aug 2008)

  New Revision: 18643

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=rev&revision=18643

  Log:
    * include/ruby/encoding.h (rb_econv_t): new field: last_error.
    
    * transcode.c (rb_trans_conv): new argument: result_position_ptr.
      (rb_econv_convert): fill last_error.
      (econv_result_to_symbol): extracted from econv_primitive_convert.
      (econv_primitive_errinfo): new method.

  Modified files:
    trunk/ChangeLog
    trunk/include/ruby/encoding.h
    trunk/test/ruby/test_econv.rb
    trunk/transcode.c

Index: include/ruby/encoding.h
===================================================================
--- include/ruby/encoding.h	(revision 18642)
+++ include/ruby/encoding.h	(revision 18643)
@@ -224,6 +224,17 @@
     int num_finished;
     struct rb_transcoding *last_tc;
 
+    /* last error */
+    struct {
+        rb_econv_result_t result;
+        const char *source_encoding;
+        const char *destination_encoding;
+        const unsigned char *error_bytes_start;
+        size_t error_bytes_len;
+        size_t readagain_len;
+        int partial_input;
+    } last_error;
+
     /* The following fields are only for Encoding::Converter.
      * rb_econv_open set them NULL. */
     rb_encoding *source_encoding;
Index: ChangeLog
===================================================================
--- ChangeLog	(revision 18642)
+++ ChangeLog	(revision 18643)
@@ -1,3 +1,12 @@
+Fri Aug 15 18:10:49 2008  Tanaka Akira  <akr@f...>
+
+	* include/ruby/encoding.h (rb_econv_t): new field: last_error.
+
+	* transcode.c (rb_trans_conv): new argument: result_position_ptr.
+	  (rb_econv_convert): fill last_error.
+	  (econv_result_to_symbol): extracted from econv_primitive_convert.
+	  (econv_primitive_errinfo): new method.
+
 Fri Aug 15 17:39:05 2008  Nobuyoshi Nakada  <nobu@r...>
 
 	* common.mk (parse.{c,h}): creates in the dependency order.
Index: test/ruby/test_econv.rb
===================================================================
--- test/ruby/test_econv.rb	(revision 18642)
+++ test/ruby/test_econv.rb	(revision 18643)
@@ -20,6 +20,14 @@
                  [o,            ret,           i])
   end
 
+  def assert_errinfo(e_res, e_enc1, e_enc2, e_error_bytes, e_readagain_bytes, e_partial_input, ec)
+    assert_equal([e_res, e_enc1, e_enc2,
+                  e_error_bytes && e_error_bytes.dup.force_encoding("ASCII-8BIT"), 
+                  e_readagain_bytes && e_readagain_bytes.dup.force_encoding("ASCII-8BIT"), 
+                  e_partial_input],
+                 ec.primitive_errinfo)
+  end
+
   def test_new
     assert_kind_of(Encoding::Converter, Encoding::Converter.new("UTF-8", "EUC-JP"))
     assert_kind_of(Encoding::Converter, Encoding::Converter.new(Encoding::UTF_8, Encoding::EUC_JP))
@@ -319,4 +327,42 @@
     check_ec("abc\xA4\xA2def",  "", :output_followed_by_input, *a)
     check_ec("abc\xA4\xA2def",  "", :finished, *a)
   end
+
+  def test_errinfo_invalid_euc_jp
+    ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
+    ec.primitive_convert(src="\xff", dst="", nil, 10)                       
+    assert_errinfo(:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", "", nil, ec)
+  end
+
+  def test_errinfo_undefined_hiragana
+    ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
+    ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
+    assert_errinfo(:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", "", nil, ec)
+  end
+
+  def test_errinfo_invalid_partial_character
+    ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
+    ec.primitive_convert(src="\xa4", dst="", nil, 10)
+    assert_errinfo(:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xA4", "", nil, ec)
+  end
+
+  def test_errinfo_valid_partial_character
+    ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
+    ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
+    assert_errinfo(:source_buffer_empty, nil, nil, nil, nil, :partial_input, ec)
+  end
+
+  def test_errinfo_invalid_utf16be
+    ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
+    ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
+    assert_errinfo(:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00", nil, ec)
+    assert_equal("@", src)
+  end
+
+  def test_errinfo_invalid_utf16le
+    ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
+    ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
+    assert_errinfo(:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00", nil, ec)
+    assert_equal("", src)
+  end
 end
Index: transcode.c
===================================================================
--- transcode.c	(revision 18642)
+++ transcode.c	(revision 18643)
@@ -839,7 +839,8 @@
 rb_trans_conv(rb_econv_t *ec,
     const unsigned char **input_ptr, const unsigned char *input_stop,
     unsigned char **output_ptr, unsigned char *output_stop,
-    int flags)
+    int flags,
+    int *result_position_ptr)
 {
     int i;
     int needreport_index;
@@ -888,7 +889,8 @@
         rb_econv_result_t res;
 
         res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
-                (flags & ~ECONV_OUTPUT_FOLLOWED_BY_INPUT)|ECONV_PARTIAL_INPUT);
+                (flags & ~ECONV_OUTPUT_FOLLOWED_BY_INPUT)|ECONV_PARTIAL_INPUT,
+                result_position_ptr);
 
         if (res == econv_source_buffer_empty)
             return econv_output_followed_by_input;
@@ -912,9 +914,13 @@
                 res == econv_output_followed_by_input) {
                 ec->elems[i].last_result = econv_source_buffer_empty;
             }
+            if (result_position_ptr)
+                *result_position_ptr = i;
             return res;
         }
     }
+    if (result_position_ptr)
+        *result_position_ptr = -1;
     return econv_source_buffer_empty;
 }
 
@@ -925,15 +931,32 @@
     int flags)
 {
     rb_econv_result_t res;
+    int result_position;
 
+    memset(&ec->last_error, 0, sizeof(ec->last_error));
+
     if ((flags & ECONV_OUTPUT_FOLLOWED_BY_INPUT) ||
-        ec->num_trans == 1)
-        return rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
+        ec->num_trans == 1) {
+        res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
+    }
+    else {
+        flags |= ECONV_OUTPUT_FOLLOWED_BY_INPUT;
+        do {
+            res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
+        } while (res == econv_output_followed_by_input);
+    }
 
-    flags |= ECONV_OUTPUT_FOLLOWED_BY_INPUT;
-    do {
-        res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
-    } while (res == econv_output_followed_by_input);
+    ec->last_error.result = res;
+    ec->last_error.partial_input = flags & ECONV_PARTIAL_INPUT;
+    if (res == econv_invalid_byte_sequence ||
+        res == econv_undefined_conversion) {
+        ec->last_error.source_encoding = ec->elems[result_position].tc->transcoder->from_encoding;
+        ec->last_error.destination_encoding = ec->elems[result_position].tc->transcoder->to_encoding;
+        ec->last_error.error_bytes_start = TRANSCODING_READBUF(ec->elems[result_position].tc);
+        ec->last_error.error_bytes_len = ec->elems[result_position].tc->recognized_len;
+        ec->last_error.readagain_len = ec->elems[result_position].tc->readagain_len;
+    }
+
     return res;
 }
 
@@ -1572,6 +1595,20 @@
     return rb_enc_from_encoding(ec->destination_encoding);
 }
 
+static VALUE
+econv_result_to_symbol(rb_econv_result_t res)
+{
+    switch (res) {
+      case econv_invalid_byte_sequence: return ID2SYM(rb_intern("invalid_byte_sequence"));
+      case econv_undefined_conversion: return ID2SYM(rb_intern("undefined_conversion"));
+      case econv_destination_buffer_full: return ID2SYM(rb_intern("destination_buffer_full"));
+      case econv_source_buffer_empty: return ID2SYM(rb_intern("source_buffer_empty"));
+      case econv_finished: return ID2SYM(rb_intern("finished"));
+      case econv_output_followed_by_input: return ID2SYM(rb_intern("output_followed_by_input"));
+      default: return INT2NUM(res); /* should not be reached */
+    }
+}
+
 /*
  * call-seq:
  *   primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
@@ -1704,15 +1741,113 @@
         rb_enc_associate(output, ec->destination_encoding);
     }
 
-    switch (res) {
-      case econv_invalid_byte_sequence: return ID2SYM(rb_intern("invalid_byte_sequence"));
-      case econv_undefined_conversion: return ID2SYM(rb_intern("undefined_conversion"));
-      case econv_destination_buffer_full: return ID2SYM(rb_intern("destination_buffer_full"));
-      case econv_source_buffer_empty: return ID2SYM(rb_intern("source_buffer_empty"));
-      case econv_finished: return ID2SYM(rb_intern("finished"));
-      case econv_output_followed_by_input: return ID2SYM(rb_intern("output_followed_by_input"));
-      default: return INT2NUM(res); /* should not be reached */
+    return econv_result_to_symbol(res);
+}
+
+/*
+ * call-seq:
+ *   primitive_errinfo -> array
+ *
+ * primitive_errinfo returns important information about the last error result
+ * as a 6-element array:
+ *
+ *   [result, enc1, enc2, error_bytes, readagain_bytes, partial_input]
+ *
+ * result is the last result of primitive_convert.
+ *
+ * partial_input is :partial_input or nil.
+ * :partial_input means that Encoding::Converter::PARTIAL_INPUT is specified
+ * for primitive_convert.
+ *
+ * Other elements are only meaningful when result is
+ * :invalid_byte_sequence or :undefined_conversion.
+ *
+ * enc1 and enc2 indicate a conversion step as a pair of strings.
+ * For example, EUC-JP to ISO-8859-1 is
+ * converted as EUC-JP -> UTF-8 -> ISO-8859-1.
+ * So [enc1, enc2] is ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
+ *
+ * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
+ * error_bytes is the discarded portion.
+ * readagain_bytes is the buffered portion which is read again on the next conversion.
+ *
+ * Example:
+ *
+ *   # \xff is invalid as EUC-JP.
+ *   ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
+ *   ec.primitive_convert(src="\xff", dst="", nil, 10)                       
+ *   p ec.primitive_errinfo
+ *   #=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", "", nil]
+ *
+ *   # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
+ *   # Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
+ *   # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
+ *   ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
+ *   ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
+ *   p ec.primitive_errinfo
+ *   #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", "", nil]
+ *
+ *   # partial character is invalid
+ *   ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
+ *   ec.primitive_convert(src="\xa4", dst="", nil, 10)
+ *   p ec.primitive_errinfo
+ *   #=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xA4", "", nil]
+ *
+ *   # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
+ *   # partial characters.
+ *   ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
+ *   ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)                 
+ *   p ec.primitive_errinfo
+ *   #=> [:source_buffer_empty, nil, nil, nil, nil, :partial_input]
+ *
+ *   # \xd8\x00\x00@ is invalid as UTF-16BE because
+ *   # there is no low surrogate after the high surrogate (\xd8\x00).
+ *   # It is detected by the 3rd byte (\x00), which is part of the next character.
+ *   # So the high surrogate (\xd8\x00) is discarded and
+ *   # the 3rd byte is read again later.
+ *   # Since the byte is buffered in ec, it is dropped from src.
+ *   ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
+ *   ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
+ *   p ec.primitive_errinfo
+ *   #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00", nil]
+ *   p src
+ *   #=> "@"
+ *
+ *   # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
+ *   # The problem is detected by 4th byte.
+ *   ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
+ *   ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
+ *   p ec.primitive_errinfo
+ *   #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00", nil]
+ *   p src
+ *   #=> ""
+ *
+ */
+static VALUE
+econv_primitive_errinfo(VALUE self)
+{
+    rb_econv_t *ec = check_econv(self);
+
+    VALUE ary;
+
+    ary = rb_ary_new2(6);
+
+    rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
+
+    if (ec->last_error.source_encoding)
+        rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
+
+    if (ec->last_error.destination_encoding)
+        rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
+
+    if (ec->last_error.error_bytes_start) {
+        rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
+        rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
     }
+
+    rb_ary_store(ary, 5, ec->last_error.partial_input ? ID2SYM(rb_intern("partial_input")) : Qnil);
+
+    return ary;
 }
 
 void
@@ -1738,6 +1873,7 @@
     rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
     rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
     rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
+    rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
     rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
     rb_define_const(rb_cEncodingConverter, "OUTPUT_FOLLOWED_BY_INPUT", INT2FIX(ECONV_OUTPUT_FOLLOWED_BY_INPUT));
     rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECODER", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECODER));

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/