ruby-changes:7641

akr	2008-09-06 05:24:18 +0900 (Sat, 06 Sep 2008)

  New Revision: 19162

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=rev&revision=19162

  Log:
    * include/ruby/encoding.h (ECONV_UNDEF_HEX_CHARREF): defined.
    
    * transcode.c (output_hex_charref): new function.
      (rb_econv_convert): call output_hex_charref if
      ECONV_UNDEF_HEX_CHARREF.
      (Init_transcode): Encoding::Converter::UNDEF_HEX_CHARREF added.

  Modified files:
    trunk/ChangeLog
    trunk/include/ruby/encoding.h
    trunk/test/ruby/test_econv.rb
    trunk/transcode.c

Index: include/ruby/encoding.h
===================================================================
--- include/ruby/encoding.h	(revision 19161)
+++ include/ruby/encoding.h	(revision 19162)
@@ -255,6 +255,7 @@
 
 #define ECONV_UNDEF_MASK                        0x00f0
 #define ECONV_UNDEF_REPLACE                     0x0020
+#define ECONV_UNDEF_HEX_CHARREF                 0x0030
 
 /* effective only if output is ascii compatible */
 #define ECONV_UNIVERSAL_NEWLINE_DECODER         0x0100
Index: ChangeLog
===================================================================
--- ChangeLog	(revision 19161)
+++ ChangeLog	(revision 19162)
@@ -1,3 +1,12 @@
+Sat Sep  6 05:22:29 2008  Tanaka Akira  <akr@f...>
+
+	* include/ruby/encoding.h (ECONV_UNDEF_HEX_CHARREF): defined.
+
+	* transcode.c (output_hex_charref): new function.
+	  (rb_econv_convert): call output_hex_charref if
+	  ECONV_UNDEF_HEX_CHARREF.
+	  (Init_transcode): Encoding::Converter::UNDEF_HEX_CHARREF added.
+	  
 Sat Sep  6 03:52:47 2008  Tanaka Akira  <akr@f...>
 
 	* transcode.c (rb_econv_convert): use ECONV_INVALID_MASK and
Index: test/ruby/test_econv.rb
===================================================================
--- test/ruby/test_econv.rb	(revision 19161)
+++ test/ruby/test_econv.rb	(revision 19162)
@@ -670,4 +670,19 @@
     ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace, :replace => "X")  
     assert_equal("a X b", ec.convert("a \u3042 b"))
   end
+
+  def test_hex_charref
+    ec = Encoding::Converter.new("UTF-8", "US-ASCII", Encoding::Converter::UNDEF_HEX_CHARREF)
+    assert_equal("&#x3042;", ec.convert("\u3042"))
+
+    ec = Encoding::Converter.new("UTF-8", "EUC-JP", Encoding::Converter::UNDEF_HEX_CHARREF)
+    assert_equal("\xa4\xcf\xa4\xa1\xa4\xa4&#x2665;\xa1\xa3".force_encoding("euc-jp"),
+      ec.convert("\u{306f 3041 3044 2665 3002}"))
+
+    ec = Encoding::Converter.new("UTF-8", "ISO-2022-JP", Encoding::Converter::UNDEF_HEX_CHARREF)
+    assert_equal("\e$B$O$!$$\e(B&#x2665;\e$B!#".force_encoding("ISO-2022-JP"),
+      ec.convert("\u{306f 3041 3044 2665 3002}"))
+    assert_equal("\e(B".force_encoding("ISO-2022-JP"),
+      ec.finish)
+  end
 end
Index: transcode.c
===================================================================
--- transcode.c	(revision 19161)
+++ transcode.c	(revision 19162)
@@ -34,6 +34,12 @@
 static VALUE sym_output_followed_by_input;
 static VALUE sym_incomplete_input;
 
+static unsigned char *
+allocate_converted_string(const char *sname, const char *dname,
+        const unsigned char *str, size_t len,
+        unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
+        size_t *dst_len_ptr);
+
 /* dynamic structure, one per conversion (similar to iconv_t) */
 /* may carry conversion state (e.g. for iso-2022-jp) */
 typedef struct rb_transcoding {
@@ -1261,6 +1267,62 @@
 
 static int output_replacement_character(rb_econv_t *ec);
 
+static int
+output_hex_charref(rb_econv_t *ec)
+{
+    int ret;
+    unsigned char utfbuf[1024];
+    const unsigned char *utf;
+    size_t utf_len;
+    int utf_allocated = 0;
+    char charef_buf[16];
+    const unsigned char *p;
+
+    if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
+        utf = ec->last_error.error_bytes_start;
+        utf_len = ec->last_error.error_bytes_len;
+    }
+    else {
+        utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
+                ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
+                utfbuf, sizeof(utfbuf),
+                &utf_len);
+        if (!utf)
+            return -1;
+        if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
+            utf_allocated = 1;
+    }
+
+    if (utf_len % 4 != 0)
+        goto fail;
+
+    p = utf;
+    while (4 <= utf_len) {
+        unsigned int u = 0;
+        u += p[0] << 24;
+        u += p[1] << 16;
+        u += p[2] << 8;
+        u += p[3];
+        snprintf(charef_buf, sizeof(charef_buf), "&#x%x;", u);
+
+        ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
+        if (ret == -1)
+            goto fail;
+
+        p += 4;
+        utf_len -= 4;
+    }
+
+    if (utf_allocated)
+        xfree((void *)utf);
+    return 0;
+
+  fail:
+    if (utf_allocated)
+        xfree((void *)utf);
+    return -1;
+}
+
 rb_econv_result_t
 rb_econv_convert(rb_econv_t *ec,
     const unsigned char **input_ptr, const unsigned char *input_stop,
@@ -1305,6 +1367,11 @@
 	    if (output_replacement_character(ec) == 0)
                 goto resume;
             break;
+
+          case ECONV_UNDEF_HEX_CHARREF:
+            if (output_hex_charref(ec) == 0)
+                goto resume;
+            break;
         }
     }
 
@@ -3424,10 +3491,12 @@
     rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
     rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
     rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
+
     rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
     rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
     rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
     rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
+    rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
     rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
     rb_define_const(rb_cEncodingConverter, "OUTPUT_FOLLOWED_BY_INPUT", INT2FIX(ECONV_OUTPUT_FOLLOWED_BY_INPUT));
     rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECODER", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECODER));

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/