ruby-changes:7038

akr	2008-08-13 14:30:42 +0900 (Wed, 13 Aug 2008)

  New Revision: 18556

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=rev&revision=18556

  Log:
    * enc/trans/newline.trans: new file.
    
    * transcode_data.h (rb_trans_t): add last_tc field.
    
    * transcode.c (UNIVERSAL_NEWLINE): defined.
      (CRLF_NEWLINE): defined.
      (CR_NEWLINE): defined.
      (rb_trans_open_by_transcoder_entries): initialize last_tc.
      (trans_open_i): allocate one more room for newline converter.
      (rb_trans_open): universal newline implemented.
      (more_output_buffer): take max_output argument instead of ts.
      (output_replacement_character): take tc argument instead of ts.
      (transcode_loop): use last_tc field.
      (econv_init): add flags argument for rb_trans_open.
      (Init_transcode): Encoding::Converter::UNIVERSAL_NEWLINE defined.

  Added files:
    trunk/enc/trans/newline.trans
  Modified files:
    trunk/ChangeLog
    trunk/test/ruby/test_econv.rb
    trunk/transcode.c
    trunk/transcode_data.h

Index: ChangeLog
===================================================================
--- ChangeLog	(revision 18555)
+++ ChangeLog	(revision 18556)
@@ -1,3 +1,21 @@
+Wed Aug 13 14:22:16 2008  Tanaka Akira  <akr@f...>
+
+	* enc/trans/newline.trans: new file.
+
+	* transcode_data.h (rb_trans_t): add last_tc field.
+
+	* transcode.c (UNIVERSAL_NEWLINE): defined.
+	  (CRLF_NEWLINE): defined.
+	  (CR_NEWLINE): defined.
+	  (rb_trans_open_by_transcoder_entries): initialize last_tc.
+	  (trans_open_i): allocate one more room for newline converter.
+	  (rb_trans_open): universal newline implemented.
+	  (more_output_buffer): take max_output argument instead of ts.
+	  (output_replacement_character): take tc argument instead of ts.
+	  (transcode_loop): use last_tc field.
+	  (econv_init): add flags argument for rb_trans_open.
+	  (Init_transcode): Encoding::Converter::UNIVERSAL_NEWLINE defined.
+
 Wed Aug 13 14:00:19 2008  Nobuyoshi Nakada  <nobu@r...>
 
 	* common.mk (parse.c): generates parse.h together.
Index: enc/trans/newline.trans
===================================================================
--- enc/trans/newline.trans	(revision 0)
+++ enc/trans/newline.trans	(revision 18556)
@@ -0,0 +1,56 @@
+#include "transcode_data.h"
+
+<%
+  map_normalize = {}
+  map_normalize["{00-ff}"] = :func_so # presumably "{00-ff}" covers every byte value, so all input reaches fun_so_universal_newline -- confirm
+%>
+
+<%= transcode_generate_node(ActionMap.parse(map_normalize), "universal_newline") %>
+
+static int
+fun_so_universal_newline(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o)
+{
+    int len; /* number of bytes stored into o; this is the return value (0 or 1) */
+    /* Normalizes the single input byte s[0]: "\r\n", a lone '\r' and a lone '\n' each yield one '\n'; l is not read.
+      t->stateful[0] == 0       : normal
+      t->stateful[0] == 1       : just after '\r'
+    */
+    if (s[0] == '\n') {
+        if (t->stateful[0] == 0) { /* standalone '\n': pass it through */
+            o[0] = '\n';
+            len = 1;
+        }
+        else { /* second half of "\r\n": the '\r' branch already wrote '\n', so write nothing */
+            len = 0;
+        }
+        t->stateful[0] = 0;
+    }
+    else if (s[0] == '\r') { /* '\r' is written as '\n' at once; the state lets a directly following '\n' be dropped */
+        o[0] = '\n';
+        len = 1;
+        t->stateful[0] = 1;
+    }
+    else { /* any other byte is copied unchanged and clears the "after '\r'" state */
+        o[0] = s[0];
+        len = 1;
+        t->stateful[0] = 0;
+    }
+    return len;
+}
+
+static const rb_transcoder /* descriptor passed to rb_register_transcoder() by Init_newline() */
+rb_universal_newline = {
+    "universal_newline", "", &universal_newline, /* same name pair that transcode.c passes to get_transcoder_entry() */
+    1, /* input_unit_length */
+    1, /* max_input */
+    1, /* max_output */
+    NULL, NULL, NULL, fun_so_universal_newline /* only the last handler slot is set (presumably the func_so one -- confirm against rb_transcoder) */
+};
+
+
+void
+Init_newline(void)
+{
+    rb_register_transcoder(&rb_universal_newline); /* registers the rb_universal_newline descriptor defined in this file */
+}
+
Index: transcode_data.h
===================================================================
--- transcode_data.h	(revision 18555)
+++ transcode_data.h	(revision 18556)
@@ -122,6 +122,7 @@
     rb_trans_elem_t *elems;
     int num_trans;
     int num_finished;
+    rb_transcoding *last_tc;
 } rb_trans_t;
 
 void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib);
Index: test/ruby/test_econv.rb
===================================================================
--- test/ruby/test_econv.rb	(revision 18555)
+++ test/ruby/test_econv.rb	(revision 18556)
@@ -4,7 +4,7 @@
   def assert_econv(ret_expected, dst_expected, src_expected, to, from, src, opt={})
     opt[:obuf_len] ||= 100
     src = src.dup
-    ec = Encoding::Converter.new(from, to)
+    ec = Encoding::Converter.new(from, to, 0)
     dst = ''
     while true
       ret = ec.primitive_convert(src, dst2="", opt[:obuf_len], 0)
@@ -35,7 +35,7 @@
   end
 
   def test_errors
-    ec = Encoding::Converter.new("UTF-16BE", "EUC-JP")
+    ec = Encoding::Converter.new("UTF-16BE", "EUC-JP", 0)
     src = "\xFF\xFE\x00A\xDC\x00"
     ret = ec.primitive_convert(src, dst="", 10, 0)
     assert_equal("", src)
@@ -50,4 +50,18 @@
     assert_equal("", dst)
     assert_equal(:finished, ret)
   end
+
+  def test_universal_newline # with UNIVERSAL_NEWLINE, "\r\n", "\r" and "\n" must each come out as a single "\n"
+    ec = Encoding::Converter.new("UTF-8", "EUC-JP", Encoding::Converter::UNIVERSAL_NEWLINE)
+    ret = ec.primitive_convert(src="abc\r\ndef", dst="", 50, Encoding::Converter::PARTIAL_INPUT) # CRLF inside one chunk
+    assert_equal([:ibuf_empty, "", "abc\ndef"], [ret, src, dst])
+    ret = ec.primitive_convert(src="ghi\njkl", dst="", 50, Encoding::Converter::PARTIAL_INPUT) # plain LF is kept
+    assert_equal([:ibuf_empty, "", "ghi\njkl"], [ret, src, dst])
+    ret = ec.primitive_convert(src="mno\rpqr", dst="", 50, Encoding::Converter::PARTIAL_INPUT) # lone CR becomes LF
+    assert_equal([:ibuf_empty, "", "mno\npqr"], [ret, src, dst])
+    ret = ec.primitive_convert(src="stu\r", dst="", 50, Encoding::Converter::PARTIAL_INPUT) # chunk ends in CR: "\n" is expected right away
+    assert_equal([:ibuf_empty, "", "stu\n"], [ret, src, dst])
+    ret = ec.primitive_convert(src="\nvwx", dst="", 50, Encoding::Converter::PARTIAL_INPUT) # LF finishing the CRLF begun in the previous chunk yields no output
+    assert_equal([:ibuf_empty, "", "vwx"], [ret, src, dst])
+  end
 end
Index: transcode.c
===================================================================
--- transcode.c	(revision 18555)
+++ transcode.c	(revision 18556)
@@ -25,7 +25,10 @@
 #define INVALID_REPLACE 0x2
 #define UNDEF_IGNORE 0x10
 #define UNDEF_REPLACE 0x20
-#define PARTIAL_INPUT 0x100
+#define PARTIAL_INPUT           0x100
+#define UNIVERSAL_NEWLINE       0x200
+#define CRLF_NEWLINE            0x400
+#define CR_NEWLINE              0x800
 
 /*
  *  Dispatch data and logic
@@ -646,6 +649,7 @@
     ts->num_trans = n;
     ts->elems = ALLOC_N(rb_trans_elem_t, ts->num_trans);
     ts->num_finished = 0;
+    ts->last_tc = NULL;
     for (i = 0; i < ts->num_trans; i++) {
         const rb_transcoder *tr = load_transcoder_entry(entries[i]);
         ts->elems[i].from = tr->from_encoding;
@@ -657,6 +661,7 @@
         ts->elems[i].out_buf_end = NULL;
         ts->elems[i].last_result = transcode_ibuf_empty;
     }
+    ts->last_tc = ts->elems[ts->num_trans-1].tc;
 
     for (i = 0; i < ts->num_trans-1; i++) {
         int bufsize = 4096;
@@ -678,7 +683,7 @@
     transcoder_entry_t **entries;
 
     if (!*entries_ptr) {
-        entries = ALLOC_N(transcoder_entry_t *, depth+1);
+        entries = ALLOC_N(transcoder_entry_t *, depth+1+1);
         *entries_ptr = entries;
     }
     else {
@@ -699,7 +704,19 @@
     if (num_trans < 0 || !entries)
         return NULL;
 
+    if (flags & UNIVERSAL_NEWLINE) {
+        transcoder_entry_t *e = get_transcoder_entry("universal_newline", "");
+        if (!e)
+            return NULL;
+        entries[num_trans++] = e;
+    }
+
     ts = rb_trans_open_by_transcoder_entries(num_trans, entries);
+
+    if (flags & UNIVERSAL_NEWLINE) {
+        ts->last_tc = ts->elems[ts->num_trans-2].tc;
+    }
+
     return ts;
 }
 
@@ -840,13 +857,13 @@
 more_output_buffer(
         VALUE destination,
         unsigned char *(*resize_destination)(VALUE, int, int),
-        rb_trans_t *ts,
+        int max_output,
         unsigned char **out_start_ptr,
         unsigned char **out_pos,
         unsigned char **out_stop_ptr)
 {
     size_t len = (*out_pos - *out_start_ptr);
-    size_t new_len = (len + ts->elems[ts->num_trans-1].tc->transcoder->max_output) * 2;
+    size_t new_len = (len + max_output) * 2;
     *out_start_ptr = resize_destination(destination, len, new_len);
     *out_pos = *out_start_ptr + len;
     *out_stop_ptr = *out_start_ptr + new_len;
@@ -856,20 +873,18 @@
 output_replacement_character(
         VALUE destination,
         unsigned char *(*resize_destination)(VALUE, int, int),
-        rb_trans_t *ts,
+        rb_transcoding *tc,
         unsigned char **out_start_ptr,
         unsigned char **out_pos,
         unsigned char **out_stop_ptr)
 
 {
-    rb_transcoding *tc;
     const rb_transcoder *tr;
     int max_output;
     rb_encoding *enc;
     const char *replacement;
     int len;
 
-    tc = ts->elems[ts->num_trans-1].tc;
     tr = tc->transcoder;
     max_output = tr->max_output;
     enc = rb_enc_find(tr->to_encoding);
@@ -893,12 +908,12 @@
 
     if (tr->resetstate_func) {
         if (*out_stop_ptr - *out_pos < max_output)
-            more_output_buffer(destination, resize_destination, ts, out_start_ptr, out_pos, out_stop_ptr);
+            more_output_buffer(destination, resize_destination, max_output, out_start_ptr, out_pos, out_stop_ptr);
         *out_pos += tr->resetstate_func(tc, *out_pos);
     }
 
     if (*out_stop_ptr - *out_pos < max_output)
-        more_output_buffer(destination, resize_destination, ts, out_start_ptr, out_pos, out_stop_ptr);
+        more_output_buffer(destination, resize_destination, max_output, out_start_ptr, out_pos, out_stop_ptr);
 
     replacement = get_replacement_character(enc, &len);
 
@@ -919,6 +934,7 @@
 	       const int opt)
 {
     rb_trans_t *ts;
+    rb_transcoding *last_tc;
     rb_trans_result_t ret;
     unsigned char *out_start = *out_pos;
     int max_output;
@@ -927,7 +943,8 @@
     if (!ts)
         rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_encoding, to_encoding);
 
-    max_output = ts->elems[ts->num_trans-1].tc->transcoder->max_output;
+    last_tc = ts->last_tc;
+    max_output = last_tc->transcoder->max_output;
 
 resume:
     ret = rb_trans_conv(ts, in_pos, in_stop, out_pos, out_stop, opt);
@@ -938,7 +955,7 @@
             goto resume;
 	}
 	else if (opt&INVALID_REPLACE) {
-	    output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
+	    output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop);
             goto resume;
 	}
         rb_trans_close(ts);
@@ -952,14 +969,14 @@
 	    goto resume;
 	}
 	else if (opt&UNDEF_REPLACE) {
-	    output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
+	    output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop);
 	    goto resume;
 	}
         rb_trans_close(ts);
         rb_raise(rb_eConversionUndefined, "conversion undefined for byte sequence (maybe invalid byte sequence)");
     }
     if (ret == transcode_obuf_full) {
-        more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
+        more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
         goto resume;
     }
 
@@ -978,6 +995,7 @@
 	       const int opt)
 {
     rb_trans_t *ts;
+    rb_transcoding *last_tc;
     rb_trans_result_t ret;
     unsigned char *out_start = *out_pos;
     const unsigned char *ptr;
@@ -987,6 +1005,7 @@
     if (!ts)
         rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_encoding, to_encoding);
 
+    last_tc = ts->last_tc;
     max_output = ts->elems[ts->num_trans-1].tc->transcoder->max_output;
 
     ret = transcode_ibuf_empty;
@@ -1017,7 +1036,7 @@
                 break;
             }
             else if (opt&INVALID_REPLACE) {
-                output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
+                output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop);
                 break;
             }
             rb_trans_close(ts);
@@ -1032,7 +1051,7 @@
                 break;
             }
             else if (opt&UNDEF_REPLACE) {
-                output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
+                output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop);
                 break;
             }
             rb_trans_close(ts);
@@ -1040,7 +1059,7 @@
             break;
 
           case transcode_obuf_full:
-            more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
+            more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
             break;
 
           case transcode_ibuf_empty:
@@ -1261,19 +1280,24 @@
 }
 
 static VALUE
-econv_init(VALUE self, VALUE from_encoding, VALUE to_encoding)
+econv_init(VALUE self, VALUE from_encoding, VALUE to_encoding, VALUE flags_v)
 {
     const char *from_e, *to_e;
     rb_trans_t *ts;
+    int flags;
 
-    from_e = StringValueCStr(from_encoding);
-    to_e = StringValueCStr(to_encoding);
+    StringValue(from_encoding);
+    StringValue(to_encoding);
+    flags = NUM2INT(flags_v);
 
+    from_e = RSTRING_PTR(from_encoding);
+    to_e = RSTRING_PTR(to_encoding);
+
     if (DATA_PTR(self)) {
         rb_raise(rb_eTypeError, "already initialized");
     }
 
-    ts = rb_trans_open(from_e, to_e, 0);
+    ts = rb_trans_open(from_e, to_e, flags);
     if (!ts) {
         rb_raise(rb_eArgError, "encoding convewrter not supported (from %s to %s)", from_e, to_e);
     }
@@ -1363,8 +1387,9 @@
 
     rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cData);
     rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
-    rb_define_method(rb_cEncodingConverter, "initialize", econv_init, 2);
+    rb_define_method(rb_cEncodingConverter, "initialize", econv_init, 3);
     rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, 4);
     rb_define_method(rb_cEncodingConverter, "max_output", econv_max_output, 0);
     rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(PARTIAL_INPUT));
+    rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE", INT2FIX(UNIVERSAL_NEWLINE));
 }

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/