[前][次][番号順一覧][スレッド一覧]

ruby-changes:6985

From: akr <ko1@a...>
Date: Tue, 12 Aug 2008 07:44:48 +0900 (JST)
Subject: [ruby-changes:6985] Ruby:r18503 (trunk): * transcode_data.h (rb_transcoder): add resetstate_func field for

akr	2008-08-12 07:44:23 +0900 (Tue, 12 Aug 2008)

  New Revision: 18503

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=rev&revision=18503

  Log:
    * transcode_data.h (rb_transcoder): add resetstate_func field for
      resetting a state of stateful encoding.
    
    * enc/trans/iso2022.trans (rb_EUC_JP_to_ISO_2022_JP): specify
      finish_eucjp_to_iso2022jp for resetstate_func.
    
    * tool/transcode-tblgen.rb: specify NULL for resetstate_func.
    
    * transcode.c (output_replacement_character): call resetstate_func
      before appending the replacement character.

  Modified files:
    trunk/ChangeLog
    trunk/enc/trans/iso2022.trans
    trunk/test/ruby/test_transcode.rb
    trunk/tool/transcode-tblgen.rb
    trunk/transcode.c
    trunk/transcode_data.h

Index: ChangeLog
===================================================================
--- ChangeLog	(revision 18502)
+++ ChangeLog	(revision 18503)
@@ -1,3 +1,16 @@
+Tue Aug 12 07:41:13 2008  Tanaka Akira  <akr@f...>
+
+	* transcode_data.h (rb_transcoder): add resetstate_func field for
+	  resetting a state of stateful encoding.
+
+	* enc/trans/iso2022.trans (rb_EUC_JP_to_ISO_2022_JP): specify
+	  finish_eucjp_to_iso2022jp for resetstate_func.
+
+	* tool/transcode-tblgen.rb: specify NULL for resetstate_func.
+
+	* transcode.c (output_replacement_character): call resetstate_func
+	  before appending the replacement character.
+
 Tue Aug 12 07:19:24 2008  Tanaka Akira  <akr@f...>
 
 	* transcode.c (get_replacement_character): extracted from
Index: enc/trans/iso2022.trans
===================================================================
--- enc/trans/iso2022.trans	(revision 18502)
+++ enc/trans/iso2022.trans	(revision 18503)
@@ -136,7 +136,8 @@
     1, /* input_unit_length */
     3, /* max_input */
     5, /* max_output */
-    NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp, finish_eucjp_to_iso2022jp
+    NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp,
+    finish_eucjp_to_iso2022jp, finish_eucjp_to_iso2022jp
 };
 
 void
Index: transcode_data.h
===================================================================
--- transcode_data.h	(revision 18502)
+++ transcode_data.h	(revision 18503)
@@ -95,6 +95,7 @@
     VALUE (*func_si)(rb_transcoding*, const unsigned char*, size_t); /* start -> info   */
     int (*func_io)(rb_transcoding*, VALUE, const unsigned char*); /* info  -> output */
     int (*func_so)(rb_transcoding*, const unsigned char*, size_t, unsigned char*); /* start -> output */
+    int (*resetstate_func)(rb_transcoding*, unsigned char*); /* -> output */
     int (*finish_func)(rb_transcoding*, unsigned char*); /* -> output */
 };
 
Index: tool/transcode-tblgen.rb
===================================================================
--- tool/transcode-tblgen.rb	(revision 18502)
+++ tool/transcode-tblgen.rb	(revision 18503)
@@ -446,7 +446,7 @@
     #{input_unit_length}, /* input_unit_length */
     #{max_input}, /* max_input */
     #{max_output}, /* max_output */
-    NULL, NULL, NULL, NULL, NULL
+    NULL, NULL, NULL, NULL, NULL, NULL
 };
 End
   tree_code + "\n" + transcoder_code
Index: test/ruby/test_transcode.rb
===================================================================
--- test/ruby/test_transcode.rb	(revision 18502)
+++ test/ruby/test_transcode.rb	(revision 18503)
@@ -303,6 +303,9 @@
       "\xdc\x00".encode("EUC-JP", "UTF-16BE", :invalid=>:replace), "[ruby-dev:35776]")
     assert_equal("ab?cd?ef",
       "\0a\0b\xdc\x00\0c\0d\xdf\x00\0e\0f".encode("EUC-JP", "UTF-16BE", :invalid=>:replace))
+
+    assert_equal("\e$B!!\e(B?".force_encoding("ISO-2022-JP"),
+      "\xA1\xA1\xFF".encode("ISO-2022-JP", "EUC-JP", invalid: :replace))
   end
 
   def test_undef_replace
Index: transcode.c
===================================================================
--- transcode.c	(revision 18502)
+++ transcode.c	(revision 18503)
@@ -292,19 +292,6 @@
     }
 }
 
-static void
-output_replacement_character(unsigned char **out_pp, rb_encoding *enc)
-{
-    const char *replacement;
-    int len;
-    replacement = get_replacement_character(enc, &len);
-
-    memcpy(*out_pp, replacement, len);
-
-    *out_pp += len;
-    return;
-}
-
 /*
  *  Transcoding engine logic
  */
@@ -818,6 +805,62 @@
     *out_stop_ptr = *out_start_ptr + new_len;
 }
 
+static void
+output_replacement_character(
+        VALUE destination,
+        unsigned char *(*resize_destination)(VALUE, int, int),
+        rb_trans_t *ts,
+        unsigned char **out_start_ptr,
+        unsigned char **out_pos,
+        unsigned char **out_stop_ptr)
+
+{
+    rb_transcoding *tc;
+    const rb_transcoder *tr;
+    int max_output;
+    rb_encoding *enc;
+    const char *replacement;
+    int len;
+
+    tc = ts->elems[ts->num_trans-1].tc;
+    tr = tc->transcoder;
+    max_output = tr->max_output;
+    enc = rb_enc_find(tr->to_encoding);
+
+    /*
+     * Assumption for stateful encoding:
+     *
+     * - The replacement character can be output in the reset state and doesn't
+     *   change the state.
+     * - An extra state-changing sequence is acceptable if the replacement
+     *   character contains a state-changing sequence.
+     *
+     * Currently the replacement character for stateful encodings such as
+     * ISO-2022-JP is "?", which contains no state-changing sequence,
+     * so the extra state-changing sequence doesn't occur.
+     *
+     * These assumptions may be removed in the future.
+     * Doing so requires scanning the replacement character for
+     * state-changing sequences.
+     */
+
+    if (tr->resetstate_func) {
+        if (*out_stop_ptr - *out_pos < max_output)
+            more_output_buffer(destination, resize_destination, ts, out_start_ptr, out_pos, out_stop_ptr);
+        *out_pos += tr->resetstate_func(tc, *out_pos);
+    }
+
+    if (*out_stop_ptr - *out_pos < max_output)
+        more_output_buffer(destination, resize_destination, ts, out_start_ptr, out_pos, out_stop_ptr);
+
+    replacement = get_replacement_character(enc, &len);
+
+    memcpy(*out_pos, replacement, len);
+
+    *out_pos += len;
+    return;
+}
+
 #if 1
 static void
 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
@@ -848,9 +891,7 @@
             goto resume;
 	}
 	else if (opt&INVALID_REPLACE) {
-            if (out_stop - *out_pos < max_output)
-                more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
-	    output_replacement_character(out_pos, rb_enc_find(to_encoding));
+	    output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
             goto resume;
 	}
         rb_trans_close(ts);
@@ -864,9 +905,7 @@
 	    goto resume;
 	}
 	else if (opt&UNDEF_REPLACE) {
-            if (out_stop - *out_pos < max_output)
-                more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
-	    output_replacement_character(out_pos, rb_enc_find(to_encoding));
+	    output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
 	    goto resume;
 	}
         rb_trans_close(ts);
@@ -931,9 +970,7 @@
                 break;
             }
             else if (opt&INVALID_REPLACE) {
-                if (out_stop - *out_pos < max_output)
-                    more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
-                output_replacement_character(out_pos, rb_enc_find(to_encoding));
+                output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
                 break;
             }
             rb_trans_close(ts);
@@ -948,9 +985,7 @@
                 break;
             }
             else if (opt&UNDEF_REPLACE) {
-                if (out_stop - *out_pos < max_output)
-                    more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
-                output_replacement_character(out_pos, rb_enc_find(to_encoding));
+                output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
                 break;
             }
             rb_trans_close(ts);

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/

[前][次][番号順一覧][スレッド一覧]