[前][次][番号順一覧][スレッド一覧]

ruby-changes:6928

From: akr <ko1@a...>
Date: Sat, 9 Aug 2008 00:48:48 +0900 (JST)
Subject: [ruby-changes:6928] Ruby:r18445 (trunk): * transcode_data.h (rb_transcoder): from_unit_length field added.

akr	2008-08-09 00:48:17 +0900 (Sat, 09 Aug 2008)

  New Revision: 18445

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=rev&revision=18445

  Log:
    * transcode_data.h (rb_transcoder): from_unit_length field added.
      from_utf8 field removed.
    
    * tool/transcode-tblgen.rb: generate each offsets table with its
      min/max valid byte range.  follow rb_transcoder change.
    
    * transcode.c (transcode_loop): don't use from_utf8.
      align the invalid region to from_unit_length boundaries.
    
    * enc/trans/iso2022.erb.c: follow rb_transcoder and 
      transcode_generate_node change.
    
    * enc/trans/utf_16_32.erb.c: follow rb_transcoder and
      transcode_generate_node change.
      explicit :invalid map removed.

  Modified files:
    trunk/ChangeLog
    trunk/enc/trans/iso2022.erb.c
    trunk/enc/trans/utf_16_32.erb.c
    trunk/test/ruby/test_transcode.rb
    trunk/tool/transcode-tblgen.rb
    trunk/transcode.c
    trunk/transcode_data.h

Index: ChangeLog
===================================================================
--- ChangeLog	(revision 18444)
+++ ChangeLog	(revision 18445)
@@ -1,3 +1,21 @@
+Sat Aug  9 00:42:33 2008  Tanaka Akira  <akr@f...>
+
+	* transcode_data.h (rb_transcoder): from_unit_length field added.
+	  from_utf8 field removed.
+
+	* tool/transcode-tblgen.rb: generate offsets range.
+	  follow rb_transcoder change.
+
+	* transcode.c (transcode_loop): don't use from_utf8.
+	  make invalid region from_unit_length wise.
+
+	* enc/trans/iso2022.erb.c: follow rb_transcoder and
+	  transcode_generate_node change.
+
+	* enc/trans/utf_16_32.erb.c: follow rb_transcoder and
+	  transcode_generate_node change.
+	  explicit :invalid map removed.
+
 Fri Aug  8 23:29:44 2008  Nobuyoshi Nakada  <nobu@r...>
 
 	* enc/depend (TRANSCSRCS): needs rule_subst to apply.
Index: enc/trans/iso2022.erb.c
===================================================================
--- enc/trans/iso2022.erb.c	(revision 18444)
+++ enc/trans/iso2022.erb.c	(revision 18445)
@@ -12,8 +12,8 @@
   map_jisx0208_rest["{21-7e}"] = :func_so
 %>
 
-<%= transcode_generate_node(ActionMap.parse(map), "iso2022jp_to_eucjp", []) %>
-<%= transcode_generate_node(ActionMap.parse(map_jisx0208_rest), "iso2022jp_to_eucjp_jisx0208_rest", []) %>
+<%= transcode_generate_node(ActionMap.parse(map), "iso2022jp_to_eucjp") %>
+<%= transcode_generate_node(ActionMap.parse(map_jisx0208_rest), "iso2022jp_to_eucjp_jisx0208_rest") %>
 
 static VALUE
 fun_si_iso2022jp_to_eucjp(rb_transcoding* t, const unsigned char* s, size_t l)
@@ -57,7 +57,7 @@
 
 static const rb_transcoder
 rb_ISO_2022_JP_to_EUC_JP = {
-    "ISO-2022-JP", "EUC-JP", &iso2022jp_to_eucjp, 3, 0,
+    "ISO-2022-JP", "EUC-JP", &iso2022jp_to_eucjp, 1, 3,
     NULL, fun_si_iso2022jp_to_eucjp, NULL, fun_so_iso2022jp_to_eucjp
 };
 
@@ -71,7 +71,7 @@
   }
 %>
 
-<%= transcode_generate_node(ActionMap.parse(map_eucjp), "eucjp_to_iso2022jp", []) %>
+<%= transcode_generate_node(ActionMap.parse(map_eucjp), "eucjp_to_iso2022jp") %>
 
 static int
 fun_so_eucjp_to_iso2022jp(rb_transcoding *t, const unsigned char *s, size_t l, unsigned char *o)
@@ -129,7 +129,7 @@
 
 static const rb_transcoder
 rb_EUC_JP_to_ISO_2022_JP = {
-    "EUC-JP", "ISO-2022-JP", &eucjp_to_iso2022jp, 5, 0,
+    "EUC-JP", "ISO-2022-JP", &eucjp_to_iso2022jp, 1, 5,
     NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp, finish_eucjp_to_iso2022jp
 };
 
Index: enc/trans/utf_16_32.erb.c
===================================================================
--- enc/trans/utf_16_32.erb.c	(revision 18444)
+++ enc/trans/utf_16_32.erb.c	(revision 18445)
@@ -183,14 +183,12 @@
   map = {}
   map["{00-d7,e0-ff}{00-ff}"] = :func_so
   map["{d8-db}{00-ff}{dc-df}{00-ff}"] = :func_so
-  map["{dc-df}{00-ff}"] = :invalid
-  map["{d8-db}{00-ff}{00-db,e0-ff}{00-ff}"] = :invalid
-  transcode_generate_node(ActionMap.parse(map), "from_UTF_16BE", [])
+  transcode_generate_node(ActionMap.parse(map), "from_UTF_16BE")
 %>
 
 static const rb_transcoder
 rb_from_UTF_16BE = {
-    "UTF-16BE", "UTF-8", &from_UTF_16BE, 4, 0,
+    "UTF-16BE", "UTF-8", &from_UTF_16BE, 2, 4,
     NULL, NULL, NULL, &fun_so_from_utf_16be
 };
 
@@ -205,18 +203,13 @@
   map["f0{90-bf}{80-bf}{80-bf}"] = :func_so
   map["{f1-f3}{80-bf}{80-bf}{80-bf}"] = :func_so
   map["f4{80-8f}{80-bf}{80-bf}"] = :func_so
-  map["{80-c1,f5-ff}"] = :invalid
-  map["e0{80-9f}"] = :invalid
-  map["ed{a0-bf}"] = :invalid
-  map["f0{80-8f}"] = :invalid
-  map["f4{90-bf}"] = :invalid
   am = ActionMap.parse(map)
-  transcode_generate_node(am, "to_UTF_16BE", [0x00..0xff, 0x80..0xbf, 0x80..0xbf, 0x80..0xbf])
+  transcode_generate_node(am, "to_UTF_16BE")
 %>
 
 static const rb_transcoder
 rb_to_UTF_16BE = {
-    "UTF-8", "UTF-16BE", &to_UTF_16BE, 4, 1,
+    "UTF-8", "UTF-16BE", &to_UTF_16BE, 1, 4,
     NULL, NULL, NULL, &fun_so_to_utf_16be
 };
 
@@ -224,20 +217,18 @@
   map = {}
   map["{00-ff}{00-d7,e0-ff}"] = :func_so
   map["{00-ff}{d8-db}{00-ff}{dc-df}"] = :func_so
-  map["{00-ff}{dc-df}"] = :invalid
-  map["{00-ff}{d8-db}{00-ff}{00-db,e0-ff}"] = :invalid
-  transcode_generate_node(ActionMap.parse(map), "from_UTF_16LE", [])
+  transcode_generate_node(ActionMap.parse(map), "from_UTF_16LE")
 %>
 
 static const rb_transcoder
 rb_from_UTF_16LE = {
-    "UTF-16LE", "UTF-8", &from_UTF_16LE, 4, 0,
+    "UTF-16LE", "UTF-8", &from_UTF_16LE, 2, 4,
     NULL, NULL, NULL, &fun_so_from_utf_16le
 };
 
 static const rb_transcoder
 rb_to_UTF_16LE = {
-    "UTF-8", "UTF-16LE", &to_UTF_16BE, 4, 1,
+    "UTF-8", "UTF-16LE", &to_UTF_16BE, 1, 4,
     NULL, NULL, NULL, &fun_so_to_utf_16le
 };
 
@@ -245,21 +236,18 @@
   map = {}
   map["0000{00-d7,e0-ff}{00-ff}"] = :func_so
   map["00{01-10}{00-ff}{00-ff}"] = :func_so
-  map["00{11-ff}{00-ff}{00-ff}"] = :invalid
-  map["0000{d8-df}{00-ff}"] = :invalid
-  map["{01-ff}{00-ff}{00-ff}{00-ff}"] = :invalid
-  transcode_generate_node(ActionMap.parse(map), "from_UTF_32BE", [])
+  transcode_generate_node(ActionMap.parse(map), "from_UTF_32BE")
 %>
 
 static const rb_transcoder
 rb_from_UTF_32BE = {
-    "UTF-32BE", "UTF-8", &from_UTF_32BE, 4, 0,
+    "UTF-32BE", "UTF-8", &from_UTF_32BE, 4, 4,
     NULL, NULL, NULL, &fun_so_from_utf_32be
 };
 
 static const rb_transcoder
 rb_to_UTF_32BE = {
-    "UTF-8", "UTF-32BE", &to_UTF_16BE, 4, 1,
+    "UTF-8", "UTF-32BE", &to_UTF_16BE, 1, 4,
     NULL, NULL, NULL, &fun_so_to_utf_32be
 };
 
@@ -267,21 +255,18 @@
   map = {}
   map["{00-ff}{00-d7,e0-ff}0000"] = :func_so
   map["{00-ff}{00-ff}{01-10}00"] = :func_so
-  map["{00-ff}{00-ff}{00-ff}{01-ff}"] = :invalid
-  map["{00-ff}{00-ff}{11-ff}00"] = :invalid
-  map["{00-ff}{d8-df}0000"] = :invalid
-  transcode_generate_node(ActionMap.parse(map), "from_UTF_32LE", [])
+  transcode_generate_node(ActionMap.parse(map), "from_UTF_32LE")
 %>
 
 static const rb_transcoder
 rb_from_UTF_32LE = {
-    "UTF-32LE", "UTF-8", &from_UTF_32LE, 4, 0,
+    "UTF-32LE", "UTF-8", &from_UTF_32LE, 4, 4,
     NULL, NULL, NULL, &fun_so_from_utf_32le
 };
 
 static const rb_transcoder
 rb_to_UTF_32LE = {
-    "UTF-8", "UTF-32LE", &to_UTF_16BE, 4, 1,
+    "UTF-8", "UTF-32LE", &to_UTF_16BE, 1, 4,
     NULL, NULL, NULL, &fun_so_to_utf_32le
 };
 
Index: transcode_data.h
===================================================================
--- transcode_data.h	(revision 18444)
+++ transcode_data.h	(revision 18445)
@@ -72,8 +72,8 @@
     const char *from_encoding;
     const char *to_encoding;
     const BYTE_LOOKUP *conv_tree_start;
+    int from_unit_length;
     int max_output;
-    int from_utf8;
     VALUE (*func_ii)(rb_transcoding*, VALUE); /* info  -> info   */
     VALUE (*func_si)(rb_transcoding*, const unsigned char*, size_t); /* start -> info   */
     int (*func_io)(rb_transcoding*, VALUE, const unsigned char*); /* info  -> output */
Index: tool/transcode-tblgen.rb
===================================================================
--- tool/transcode-tblgen.rb	(revision 18444)
+++ tool/transcode-tblgen.rb	(revision 18445)
@@ -213,13 +213,16 @@
   OffsetsMemo = {}
   InfosMemo = {}
 
-  def format_offsets(offsets)
-    code = "{\n"
+  def format_offsets(min, max, offsets)
+    offsets = offsets[min..max]
+    code = "{ %d, %d,\n" % [min, max]
     0.step(offsets.length-1,16) {|i|
       code << "    "
       code << offsets[i,8].map {|off| "%3d," % off.to_s }.join('')
-      code << "  "
-      code << offsets[i+8,8].map {|off| "%3d," % off.to_s }.join('')
+      if i+8 < offsets.length
+        code << "  "
+        code << offsets[i+8,8].map {|off| "%3d," % off.to_s }.join('')
+      end
       code << "\n"
     }
     code << '}'
@@ -276,14 +279,22 @@
     offsets = []
     infos = []
     infomap = {}
+    min = max = nil
     table.each_with_index {|action, byte|
       action ||= :invalid
+      if action != :invalid
+        min = byte if !min
+        max = byte
+      end
       unless o = infomap[action]
         infomap[action] = o = infos.length
         infos[o] = action
       end
       offsets[byte] = o
     }
+    if !min
+      min = max = 0
+    end
 
     if n = OffsetsMemo[offsets]
       offsets_name = n
@@ -292,7 +303,7 @@
       offsets_name = "#{name}_offsets"
       offsets_code = <<"End"
 static const unsigned char
-#{offsets_name}[#{offsets.length}] = #{format_offsets(offsets)};
+#{offsets_name}[#{2+max-min+1}] = #{format_offsets(min,max,offsets)};
 End
       OffsetsMemo[offsets] = offsets_name
     end
@@ -324,24 +335,19 @@
   PostMemo = {}
   NextName = "a"
 
-  def generate_node(code, name_hint=nil, ranges=[], valid_encoding=nil)
-    ranges = [0x00..0xff] if ranges.empty?
-    range = ranges.first
+  def generate_node(code, name_hint=nil, valid_encoding=nil)
     if n = PreMemo[[self,valid_encoding]]
       return n
     end
 
-    table = Array.new(range.end - range.begin + 1)
+    table = Array.new(0x100, :invalid)
     each_firstbyte(valid_encoding) {|byte, rest, rest_valid_encoding|
-      unless range === byte
-        raise "byte not in range"
-      end
       if a = rest.empty_action
-        table[byte-range.begin] = a
+        table[byte] = a
       else
         name_hint2 = nil
         name_hint2 = "#{name_hint}_#{'%02X' % byte}" if name_hint
-        table[byte-range.begin] = "&" + rest.generate_node(code, name_hint2, ranges[1..-1], rest_valid_encoding)
+        table[byte] = "&" + rest.generate_node(code, name_hint2, rest_valid_encoding)
       end
     }
 
@@ -386,9 +392,8 @@
     valid_encoding = nil
   end
 
-  ranges = from == "UTF-8" ? [0x00..0xff, 0x80..0xbf, 0x80..0xbf, 0x80..0xbf] : []
   code = ''
-  defined_name = am.generate_node(code, name, ranges, valid_encoding)
+  defined_name = am.generate_node(code, name, valid_encoding)
   return defined_name, code
 end
 
@@ -409,22 +414,22 @@
   real_tree_name, tree_code = transcode_compile_tree(tree_name, from, map)
   transcoder_name = "rb_#{tree_name}"
   TRANSCODERS << transcoder_name
-  from_utf8 = from == 'UTF-8' ? 1 : 0
+  from_unit_length = UnitLength[from]
   max_output = map.map {|k,v| String === v ? v.length/2 : 1 }.max
   transcoder_code = <<"End"
 static const rb_transcoder
 #{transcoder_name} = {
-    #{c_esc from}, #{c_esc to}, &#{real_tree_name}, #{max_output}, #{from_utf8},
+    #{c_esc from}, #{c_esc to}, &#{real_tree_name}, #{from_unit_length}, #{max_output},
     NULL, NULL,
 };
 End
   tree_code + "\n" + transcoder_code
 end
 
-def transcode_generate_node(am, name_hint=nil, ranges=[])
+def transcode_generate_node(am, name_hint=nil)
   STDERR.puts "converter for #{name_hint}" if VERBOSE_MODE
   code = ''
-  am.generate_node(code, name_hint, ranges)
+  am.generate_node(code, name_hint)
   code
 end
 
@@ -436,6 +441,14 @@
   code
 end
 
+UnitLength = {
+  'UTF-16BE'    => 2,
+  'UTF-16LE'    => 2,
+  'UTF-32BE'    => 4,
+  'UTF-32LE'    => 4,
+}
+UnitLength.default = 1
+
 ValidEncoding = {
   '1byte'       => '{00-ff}',
   '2byte'       => '{00-ff}{00-ff}',
Index: test/ruby/test_transcode.rb
===================================================================
--- test/ruby/test_transcode.rb	(revision 18444)
+++ test/ruby/test_transcode.rb	(revision 18445)
@@ -267,8 +267,30 @@
       "\x80".encode("UTF-32BE", "UTF-8", invalid: :replace))
     assert_equal("\xFD\xFF\x00\x00".force_encoding("UTF-32LE"),
       "\x80".encode("UTF-32LE", "UTF-8", invalid: :replace))
+
     assert_equal("\uFFFD!",
-      "\x01\x00\x00\x00\x00\x00\x00\x21".encode("utf-8", "utf-32be", :invalid=>:replace), "[ruby-dev:35726]")
+      "\xdc\x00\x00!".encode("utf-8", "utf-16be", :invalid=>:replace))
+    assert_equal("\uFFFD!",
+      "\xd8\x00\x00!".encode("utf-8", "utf-16be", :invalid=>:replace))
+
+    assert_equal("\uFFFD!",
+      "\x00\xdc!\x00".encode("utf-8", "utf-16le", :invalid=>:replace))
+    assert_equal("\uFFFD!",
+      "\x00\xd8!\x00".encode("utf-8", "utf-16le", :invalid=>:replace))
+
+    assert_equal("\uFFFD!",
+      "\x01\x00\x00\x00\x00\x00\x00!".encode("utf-8", "utf-32be", :invalid=>:replace), "[ruby-dev:35726]")
+    assert_equal("\uFFFD!",
+      "\x00\xff\x00\x00\x00\x00\x00!".encode("utf-8", "utf-32be", :invalid=>:replace))
+    assert_equal("\uFFFD!",
+      "\x00\x00\xd8\x00\x00\x00\x00!".encode("utf-8", "utf-32be", :invalid=>:replace))
+
+    assert_equal("\uFFFD!",
+      "\xff!".encode("utf-8", "euc-jp", :invalid=>:replace))
+    assert_equal("\uFFFD!",
+      "\xa1!".encode("utf-8", "euc-jp", :invalid=>:replace))
+    assert_equal("\uFFFD!",
+      "\x8f\xa1!".encode("utf-8", "euc-jp", :invalid=>:replace))
   end
 
   def test_undef_replace
Index: transcode.c
===================================================================
--- transcode.c	(revision 18444)
+++ transcode.c	(revision 18445)
@@ -336,10 +336,8 @@
     const BYTE_LOOKUP *conv_tree_start = my_transcoder->conv_tree_start;
     const BYTE_LOOKUP *next_table;
     const unsigned char *char_start;
-    unsigned int next_offset;
     VALUE next_info;
     unsigned char next_byte;
-    int from_utf8 = my_transcoder->from_utf8;
     unsigned char *out_s = out_stop - my_transcoder->max_output + 1;
     rb_encoding *to_encoding = rb_enc_find(my_transcoder->to_encoding);
 
@@ -355,8 +353,12 @@
 	}
 	next_byte = (unsigned char)*in_p++;
       follow_byte:
-	next_offset = next_table->base[next_byte];
-	next_info = (VALUE)next_table->info[next_offset];
+        if (next_byte < next_table->base[0] || next_table->base[1] < next_byte)
+            next_info = INVALID;
+        else {
+            unsigned int next_offset = next_table->base[2+next_byte-next_table->base[0]];
+            next_info = (VALUE)next_table->info[next_offset];
+        }
       follow_info:
 	switch (next_info & 0x1F) {
 	  case NOMAP:
@@ -370,14 +372,6 @@
 		goto invalid;
 	    }
 	    next_byte = (unsigned char)*in_p++;
-	    if (from_utf8) {
-		if ((next_byte&0xC0) == 0x80)
-		    next_byte -= 0x80;
-		else {
-		    in_p--; /* may need to add more code later to revert other things */
-		    goto invalid;
-		}
-	    }
 	    next_table = (const BYTE_LOOKUP *)next_info;
 	    goto follow_byte;
 	    /* maybe rewrite the following cases to use fallthrough???? */
@@ -411,7 +405,16 @@
 	    out_p += (VALUE)(*my_transcoder->func_so)(my_transcoding, char_start, (size_t)(in_p-char_start), out_p);
 	    break;
 	  case INVALID:
-	    goto invalid;
+            {
+                int unitlen = my_transcoder->from_unit_length;
+                if (in_stop - char_start <= unitlen)
+                    in_p = in_stop;
+                else if (in_p - char_start <= unitlen)
+                    in_p = char_start + unitlen;
+                else
+                    in_p = char_start + ((in_p - char_start - 1) / unitlen) * unitlen;
+                goto invalid;
+            }
 	  case UNDEF:
 	    goto undef;
 	}

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/

[前][次][番号順一覧][スレッド一覧]