ruby-changes:17967

naruse	2010-12-01 01:47:24 +0900 (Wed, 01 Dec 2010)

  New Revision: 29984

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=rev&revision=29984

  Log:
    * string.c (rb_str_inspect): inspect as a dummy encoding string
      when a UTF-16/32 (not BE/LE) string does not have a BOM.
      Unicode and some RFCs say that if a string labeled as UTF-16/32
      doesn't have a BOM, it should be considered big endian.
      But many Windows programs generate little endian UTF-16
      strings without a BOM. So String#inspect treats a string
      labeled UTF-16/32 without a BOM as a dummy encoding string.
      patched by Martin Duerst. [ruby-core:33461]

  Modified files:
    trunk/ChangeLog
    trunk/string.c
    trunk/test/ruby/test_m17n.rb

Index: ChangeLog
===================================================================
--- ChangeLog	(revision 29983)
+++ ChangeLog	(revision 29984)
@@ -1,3 +1,14 @@
+Wed Dec  1 01:29:15 2010  NARUSE, Yui  <naruse@r...>
+
+	* string.c (rb_str_inspect): inspect as a dummy encoding string
+	  when a UTF-16/32 (not BE/LE) string does not have a BOM.
+	  Unicode and some RFCs say that if a string labeled as UTF-16/32
+	  doesn't have a BOM, it should be considered big endian.
+	  But many Windows programs generate little endian UTF-16
+	  strings without a BOM. So String#inspect treats a string
+	  labeled UTF-16/32 without a BOM as a dummy encoding string.
+	  patched by Martin Duerst. [ruby-core:33461]
+
 Tue Nov 30 17:04:10 2010  NARUSE, Yui  <naruse@r...>
 
 	* addr2line.c (parse_debug_line_cu): ignore DW_LNE_set_discriminator.
Index: string.c
===================================================================
--- string.c	(revision 29983)
+++ string.c	(revision 29984)
@@ -4214,10 +4214,22 @@
     p = RSTRING_PTR(str); pend = RSTRING_END(str);
     prev = p;
     if (enc == utf16) {
-	enc = *p == (char)0xFF ? rb_enc_find("UTF-16LE") : rb_enc_find("UTF-16BE");
+	const unsigned char *q = (const unsigned char *)p;
+	if (q[0] == 0xFE && q[1] == 0xFF)
+	    enc = rb_enc_find("UTF-16BE");
+	else if (q[0] == 0xFF && q[1] == 0xFE)
+	    enc = rb_enc_find("UTF-16LE");
+	else
+	    unicode_p = 0;
     }
     else if (enc == utf32) {
-	enc = *p == (char)0xFF ? rb_enc_find("UTF-32LE") : rb_enc_find("UTF-32BE");
+	const unsigned char *q = (const unsigned char *)p;
+	if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
+	    enc = rb_enc_find("UTF-32BE");
+	else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
+	    enc = rb_enc_find("UTF-32LE");
+	else
+	    unicode_p = 0;
     }
     while (p < pend) {
 	unsigned int c, cc;
@@ -6004,7 +6016,6 @@
 rb_str_each_codepoint(VALUE str)
 {
     VALUE orig = str;
-    long len;
     int n;
     unsigned int c;
     const char *ptr, *end;
@@ -6014,7 +6025,6 @@
     RETURN_ENUMERATOR(str, 0, 0);
     str = rb_str_new4(str);
     ptr = RSTRING_PTR(str);
-    len = RSTRING_LEN(str);
     end = RSTRING_END(str);
     enc = STR_ENC_GET(str);
     while (ptr < end) {
Index: test/ruby/test_m17n.rb
===================================================================
--- test/ruby/test_m17n.rb	(revision 29983)
+++ test/ruby/test_m17n.rb	(revision 29984)
@@ -232,6 +232,19 @@
     Encoding.default_external = orig_ext
   end
 
+  def test_utf_16_32_inspect
+    str = "\u3042"
+    %w/UTF-16 UTF-32/.each do |enc|
+      %w/BE LE/.each do |endian|
+        s = str.encode(enc + endian)
+        # When a UTF-16/32 string doesn't have a BOM,
+        # inspect as a dummy encoding string.
+        assert_equal(s.dup.force_encoding("ISO-2022-JP").inspect,
+                     s.dup.force_encoding(enc).inspect)
+      end
+    end
+  end
+
   def test_str_dump
     [
       e("\xfe"),

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/