ruby-changes:17967
From: naruse <ko1@a...>
Date: Wed, 1 Dec 2010 01:53:23 +0900 (JST)
Subject: [ruby-changes:17967] Ruby:r29984 (trunk): * string.c (rb_str_inspect): inspect as a dummy encoding string
naruse 2010-12-01 01:47:24 +0900 (Wed, 01 Dec 2010) New Revision: 29984 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=rev&revision=29984 Log: * string.c (rb_str_inspect): inspect as a dummy encoding string when a UTF-16/32 (not BE/LE) string does not have a BOM. Unicode and some RFCs say that if a string labeled as UTF-16/32 doesn't have a BOM, it should be considered big endian. But many Windows programs generate little endian UTF-16 strings without a BOM. So String#inspect treats a string labeled UTF-16/32 without a BOM as a dummy encoding string. patched by Martin Duerst. [ruby-core:33461] Modified files: trunk/ChangeLog trunk/string.c trunk/test/ruby/test_m17n.rb Index: ChangeLog =================================================================== --- ChangeLog (revision 29983) +++ ChangeLog (revision 29984) @@ -1,3 +1,14 @@ +Wed Dec 1 01:29:15 2010 NARUSE, Yui <naruse@r...> + + * string.c (rb_str_inspect): inspect as a dummy encoding string + when a UTF-16/32 (not BE/LE) string does not have a BOM. + Unicode and some RFCs say that a string labeld as UTF-16/32 + doesn't have a BOM, it should be considered big endian. + But many Windows programs generates little endian UTF-16 + strings without a BOM. So String#inspect treats a string + labeled UTF-16/32 withaout a BOM as a dummy encoding string. + patched by Martin Duerst. [ruby-core:33461] + Tue Nov 30 17:04:10 2010 NARUSE, Yui <naruse@r...> * addr2line.c (parse_debug_line_cu): ignore DW_LNE_set_discriminator. Index: string.c =================================================================== --- string.c (revision 29983) +++ string.c (revision 29984) @@ -4214,10 +4214,22 @@ p = RSTRING_PTR(str); pend = RSTRING_END(str); prev = p; if (enc == utf16) { - enc = *p == (char)0xFF ? 
rb_enc_find("UTF-16LE") : rb_enc_find("UTF-16BE"); + const unsigned char *q = (const unsigned char *)p; + if (q[0] == 0xFE && q[1] == 0xFF) + enc = rb_enc_find("UTF-16BE"); + else if (q[0] == 0xFF && q[1] == 0xFD) + enc = rb_enc_find("UTF-16LE"); + else + unicode_p = 0; } else if (enc == utf32) { - enc = *p == (char)0xFF ? rb_enc_find("UTF-32LE") : rb_enc_find("UTF-32BE"); + const unsigned char *q = (const unsigned char *)p; + if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) + enc = rb_enc_find("UTF-32BE"); + else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) + enc = rb_enc_find("UTF-32LE"); + else + unicode_p = 0; } while (p < pend) { unsigned int c, cc; @@ -6004,7 +6016,6 @@ rb_str_each_codepoint(VALUE str) { VALUE orig = str; - long len; int n; unsigned int c; const char *ptr, *end; @@ -6014,7 +6025,6 @@ RETURN_ENUMERATOR(str, 0, 0); str = rb_str_new4(str); ptr = RSTRING_PTR(str); - len = RSTRING_LEN(str); end = RSTRING_END(str); enc = STR_ENC_GET(str); while (ptr < end) { Index: test/ruby/test_m17n.rb =================================================================== --- test/ruby/test_m17n.rb (revision 29983) +++ test/ruby/test_m17n.rb (revision 29984) @@ -232,6 +232,19 @@ Encoding.default_external = orig_ext end + def test_utf_16_32_inspect + str = "\u3042" + %w/UTF-16 UTF-32/.each do |enc| + %w/BE LE/.each do |endian| + s = str.encode(enc + endian) + # When a UTF-16/32 string doesn't have a BOM, + # inspect as a dummy encoding string. + assert_equal(s.dup.force_encoding("ISO-2022-JP").inspect, + s.dup.force_encoding(enc).inspect) + end + end + end + def test_str_dump [ e("\xfe"), -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml/