ruby-changes:32995
From: naruse <ko1@a...>
Date: Fri, 21 Feb 2014 17:34:41 +0900 (JST)
Subject: [ruby-changes:32995] naruse:r45074 (ruby_2_1): merge revision(s) 44604, 44605, 44606: [Backport #9415]
naruse 2014-02-21 17:34:35 +0900 (Fri, 21 Feb 2014) New Revision: 45074 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=rev&revision=45074 Log: merge revision(s) 44604,44605,44606: [Backport #9415] test_m17n.rb: split tests for inspect * test/ruby/test_m17n.rb (test_utf_16_32_inspect): split tests for each encodings. * string.c (get_actual_encoding): get actual encoding according to the BOM if exists. * string.c (rb_str_inspect): use according encoding, instead of pseudo encodings, UTF-{16,32}. [ruby-core:59757] [Bug #8940] * string.c (get_encoding): respect BOM on pseudo encodings. [ruby-dev:47895] [Bug #9415] Modified directories: branches/ruby_2_1/ Modified files: branches/ruby_2_1/ChangeLog branches/ruby_2_1/encoding.c branches/ruby_2_1/string.c branches/ruby_2_1/test/ruby/test_m17n.rb branches/ruby_2_1/version.h Index: ruby_2_1/encoding.c =================================================================== --- ruby_2_1/encoding.c (revision 45073) +++ ruby_2_1/encoding.c (revision 45074) @@ -598,6 +598,12 @@ rb_enc_from_index(int index) https://github.com/ruby/ruby/blob/trunk/ruby_2_1/encoding.c#L598 return enc_table.list[index].enc; } +rb_encoding * +rb_enc_get_from_index(int index) +{ + return must_encindex(index); +} + int rb_enc_registered(const char *name) { Index: ruby_2_1/ChangeLog =================================================================== --- ruby_2_1/ChangeLog (revision 45073) +++ ruby_2_1/ChangeLog (revision 45074) @@ -1,3 +1,16 @@ https://github.com/ruby/ruby/blob/trunk/ruby_2_1/ChangeLog#L1 +Fri Feb 21 16:47:20 2014 Nobuyoshi Nakada <nobu@r...> + + * string.c (get_encoding): respect BOM on pseudo encodings. + [ruby-dev:47895] [Bug #9415] + +Fri Feb 21 16:47:20 2014 Nobuyoshi Nakada <nobu@r...> + + * string.c (get_actual_encoding): get actual encoding according to + the BOM if exists. + + * string.c (rb_str_inspect): use according encoding, instead of + pseudo encodings, UTF-{16,32}. [ruby-core:59757] [Bug #8940] + Fri Feb 21 13:39:21 2014 Charlie Somerville <charliesome@r...> * compile.c (iseq_build_from_ary_body): Use :blockptr instead of :block Index: ruby_2_1/string.c =================================================================== --- ruby_2_1/string.c (revision 45073) +++ ruby_2_1/string.c (revision 45074) @@ -121,7 +121,45 @@ VALUE rb_cSymbol; https://github.com/ruby/ruby/blob/trunk/ruby_2_1/string.c#L121 #define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr) #define STR_HEAP_SIZE(str) (RSTRING(str)->as.heap.aux.capa + TERM_LEN(str)) -#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str)) +#define STR_ENC_GET(str) get_encoding(str) + +rb_encoding *rb_enc_get_from_index(int index); + +static rb_encoding * +get_actual_encoding(const int encidx, VALUE str) +{ + const unsigned char *q; + + switch (encidx) { + case ENCINDEX_UTF_16: + if (RSTRING_LEN(str) < 2) break; + q = (const unsigned char *)RSTRING_PTR(str); + if (q[0] == 0xFE && q[1] == 0xFF) { + return rb_enc_get_from_index(ENCINDEX_UTF_16BE); + } + if (q[0] == 0xFF && q[1] == 0xFE) { + return rb_enc_get_from_index(ENCINDEX_UTF_16LE); + } + return rb_ascii8bit_encoding(); + case ENCINDEX_UTF_32: + if (RSTRING_LEN(str) < 4) break; + q = (const unsigned char *)RSTRING_PTR(str); + if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) { + return rb_enc_get_from_index(ENCINDEX_UTF_32BE); + } + if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) { + return rb_enc_get_from_index(ENCINDEX_UTF_32LE); + } + return rb_ascii8bit_encoding(); + } + return rb_enc_from_index(encidx); +} + +static rb_encoding * +get_encoding(VALUE str) +{ + return get_actual_encoding(ENCODING_GET(str), str); +} static int fstring_cmp(VALUE a, VALUE b); @@ -4750,8 +4788,8 @@ rb_str_buf_cat_escaped_char(VALUE result https://github.com/ruby/ruby/blob/trunk/ruby_2_1/string.c#L4788 VALUE rb_str_inspect(VALUE str) { - rb_encoding *enc = STR_ENC_GET(str); - int encidx = rb_enc_to_index(enc); + int encidx = ENCODING_GET(str); + rb_encoding *enc = rb_enc_from_index(encidx), *actenc; const char *p, *pend, *prev; char buf[CHAR_ESC_LEN + 1]; VALUE result = rb_str_buf_new(0); @@ -4766,27 +4804,10 @@ rb_str_inspect(VALUE str) https://github.com/ruby/ruby/blob/trunk/ruby_2_1/string.c#L4804 p = RSTRING_PTR(str); pend = RSTRING_END(str); prev = p; - if (encidx == ENCINDEX_UTF_16 && p + 2 <= pend) { - const unsigned char *q = (const unsigned char *)p; - if (q[0] == 0xFE && q[1] == 0xFF) - enc = rb_enc_from_index(ENCINDEX_UTF_16BE); - else if (q[0] == 0xFF && q[1] == 0xFE) - enc = rb_enc_from_index(ENCINDEX_UTF_16LE); - else { - enc = rb_ascii8bit_encoding(); - unicode_p = 0; - } - } - else if (encidx == ENCINDEX_UTF_32 && p + 4 <= pend) { - const unsigned char *q = (const unsigned char *)p; - if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) - enc = rb_enc_from_index(ENCINDEX_UTF_32BE); - else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) - enc = rb_enc_from_index(ENCINDEX_UTF_32LE); - else { - enc = rb_ascii8bit_encoding(); - unicode_p = 0; - } + actenc = get_actual_encoding(encidx, str); + if (actenc != enc) { + enc = actenc; + if (unicode_p) unicode_p = rb_enc_unicode_p(enc); } while (p < pend) { unsigned int c, cc; Index: ruby_2_1/version.h =================================================================== --- ruby_2_1/version.h (revision 45073) +++ ruby_2_1/version.h (revision 45074) @@ -1,6 +1,6 @@ https://github.com/ruby/ruby/blob/trunk/ruby_2_1/version.h#L1 #define RUBY_VERSION "2.1.1" #define RUBY_RELEASE_DATE "2014-02-21" -#define RUBY_PATCHLEVEL 40 +#define RUBY_PATCHLEVEL 41 #define RUBY_RELEASE_YEAR 2014 #define RUBY_RELEASE_MONTH 2 Index: ruby_2_1/test/ruby/test_m17n.rb =================================================================== --- ruby_2_1/test/ruby/test_m17n.rb (revision 45073) +++ ruby_2_1/test/ruby/test_m17n.rb (revision 45074) @@ -226,24 +226,35 @@ class TestM17N < Test::Unit::TestCase https://github.com/ruby/ruby/blob/trunk/ruby_2_1/test/ruby/test_m17n.rb#L226 end end - def test_utf_16_32_inspect - str = "\u3042" - %w/UTF-16 UTF-32/.each do |enc| - %w/BE LE/.each do |endian| - s = str.encode(enc + endian) + STR_WITHOUT_BOM = "\u3042".freeze + STR_WITH_BOM = "\uFEFF\u3042".freeze + bug8940 = '[ruby-core:59757] [Bug #8940]' + bug9415 = '[ruby-dev:47895] [Bug #9415]' + %w/UTF-16 UTF-32/.each do |enc| + %w/BE LE/.each do |endian| + bom = "\uFEFF".encode("#{enc}#{endian}").force_encoding(enc) + + define_method("test_utf_16_32_inspect(#{enc}#{endian})") do + s = STR_WITHOUT_BOM.encode(enc + endian) # When a UTF-16/32 string doesn't have a BOM, # inspect as a dummy encoding string. assert_equal(s.dup.force_encoding("ISO-2022-JP").inspect, s.dup.force_encoding(enc).inspect) + assert_normal_exit("#{bom.b.dump}.force_encoding('#{enc}').inspect", bug8940) end - end - str = "\uFEFF\u3042" - %w/UTF-16 UTF-32/.each do |enc| - %w/BE LE/.each do |endian| - s = str.encode(enc + endian) - # When a UTF-16/32 string doesn't have a BOM, - # inspect as a dummy encoding string. + define_method("test_utf_16_32_codepoints(#{enc}#{endian})") do + assert_equal([0xFEFF], bom.codepoints, bug9415) + end + + define_method("test_utf_16_32_ord(#{enc}#{endian})") do + assert_equal(0xFEFF, bom.ord, bug9415) + end + + define_method("test_utf_16_32_inspect(#{enc}#{endian}-BOM)") do + s = STR_WITH_BOM.encode(enc + endian) + # When a UTF-16/32 string has a BOM, + # inspect as a particular encoding string. assert_equal(s.inspect, s.dup.force_encoding(enc).inspect) end Property changes on: ruby_2_1 ___________________________________________________________________ Modified: svn:mergeinfo Merged /trunk:r44604-44606 -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml/