ruby-changes:39389
From: nobu <ko1@a...>
Date: Mon, 3 Aug 2015 10:08:59 +0900 (JST)
Subject: [ruby-changes:39389] nobu:r51470 (trunk): re.c: fix for wide character encodings
nobu 2015-08-03 10:08:36 +0900 (Mon, 03 Aug 2015) New Revision: 51470 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=revision&revision=51470 Log: re.c: fix for wide character encodings * re.c (rb_memsearch): should match only char boundaries in wide character encodings. [ruby-core:70220] [Bug #11413] Modified files: trunk/ChangeLog trunk/re.c trunk/string.c trunk/test/ruby/test_m17n.rb Index: re.c =================================================================== --- re.c (revision 51469) +++ re.c (revision 51470) @@ -221,6 +221,32 @@ rb_memsearch_qs_utf8(const unsigned char https://github.com/ruby/ruby/blob/trunk/re.c#L221 return -1; } +static inline long +rb_memsearch_wchar(const unsigned char *xs, long m, const unsigned char *ys, long n) +{ + const unsigned char *x = xs, x0 = *xs, *y = ys; + enum {char_size = 2}; + + for (n -= m; n > 0; n -= char_size, y += char_size) { + if (x0 == *y && memcmp(x+1, y+1, m-1) == 0) + return y - ys; + } + return -1; +} + +static inline long +rb_memsearch_qchar(const unsigned char *xs, long m, const unsigned char *ys, long n) +{ + const unsigned char *x = xs, x0 = *xs, *y = ys; + enum {char_size = 4}; + + for (n -= m; n > 0; n -= char_size, y += char_size) { + if (x0 == *y && memcmp(x+1, y+1, m-1) == 0) + return y - ys; + } + return -1; +} + long rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc) { @@ -241,15 +267,21 @@ rb_memsearch(const void *x0, long m, con https://github.com/ruby/ruby/blob/trunk/re.c#L267 else return -1; } - else if (m <= SIZEOF_VALUE) { - return rb_memsearch_ss(x0, m, y0, n); + else if (LIKELY(rb_enc_mbminlen(enc) == 1)) { + if (m <= SIZEOF_VALUE) { + return rb_memsearch_ss(x0, m, y0, n); + } + else if (enc == rb_utf8_encoding()){ + return rb_memsearch_qs_utf8(x0, m, y0, n); + } } - else if (enc == rb_utf8_encoding()){ - return rb_memsearch_qs_utf8(x0, m, y0, n); + else if (LIKELY(rb_enc_mbminlen(enc) == 2)) { + return rb_memsearch_wchar(x0, m, y0, n); } - else { - return rb_memsearch_qs(x0, m, y0, n); + else if (LIKELY(rb_enc_mbminlen(enc) == 4)) { + return rb_memsearch_qchar(x0, m, y0, n); } + return rb_memsearch_qs(x0, m, y0, n); } #define REG_LITERAL FL_USER5 Index: ChangeLog =================================================================== --- ChangeLog (revision 51469) +++ ChangeLog (revision 51470) @@ -1,3 +1,8 @@ https://github.com/ruby/ruby/blob/trunk/ChangeLog#L1 +Mon Aug 3 10:08:33 2015 Nobuyoshi Nakada <nobu@r...> + + * re.c (rb_memsearch): should match only char boundaries in wide + character encodings. [ruby-core:70220] [Bug #11413] + Sun Aug 2 07:01:17 2015 Eric Wong <e@8...> * ext/openssl/lib/openssl/buffering.rb (gets): Index: string.c =================================================================== --- string.c (revision 51469) +++ string.c (revision 51470) @@ -6544,15 +6544,10 @@ rb_str_split_m(int argc, VALUE *argv, VA https://github.com/ruby/ruby/blob/trunk/string.c#L6544 } enc = STR_ENC_GET(str); - if (NIL_P(spat)) { - if (!NIL_P(rb_fs)) { - spat = rb_fs; - goto fs_set; - } + if (NIL_P(spat) && NIL_P(spat = rb_fs)) { split_type = awk; } else { - fs_set: spat = get_pat_quoted(spat, 0); if (BUILTIN_TYPE(spat) == T_STRING) { rb_encoding *enc2 = STR_ENC_GET(spat); Index: test/ruby/test_m17n.rb =================================================================== --- test/ruby/test_m17n.rb (revision 51469) +++ test/ruby/test_m17n.rb (revision 51470) @@ -1236,6 +1236,9 @@ class TestM17N < Test::Unit::TestCase https://github.com/ruby/ruby/blob/trunk/test/ruby/test_m17n.rb#L1236 each_encoding("abc,def", ",", "abc", "def") do |str, sep, *expected| assert_equal(expected, str.split(sep, -1)) end + each_encoding("abc\0def", "\0", "abc", "def") do |str, sep, *expected| + assert_equal(expected, str.split(sep, -1)) + end end def test_nonascii_method_name -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml/