ruby-changes:56934
From: nagachika <ko1@a...>
Date: Sun, 11 Aug 2019 21:18:23 +0900 (JST)
Subject: [ruby-changes:56934] nagachika: ad6ffac7d6 (ruby_2_6): merge revision(s) 8aecc90974ab1ac87056f77e2cb3406c5c041504,2f6cc15cdb3d64135b29cfd5ee376a5a03ebbee7: [Backport #15965]
https://git.ruby-lang.org/ruby.git/commit/?id=ad6ffac7d6 From ad6ffac7d6121c193b95c15bc011c9969967f554 Mon Sep 17 00:00:00 2001 From: nagachika <nagachika@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> Date: Sun, 11 Aug 2019 12:17:58 +0000 Subject: merge revision(s) 8aecc90974ab1ac87056f77e2cb3406c5c041504,2f6cc15cdb3d64135b29cfd5ee376a5a03ebbee7: [Backport #15965] Hoisted out WIDE_ENCODINGS Fixed String#grapheme_clusters with wide encodings * string.c (get_reg_grapheme_cluster): make regexp from properly encoded sources for wide-char encodings. [Bug #15965] * regparse.c (node_extended_grapheme_cluster): suppress false duplicated range warning for the time being. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/ruby_2_6@67741 b2dd03c8-39d4-4d8f-98ff-823fe69b080e diff --git a/regparse.c b/regparse.c index 5e51e39..574a07e 100644 --- a/regparse.c +++ b/regparse.c @@ -5961,6 +5961,10 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env) https://github.com/ruby/ruby/blob/trunk/regparse.c#L5961 if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */ BBuf *inverted_buf = NULL; + /* TODO: fix false warning */ + const int dup_not_warned = env->warnings_flag | ~ONIG_SYN_WARN_CC_DUP; + env->warnings_flag |= ONIG_SYN_WARN_CC_DUP; + /* Start with a positive buffer and invert at the end. * Otherwise, adding single-character ranges work the wrong way. 
*/ R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env)); @@ -5968,6 +5972,8 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env) https://github.com/ruby/ruby/blob/trunk/regparse.c#L5972 R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* LF */ R_ERR(not_code_range_buf(env->enc, cc->mbuf, &inverted_buf, env)); cc->mbuf = inverted_buf; /* TODO: check what to do with buffer before inversion */ + + env->warnings_flag &= dup_not_warned; /* TODO: fix false warning */ } else { R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 1, env)); diff --git a/string.c b/string.c index f7e9c3d..1d1cd2f 100644 --- a/string.c +++ b/string.c @@ -8474,9 +8474,30 @@ get_reg_grapheme_cluster(rb_encoding *enc) https://github.com/ruby/ruby/blob/trunk/string.c#L8474 reg_grapheme_cluster = reg_grapheme_cluster_utf8; } if (!reg_grapheme_cluster) { - const OnigUChar source[] = "\\X"; + const OnigUChar source_ascii[] = "\\X"; OnigErrorInfo einfo; - int r = onig_new(&reg_grapheme_cluster, source, source + sizeof(source) - 1, + const OnigUChar *source = source_ascii; + size_t source_len = sizeof(source_ascii) - 1; + switch (encidx) { +#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x) +#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8) +#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x) +#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16) +#define CASE_UTF(e) \ + case ENCINDEX_UTF_##e: { \ + static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \ + source = source_UTF_##e; \ + source_len = sizeof(source_UTF_##e); \ + break; \ + } + CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE); +#undef CASE_UTF +#undef CHARS_16BE +#undef CHARS_16LE +#undef CHARS_32BE +#undef CHARS_32LE + } + int r = onig_new(&reg_grapheme_cluster, source, source + source_len, ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo); if (r) { UChar message[ONIG_MAX_ERROR_MESSAGE_LEN]; diff --git a/test/ruby/test_string.rb 
b/test/ruby/test_string.rb index 55dcfb5..aad4277 100644 --- a/test/ruby/test_string.rb +++ b/test/ruby/test_string.rb @@ -4,6 +4,11 @@ require 'test/unit' https://github.com/ruby/ruby/blob/trunk/test/ruby/test_string.rb#L4 class TestString < Test::Unit::TestCase ENUMERATOR_WANTARRAY = RUBY_VERSION >= "3.0.0" + WIDE_ENCODINGS = [ + Encoding::UTF_16BE, Encoding::UTF_16LE, + Encoding::UTF_32BE, Encoding::UTF_32LE, + ] + def initialize(*args) @cls = String @aref_re_nth = true @@ -667,8 +672,7 @@ CODE https://github.com/ruby/ruby/blob/trunk/test/ruby/test_string.rb#L672 assert_raise(ArgumentError) {S("mypassword").crypt(S("\0a"))} assert_raise(ArgumentError) {S("mypassword").crypt(S("a\0"))} assert_raise(ArgumentError) {S("poison\u0000null").crypt(S("aa"))} - [Encoding::UTF_16BE, Encoding::UTF_16LE, - Encoding::UTF_32BE, Encoding::UTF_32LE].each do |enc| + WIDE_ENCODINGS.each do |enc| assert_raise(ArgumentError) {S("mypassword").crypt(S("aa".encode(enc)))} assert_raise(ArgumentError) {S("mypassword".encode(enc)).crypt(S("aa"))} end @@ -1032,13 +1036,20 @@ CODE https://github.com/ruby/ruby/blob/trunk/test/ruby/test_string.rb#L1036 "\u{1F468 200D 1F393}", "\u{1F46F 200D 2642 FE0F}", "\u{1f469 200d 2764 fe0f 200d 1f469}", - ].each do |g| + ].product([Encoding::UTF_8, *WIDE_ENCODINGS]) do |g, enc| + g = g.encode(enc) assert_equal [g], g.grapheme_clusters - assert_predicate g.dup.taint.grapheme_clusters[0], :tainted? + assert_predicate g.taint.grapheme_clusters[0], :tainted? 
end - assert_equal ["\u000A", "\u0324"], "\u{a 324}".grapheme_clusters - assert_equal ["\u000D", "\u0324"], "\u{d 324}".grapheme_clusters + [ + "\u{a 324}", + "\u{d 324}", + "abc", + ].product([Encoding::UTF_8, *WIDE_ENCODINGS]) do |g, enc| + g = g.encode(enc) + assert_equal g.chars, g.grapheme_clusters + end assert_equal ["a", "b", "c"], "abc".b.grapheme_clusters if ENUMERATOR_WANTARRAY @@ -1805,10 +1816,7 @@ CODE https://github.com/ruby/ruby/blob/trunk/test/ruby/test_string.rb#L1816 def test_split_wchar bug8642 = '[ruby-core:56036] [Bug #8642]' - [ - Encoding::UTF_16BE, Encoding::UTF_16LE, - Encoding::UTF_32BE, Encoding::UTF_32LE, - ].each do |enc| + WIDE_ENCODINGS.each do |enc| s = S("abc,def".encode(enc)) assert_equal(["abc", "def"].map {|c| c.encode(enc)}, s.split(",".encode(enc)), @@ -3018,8 +3026,7 @@ CODE https://github.com/ruby/ruby/blob/trunk/test/ruby/test_string.rb#L3026 def test_ascii_incomat_inspect bug4081 = '[ruby-core:33283]' - [Encoding::UTF_16LE, Encoding::UTF_16BE, - Encoding::UTF_32LE, Encoding::UTF_32BE].each do |e| + WIDE_ENCODINGS.each do |e| assert_equal('"abc"', "abc".encode(e).inspect) assert_equal('"\\u3042\\u3044\\u3046"', "\u3042\u3044\u3046".encode(e).inspect) assert_equal('"ab\\"c"', "ab\"c".encode(e).inspect, bug4081) diff --git a/version.h b/version.h index f043008..7ff4100 100644 --- a/version.h +++ b/version.h @@ -1,10 +1,10 @@ https://github.com/ruby/ruby/blob/trunk/version.h#L1 #define RUBY_VERSION "2.6.3" #define RUBY_RELEASE_DATE RUBY_RELEASE_YEAR_STR"-"RUBY_RELEASE_MONTH_STR"-"RUBY_RELEASE_DAY_STR -#define RUBY_PATCHLEVEL 91 +#define RUBY_PATCHLEVEL 92 #define RUBY_RELEASE_YEAR 2019 #define RUBY_RELEASE_MONTH 8 -#define RUBY_RELEASE_DAY 9 +#define RUBY_RELEASE_DAY 11 #include "ruby/version.h" -- cgit v0.10.2 -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml/