[前][次][番号順一覧][スレッド一覧]

ruby-changes:72077

From: Jeremy <ko1@a...>
Date: Tue, 7 Jun 2022 05:50:17 +0900 (JST)
Subject: [ruby-changes:72077] ec3542229b (master): Ignore invalid escapes in regexp comments

https://git.ruby-lang.org/ruby.git/commit/?id=ec3542229b

From ec3542229b29ec93062e9d90e877ea29d3c19472 Mon Sep 17 00:00:00 2001
From: Jeremy Evans <code@j...>
Date: Mon, 6 Jun 2022 13:50:03 -0700
Subject: Ignore invalid escapes in regexp comments

Invalid escapes are handled at multiple levels.  The first level
is in parse.y, so skip invalid unicode escape checks for regexps
in parse.y.

Make rb_reg_preprocess and unescape_nonascii accept the regexp
options.  In unescape_nonascii, if the regexp is an extended
regexp, when "#" is encountered, ignore all characters until the
end of line or end of regexp.

Unfortunately, in extended regexps, you can use "#" as a non-comment
character inside a character class, so also parse "[" and "]"
specially for extended regexps, and only skip comments if "#" is
not inside a character class. Handle nested character classes as well.

This issue doesn't just affect extended regexps; it also affects
"(?#" comments inside all regexps.  So for those comments, scan
until the trailing ")" and ignore the content inside.

I'm not sure if there are other corner cases not handled.  A
better fix would be to redesign the regexp parser so that it
unescapes during parsing instead of before parsing, so you already
know the current parsing state.

Fixes [Bug #18294]

Co-authored-by: Nobuyoshi Nakada <nobu@r...>
---
 parse.y                  | 26 ++++++++++--------
 re.c                     | 71 ++++++++++++++++++++++++++++++++++++++++++------
 test/ruby/test_regexp.rb | 53 ++++++++++++++++++++++++++++++++++++
 3 files changed, 131 insertions(+), 19 deletions(-)

diff --git a/parse.y b/parse.y
index 0bf717aa95..141c4a6739 100644
--- a/parse.y
+++ b/parse.y
@@ -6803,17 +6803,21 @@ tokadd_codepoint(struct parser_params *p, rb_encoding **encp, https://github.com/ruby/ruby/blob/trunk/parse.y#L6803
     int codepoint = scan_hex(p->lex.pcur, wide ? p->lex.pend - p->lex.pcur : 4, &numlen);
     literal_flush(p, p->lex.pcur);
     p->lex.pcur += numlen;
-    if (wide ? (numlen == 0 || numlen > 6) : (numlen < 4))  {
-	yyerror0("invalid Unicode escape");
-	return wide && numlen > 0;
-    }
-    if (codepoint > 0x10ffff) {
-	yyerror0("invalid Unicode codepoint (too large)");
-	return wide;
-    }
-    if ((codepoint & 0xfffff800) == 0xd800) {
-	yyerror0("invalid Unicode codepoint");
-	return wide;
+    if (p->lex.strterm == NULL ||
+        (p->lex.strterm->flags & STRTERM_HEREDOC) ||
+        (p->lex.strterm->u.literal.u1.func != str_regexp)) {
+        if (wide ? (numlen == 0 || numlen > 6) : (numlen < 4))  {
+            yyerror0("invalid Unicode escape");
+            return wide && numlen > 0;
+        }
+        if (codepoint > 0x10ffff) {
+            yyerror0("invalid Unicode codepoint (too large)");
+            return wide;
+        }
+        if ((codepoint & 0xfffff800) == 0xd800) {
+            yyerror0("invalid Unicode codepoint");
+            return wide;
+        }
     }
     if (regexp_literal) {
 	tokcopy(p, (int)numlen);
diff --git a/re.c b/re.c
index 3211a47333..2093d1ace4 100644
--- a/re.c
+++ b/re.c
@@ -1527,7 +1527,7 @@ rb_reg_fixed_encoding_p(VALUE re) https://github.com/ruby/ruby/blob/trunk/re.c#L1527
 
 static VALUE
 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
-        rb_encoding **fixed_enc, onig_errmsg_buffer err);
+        rb_encoding **fixed_enc, onig_errmsg_buffer err, int options);
 
 NORETURN(static void reg_enc_error(VALUE re, VALUE str));
 
@@ -1608,7 +1608,7 @@ rb_reg_prepare_re0(VALUE re, VALUE str, onig_errmsg_buffer err) https://github.com/ruby/ruby/blob/trunk/re.c#L1608
 
     unescaped = rb_reg_preprocess(
 	pattern, pattern + RREGEXP_SRC_LEN(re), enc,
-	&fixed_enc, err);
+        &fixed_enc, err, 0);
 
     if (NIL_P(unescaped)) {
 	rb_raise(rb_eArgError, "regexp preprocess failed: %s", err);
@@ -2718,10 +2718,11 @@ unescape_unicode_bmp(const char **pp, const char *end, https://github.com/ruby/ruby/blob/trunk/re.c#L2718
 static int
 unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
         VALUE buf, rb_encoding **encp, int *has_property,
-        onig_errmsg_buffer err)
+        onig_errmsg_buffer err, int options)
 {
     unsigned char c;
     char smallbuf[2];
+    int in_char_class = 0;
 
     while (p < end) {
         int chlen = rb_enc_precise_mbclen(p, end, enc);
@@ -2833,6 +2834,60 @@ escape_asis: https://github.com/ruby/ruby/blob/trunk/re.c#L2834
             }
             break;
 
+          case '#':
+            if ((options & ONIG_OPTION_EXTEND) && !in_char_class) {
+                /* consume and ignore comment in extended regexp */
+                while ((p < end) && ((c = *p++) != '\n'));
+                break;
+            }
+            rb_str_buf_cat(buf, (char *)&c, 1);
+            break;
+          case '[':
+            in_char_class++;
+            rb_str_buf_cat(buf, (char *)&c, 1);
+            break;
+          case ']':
+            if (in_char_class) {
+                in_char_class--;
+            }
+            rb_str_buf_cat(buf, (char *)&c, 1);
+            break;
+          case '(':
+            if (!in_char_class && p + 1 < end && *p == '?' && *(p+1) == '#') {
+                /* (?# is comment inside any regexp, and content inside should be ignored */
+                const char *orig_p = p;
+                int cont = 1;
+
+                while (cont && (p < end)) {
+                    switch (c = *p++) {
+		      default:
+                        if (!(c & 0x80)) break;
+			--p;
+			/* fallthrough */
+                      case '\\':
+                        chlen = rb_enc_precise_mbclen(p, end, enc);
+                        if (!MBCLEN_CHARFOUND_P(chlen)) {
+                            goto invalid_multibyte;
+                        }
+                        p += MBCLEN_CHARFOUND_LEN(chlen);
+                        break;
+                      case ')':
+                        cont = 0;
+                        break;
+                    }
+                }
+
+                if (cont) {
+                    /* unterminated (?#, rewind so it is syntax error */
+                    p = orig_p;
+                    c = '(';
+                    rb_str_buf_cat(buf, (char *)&c, 1);
+                }
+            }
+            else {
+                rb_str_buf_cat(buf, (char *)&c, 1);
+            }
+            break;
           default:
             rb_str_buf_cat(buf, (char *)&c, 1);
             break;
@@ -2844,7 +2899,7 @@ escape_asis: https://github.com/ruby/ruby/blob/trunk/re.c#L2899
 
 static VALUE
 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
-        rb_encoding **fixed_enc, onig_errmsg_buffer err)
+        rb_encoding **fixed_enc, onig_errmsg_buffer err, int options)
 {
     VALUE buf;
     int has_property = 0;
@@ -2858,7 +2913,7 @@ rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc, https://github.com/ruby/ruby/blob/trunk/re.c#L2913
         rb_enc_associate(buf, enc);
     }
 
-    if (unescape_nonascii(p, end, enc, buf, fixed_enc, &has_property, err) != 0)
+    if (unescape_nonascii(p, end, enc, buf, fixed_enc, &has_property, err, options) != 0)
         return Qnil;
 
     if (has_property && !*fixed_enc) {
@@ -2886,7 +2941,7 @@ rb_reg_check_preprocess(VALUE str) https://github.com/ruby/ruby/blob/trunk/re.c#L2941
     end = p + RSTRING_LEN(str);
     enc = rb_enc_get(str);
 
-    buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err);
+    buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err, 0);
     RB_GC_GUARD(str);
 
     if (NIL_P(buf)) {
@@ -2928,7 +2983,7 @@ rb_reg_preprocess_dregexp(VALUE ary, int options) https://github.com/ruby/ruby/blob/trunk/re.c#L2983
         p = RSTRING_PTR(str);
         end = p + RSTRING_LEN(str);
 
-        buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err);
+        buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err, options);
 
         if (NIL_P(buf))
             rb_raise(rb_eArgError, "%s", err);
@@ -2975,7 +3030,7 @@ rb_reg_initialize(VALUE obj, const char *s, long len, rb_encoding *enc, https://github.com/ruby/ruby/blob/trunk/re.c#L3030
 	return -1;
     }
 
-    unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err);
+    unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err, options);
     if (NIL_P(unescaped))
         return -1;
 
diff --git a/test/ruby/test_regexp.rb b/test/ruby/test_regexp.rb
index 84687c5380..71d56ad027 100644
--- a/test/ruby/test_regexp.rb
+++ b/test/ruby/test_regexp.rb
@@ -91,6 +91,59 @@ class TestRegexp < Test::Unit::TestCase https://github.com/ruby/ruby/blob/trunk/test/ruby/test_regexp.rb#L91
     assert_warn('', '[ruby-core:82328] [Bug #13798]') {re.to_s}
   end
 
+  def test_extended_comment_invalid_escape_bug_18294
+    assert_separately([], <<-RUBY)
+      re = / C:\\\\[a-z]{5} # e.g. C:\\users /x
+      assert_match(re, 'C:\\users')
+      assert_not_match(re, 'C:\\user')
+
+      re = /
+        foo  # \\M-ca
+        bar
+      /x
+      assert_match(re, 'foobar')
+      assert_not_match(re, 'foobaz')
+
+      re = /
+        f[#o]o  # \\M-ca
+        bar
+      /x
+      assert_match(re, 'foobar')
+      assert_not_match(re, 'foobaz')
+
+      re = /
+        f[[:alnum:]#]o  # \\M-ca
+        bar
+      /x
+      assert_match(re, 'foobar')
+      assert_not_match(re, 'foobaz')
+
+      re = /
+        f(?# \\M-ca)oo  # \\M-ca
+        bar
+      /x
+      assert_match(re, 'foobar')
+      assert_not_match(re, 'foobaz')
+
+      re = /f(?# \\M-ca)oobar/
+      assert_match(re, 'foobar')
+      assert_not_match(re, 'foobaz')
+
+      re = /[-(?# fca)]oobar/
+      assert_match(re, 'foobar')
+      assert_not_match(re, 'foobaz')
+
+      re = /f(?# ca\0\\M-ca)oobar/
+      assert_match(re, 'foobar')
+      assert_not_match(re, 'foobaz')
+    RUBY
+
+    assert_raise(SyntaxError) {eval "/\\users/x"}
+    assert_raise(SyntaxError) {eval "/[\\users]/x"}
+    assert_raise(SyntaxError) {eval "/(?<\\users)/x"}
+    assert_raise(SyntaxError) {eval "/# \\users/"}
+  (... truncated)

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/

[前][次][番号順一覧][スレッド一覧]