ruby-changes:47229

nobu	2017-07-16 22:39:18 +0900 (Sun, 16 Jul 2017)

  New Revision: 59344

  https://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=revision&revision=59344

  Log:
    parse.y: utf-8 codepoints
    
    * parse.y (parser_tokadd_utf8): skip spaces in the current line,
      without advancing the line, to get rid of dangling pointer.
      [ruby-core:82029] [Bug #13742]

  Modified files:
    trunk/parse.y
    trunk/test/ruby/test_parse.rb
Index: parse.y
===================================================================
--- parse.y	(revision 59343)
+++ parse.y	(revision 59344)
@@ -5652,6 +5652,7 @@ parser_tokadd_codepoint(struct parser_pa https://github.com/ruby/ruby/blob/trunk/parse.y#L5652
 {
     size_t numlen;
     int codepoint = scan_hex(lex_p, wide ? 6 : 4, &numlen);
+    lex_p += numlen;
     if (wide ? (numlen == 0) : (numlen < 4))  {
 	yyerror("invalid Unicode escape");
 	return FALSE;
@@ -5664,12 +5665,20 @@ parser_tokadd_codepoint(struct parser_pa https://github.com/ruby/ruby/blob/trunk/parse.y#L5665
 	yyerror("invalid Unicode codepoint");
 	return FALSE;
     }
-    lex_p += numlen;
     if (regexp_literal) {
 	tokcopy((int)numlen);
     }
     else if (codepoint >= 0x80) {
-	*encp = rb_utf8_encoding();
+	rb_encoding *utf8 = rb_utf8_encoding();
+	if (*encp && utf8 != *encp) {
+	    static const char mixed_utf8[] = "UTF-8 mixed within %s source";
+	    size_t len = sizeof(mixed_utf8) - 2 + strlen(rb_enc_name(*encp));
+	    char *mesg = alloca(len);
+	    snprintf(mesg, len, mixed_utf8, rb_enc_name(*encp));
+	    yyerror(mesg);
+	    return TRUE;
+	}
+	*encp = utf8;
 	tokaddmbc(codepoint, *encp);
     }
     else {
@@ -5696,19 +5705,23 @@ parser_tokadd_utf8(struct parser_params https://github.com/ruby/ruby/blob/trunk/parse.y#L5705
 
     if (peek(open_brace)) {  /* handle \u{...} form */
 	int c, last = nextc();
-	do c = nextc(); while (ISSPACE(c));
-	pushback(c);
+	if (lex_p >= lex_pend) goto unterminated;
+	while (ISSPACE(c = *lex_p) && ++lex_p < lex_pend);
 	while (!string_literal || c != close_brace) {
 	    if (regexp_literal) tokadd(last);
 	    if (!parser_tokadd_codepoint(parser, encp, regexp_literal, TRUE)) {
 		return 0;
 	    }
-	    while (ISSPACE(c = nextc())) last = c;
-	    pushback(c);
+	    while (ISSPACE(c = *lex_p)) {
+		if (++lex_p >= lex_pend) goto unterminated;
+		last = c;
+	    }
 	    if (!string_literal) break;
 	}
 
 	if (c != close_brace) {
+	  unterminated:
+	    parser->tokp = lex_p;
 	    yyerror("unterminated Unicode escape");
 	    return 0;
 	}
@@ -5999,8 +6012,7 @@ parser_tokadd_string(struct parser_param https://github.com/ruby/ruby/blob/trunk/parse.y#L6012
 		     rb_encoding **encp)
 {
     int c;
-    int has_nonascii = 0;
-    rb_encoding *enc = *encp;
+    rb_encoding *enc = 0;
     char *errbuf = 0;
     static const char mixed_msg[] = "%s mixed within %s source";
 
@@ -6044,9 +6056,8 @@ parser_tokadd_string(struct parser_param https://github.com/ruby/ruby/blob/trunk/parse.y#L6056
 	    }
 	}
 	else if (c == '\\') {
-	    const char *beg = lex_p - 1;
 #ifndef RIPPER
-	    parser->tokp = beg;
+	    parser->tokp = lex_p - 1;
 #endif
 	    c = nextc();
 	    switch (c) {
@@ -6065,11 +6076,10 @@ parser_tokadd_string(struct parser_param https://github.com/ruby/ruby/blob/trunk/parse.y#L6076
 		    tokadd('\\');
 		    break;
 		}
-		parser_tokadd_utf8(parser, &enc, 1,
-				   func & STR_FUNC_SYMBOL,
-				   func & STR_FUNC_REGEXP);
-		if (has_nonascii && enc != *encp) {
-		    mixed_escape(beg, enc, *encp);
+		if (!parser_tokadd_utf8(parser, &enc, term,
+					func & STR_FUNC_SYMBOL,
+					func & STR_FUNC_REGEXP)) {
+		    return -1;
 		}
 		continue;
 
@@ -6087,8 +6097,8 @@ parser_tokadd_string(struct parser_param https://github.com/ruby/ruby/blob/trunk/parse.y#L6097
 		    pushback(c);
 		    if ((c = tokadd_escape(&enc)) < 0)
 			return -1;
-		    if (has_nonascii && enc != *encp) {
-			mixed_escape(beg, enc, *encp);
+		    if (enc && enc != *encp) {
+			mixed_escape(parser->tokp+2, enc, *encp);
 		    }
 		    continue;
 		}
@@ -6109,8 +6119,10 @@ parser_tokadd_string(struct parser_param https://github.com/ruby/ruby/blob/trunk/parse.y#L6119
 	}
 	else if (!parser_isascii()) {
 	  non_ascii:
-	    has_nonascii = 1;
-	    if (enc != *encp) {
+	    if (!enc) {
+		enc = *encp;
+	    }
+	    else if (enc != *encp) {
 		mixed_error(enc, *encp);
 		continue;
 	    }
@@ -6122,15 +6134,17 @@ parser_tokadd_string(struct parser_param https://github.com/ruby/ruby/blob/trunk/parse.y#L6134
 	    break;
 	}
         if (c & 0x80) {
-	    has_nonascii = 1;
-	    if (enc != *encp) {
+	    if (!enc) {
+		enc = *encp;
+	    }
+	    else if (enc != *encp) {
 		mixed_error(enc, *encp);
 		continue;
 	    }
         }
 	tokadd(c);
     }
-    *encp = enc;
+    if (enc) *encp = enc;
     return c;
 }
 
@@ -7460,7 +7474,8 @@ parse_qmark(struct parser_params *parser https://github.com/ruby/ruby/blob/trunk/parse.y#L7474
     else if (c == '\\') {
 	if (peek('u')) {
 	    nextc();
-	    if (!parser_tokadd_utf8(parser, &enc, 0, 0, 0))
+	    enc = rb_utf8_encoding();
+	    if (!parser_tokadd_utf8(parser, &enc, -1, 0, 0))
 		return 0;
 	}
 	else if (!lex_eol_p() && !(c = *lex_p, ISASCII(c))) {
Index: test/ruby/test_parse.rb
===================================================================
--- test/ruby/test_parse.rb	(revision 59343)
+++ test/ruby/test_parse.rb	(revision 59344)
@@ -498,6 +498,9 @@ class TestParse < Test::Unit::TestCase https://github.com/ruby/ruby/blob/trunk/test/ruby/test_parse.rb#L498
     e = assert_syntax_error('"\C1"', /escape character syntax/)
     assert_equal(' ^~~', e.message.lines.last, mesg)
 
+    src = '"\xD0\u{90'"\n""000000000000000000000000"
+    assert_syntax_error(src, /:#{__LINE__}: unterminated/o)
+
     assert_equal("\x81", eval('"\C-\M-a"'))
     assert_equal("\177", eval('"\c?"'))
   end

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/