ruby-changes:47229
From: nobu <ko1@a...>
Date: Sun, 16 Jul 2017 22:39:25 +0900 (JST)
Subject: [ruby-changes:47229] nobu:r59344 (trunk): parse.y: utf-8 codepoints
nobu 2017-07-16 22:39:18 +0900 (Sun, 16 Jul 2017)

  New Revision: 59344

  https://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=revision&revision=59344

  Log:
    parse.y: utf-8 codepoints

    * parse.y (parser_tokadd_utf8): skip spaces in the current line,
      without advancing the line, to get rid of dangling pointer.
      [ruby-core:82029] [Bug #13742]

  Modified files:
    trunk/parse.y
    trunk/test/ruby/test_parse.rb

Index: parse.y
===================================================================
--- parse.y (revision 59343)
+++ parse.y (revision 59344)
@@ -5652,6 +5652,7 @@ parser_tokadd_codepoint(struct parser_pa https://github.com/ruby/ruby/blob/trunk/parse.y#L5652
 {
     size_t numlen;
     int codepoint = scan_hex(lex_p, wide ? 6 : 4, &numlen);
+    lex_p += numlen;
     if (wide ? (numlen == 0) : (numlen < 4)) {
         yyerror("invalid Unicode escape");
         return FALSE;
@@ -5664,12 +5665,20 @@ parser_tokadd_codepoint(struct parser_pa https://github.com/ruby/ruby/blob/trunk/parse.y#L5665
         yyerror("invalid Unicode codepoint");
         return FALSE;
     }
-    lex_p += numlen;
     if (regexp_literal) {
         tokcopy((int)numlen);
     }
     else if (codepoint >= 0x80) {
-        *encp = rb_utf8_encoding();
+        rb_encoding *utf8 = rb_utf8_encoding();
+        if (*encp && utf8 != *encp) {
+            static const char mixed_utf8[] = "UTF-8 mixed within %s source";
+            size_t len = sizeof(mixed_utf8) - 2 + strlen(rb_enc_name(*encp));
+            char *mesg = alloca(len);
+            snprintf(mesg, len, mixed_utf8, rb_enc_name(*encp));
+            yyerror(mesg);
+            return TRUE;
+        }
+        *encp = utf8;
         tokaddmbc(codepoint, *encp);
     }
     else {
@@ -5696,19 +5705,23 @@ parser_tokadd_utf8(struct parser_params https://github.com/ruby/ruby/blob/trunk/parse.y#L5705
 
     if (peek(open_brace)) { /* handle \u{...} form */
         int c, last = nextc();
-        do c = nextc(); while (ISSPACE(c));
-        pushback(c);
+        if (lex_p >= lex_pend) goto unterminated;
+        while (ISSPACE(c = *lex_p) && ++lex_p < lex_pend);
         while (!string_literal || c != close_brace) {
             if (regexp_literal) tokadd(last);
             if (!parser_tokadd_codepoint(parser, encp, regexp_literal, TRUE)) {
                 return 0;
             }
-            while (ISSPACE(c = nextc())) last = c;
-            pushback(c);
+            while (ISSPACE(c = *lex_p)) {
+                if (++lex_p >= lex_pend) goto unterminated;
+                last = c;
+            }
             if (!string_literal) break;
         }
 
         if (c != close_brace) {
+          unterminated:
+            parser->tokp = lex_p;
             yyerror("unterminated Unicode escape");
             return 0;
         }
@@ -5999,8 +6012,7 @@ parser_tokadd_string(struct parser_param https://github.com/ruby/ruby/blob/trunk/parse.y#L6012
                      rb_encoding **encp)
 {
     int c;
-    int has_nonascii = 0;
-    rb_encoding *enc = *encp;
+    rb_encoding *enc = 0;
     char *errbuf = 0;
     static const char mixed_msg[] = "%s mixed within %s source";
 
@@ -6044,9 +6056,8 @@ parser_tokadd_string(struct parser_param https://github.com/ruby/ruby/blob/trunk/parse.y#L6056
                 }
             }
             else if (c == '\\') {
-                const char *beg = lex_p - 1;
 #ifndef RIPPER
-                parser->tokp = beg;
+                parser->tokp = lex_p - 1;
 #endif
                 c = nextc();
                 switch (c) {
@@ -6065,11 +6076,10 @@ parser_tokadd_string(struct parser_param https://github.com/ruby/ruby/blob/trunk/parse.y#L6076
                         tokadd('\\');
                         break;
                     }
-                    parser_tokadd_utf8(parser, &enc, 1,
-                                       func & STR_FUNC_SYMBOL,
-                                       func & STR_FUNC_REGEXP);
-                    if (has_nonascii && enc != *encp) {
-                        mixed_escape(beg, enc, *encp);
+                    if (!parser_tokadd_utf8(parser, &enc, term,
+                                            func & STR_FUNC_SYMBOL,
+                                            func & STR_FUNC_REGEXP)) {
+                        return -1;
                     }
                     continue;
 
@@ -6087,8 +6097,8 @@ parser_tokadd_string(struct parser_param https://github.com/ruby/ruby/blob/trunk/parse.y#L6097
                     pushback(c);
                     if ((c = tokadd_escape(&enc)) < 0)
                         return -1;
-                    if (has_nonascii && enc != *encp) {
-                        mixed_escape(beg, enc, *encp);
+                    if (enc && enc != *encp) {
+                        mixed_escape(parser->tokp+2, enc, *encp);
                     }
                     continue;
                 }
@@ -6109,8 +6119,10 @@ parser_tokadd_string(struct parser_param https://github.com/ruby/ruby/blob/trunk/parse.y#L6119
         }
         else if (!parser_isascii()) {
           non_ascii:
-            has_nonascii = 1;
-            if (enc != *encp) {
+            if (!enc) {
+                enc = *encp;
+            }
+            else if (enc != *encp) {
                 mixed_error(enc, *encp);
                 continue;
             }
@@ -6122,15 +6134,17 @@ parser_tokadd_string(struct parser_param https://github.com/ruby/ruby/blob/trunk/parse.y#L6134
                 break;
             }
             if (c & 0x80) {
-                has_nonascii = 1;
-                if (enc != *encp) {
+                if (!enc) {
+                    enc = *encp;
+                }
+                else if (enc != *encp) {
                     mixed_error(enc, *encp);
                     continue;
                 }
             }
         tokadd(c);
     }
-    *encp = enc;
+    if (enc) *encp = enc;
     return c;
 }
 
@@ -7460,7 +7474,8 @@ parse_qmark(struct parser_params *parser https://github.com/ruby/ruby/blob/trunk/parse.y#L7474
     else if (c == '\\') {
         if (peek('u')) {
             nextc();
-            if (!parser_tokadd_utf8(parser, &enc, 0, 0, 0))
+            enc = rb_utf8_encoding();
+            if (!parser_tokadd_utf8(parser, &enc, -1, 0, 0))
                 return 0;
         }
         else if (!lex_eol_p() && !(c = *lex_p, ISASCII(c))) {
Index: test/ruby/test_parse.rb
===================================================================
--- test/ruby/test_parse.rb (revision 59343)
+++ test/ruby/test_parse.rb (revision 59344)
@@ -498,6 +498,9 @@ class TestParse < Test::Unit::TestCase https://github.com/ruby/ruby/blob/trunk/test/ruby/test_parse.rb#L498
     e = assert_syntax_error('"\C1"', /escape character syntax/)
     assert_equal(' ^~~', e.message.lines.last, mesg)
 
+    src = '"\xD0\u{90'"\n""000000000000000000000000"
+    assert_syntax_error(src, /:#{__LINE__}: unterminated/o)
+
     assert_equal("\x81", eval('"\C-\M-a"'))
     assert_equal("\177", eval('"\c?"'))
   end

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/