ruby-changes:2640
From: ko1@a...
Date: 8 Dec 2007 11:51:03 +0900
Subject: [ruby-changes:2640] akr - Ruby:r14131 (trunk): * encoding.c (rb_enc_mbclen): make it never fail.
akr 2007-12-08 11:50:43 +0900 (Sat, 08 Dec 2007) New Revision: 14131 Modified files: trunk/ChangeLog trunk/encoding.c trunk/ext/tk/sample/tkextlib/vu/canvSticker2.rb trunk/include/ruby/encoding.h trunk/include/ruby/regex.h trunk/parse.y trunk/re.c trunk/string.c trunk/test/ruby/test_m17n.rb trunk/test/ruby/test_regexp.rb Log: * encoding.c (rb_enc_mbclen): make it never fail. (rb_enc_nth): don't check the return value of rb_enc_mbclen. (rb_enc_strlen): ditto. (rb_enc_precise_mbclen): return needmore(1) if e <= p. (rb_enc_get_ascii): new function for extracting ASCII character. * include/ruby/encoding.h (rb_enc_get_ascii): declared. * include/ruby/regex.h (ismbchar): removed. * re.c (rb_reg_expr_str): use rb_enc_get_ascii. (unescape_escaped_nonascii): use rb_enc_precise_mbclen to determine the termination of escaped non-ASCII character. (unescape_nonascii): use rb_enc_precise_mbclen. (rb_reg_quote): use rb_enc_get_ascii. (rb_reg_regsub): use rb_enc_get_ascii. * string.c (rb_str_reverse): don't check the return value of rb_enc_mbclen. (rb_str_split_m): don't call rb_enc_mbclen with e <= p. * parse.y (is_identchar): use ISASCII. (parser_ismbchar): removed. (parser_precise_mbclen): new macro. (parser_isascii): new macro. (parser_tokadd_mbchar): use parser_precise_mbclen to check invalid character precisely. (parser_tokadd_string): use parser_isascii. (parser_yylex): ditto. (is_special_global_name): don't call is_identchar with e <= p. (rb_enc_symname_p): ditto. [ruby-dev:32455] * ext/tk/sample/tkextlib/vu/canvSticker2.rb: remove coding cookie because the encoding is not UTF-8. 
[ruby-dev:32475] http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/string.c?r1=14131&r2=14130 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/parse.y?r1=14131&r2=14130 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/ChangeLog?r1=14131&r2=14130 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/include/ruby/encoding.h?r1=14131&r2=14130 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/test/ruby/test_regexp.rb?r1=14131&r2=14130 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/ext/tk/sample/tkextlib/vu/canvSticker2.rb?r1=14131&r2=14130 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/re.c?r1=14131&r2=14130 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/include/ruby/regex.h?r1=14131&r2=14130 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/encoding.c?r1=14131&r2=14130 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/test/ruby/test_m17n.rb?r1=14131&r2=14130 Index: encoding.c =================================================================== --- encoding.c (revision 14130) +++ encoding.c (revision 14131) @@ -459,7 +459,6 @@ for (c=0; p<e && nth--; c++) { int n = rb_enc_mbclen(p, e, enc); - if (n == 0) return 0; p += n; } } @@ -478,7 +477,6 @@ for (c=0; p<e; c++) { int n = rb_enc_mbclen(p, e, enc); - if (n == 0) return -1; p += n; } return c; @@ -487,19 +485,39 @@ int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc) { - int n = ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); - if (n == 0) { - rb_raise(rb_eArgError, "invalid mbstring sequence"); - } - return n; + int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); + if (MBCLEN_CHARFOUND(n)) + return n; + else + return 1; } int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc) { + if (e <= p) + return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1); return ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); } +int rb_enc_get_ascii(const char *p, const char *e, rb_encoding *enc) +{ + int c, l; + if (e <= p) + return -1; + if (rb_enc_asciicompat(enc)) { + c = (unsigned char)*p; 
+ return ISASCII(c) ? c : -1; + } + l = rb_enc_precise_mbclen(p, e, enc); + if (!MBCLEN_CHARFOUND(l)) + return -1; + c = rb_enc_codepoint(p, e, enc); + if (rb_enc_isascii(c, enc)) + return c; + return -1; +} + int rb_enc_codelen(int c, rb_encoding *enc) { Index: include/ruby/regex.h =================================================================== --- include/ruby/regex.h (revision 14130) +++ include/ruby/regex.h (revision 14131) @@ -29,7 +29,6 @@ ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding; -#define ismbchar(p, e, enc) (mbclen((p),(e),(enc)) != 1) #define mbclen(p,e,enc) rb_enc_mbclen((p),(e),(enc)) #endif /* ifndef ONIG_RUBY_M17N */ Index: include/ruby/encoding.h =================================================================== --- include/ruby/encoding.h (revision 14130) +++ include/ruby/encoding.h (revision 14131) @@ -77,6 +77,9 @@ #define MBCLEN_INVALID(ret) ONIGENC_MBCLEN_INVALID(ret) #define MBCLEN_NEEDMORE(ret) ONIGENC_MBCLEN_NEEDMORE(ret) +/* ptr,endptr,encoding -> 0x00..0x7f, -1 */ +int rb_enc_get_ascii(const char*, const char *, rb_encoding*); + /* code,encoding -> codelen */ int rb_enc_codelen(int, rb_encoding*); Index: re.c =================================================================== --- re.c (revision 14130) +++ re.c (revision 14131) @@ -218,10 +218,12 @@ rb_encoding *enc = rb_enc_get(str); const char *p, *pend; int need_escape = 0; + int c; p = s; pend = p + len; while (p<pend) { - if (*p == '/' || (!rb_enc_isprint(*p, enc) && !ismbchar(p, pend, enc))) { + c = rb_enc_get_ascii(p, pend, enc); + if (c == '/' || (c != -1 && !rb_enc_isprint(c, enc))) { need_escape = 1; break; } @@ -233,29 +235,31 @@ else { p = s; while (p<pend) { - if (*p == '\\') { + c = rb_enc_get_ascii(p, pend, enc); + if (c == '\\') { int n = mbclen(p+1, pend, enc) + 1; rb_str_buf_cat(str, p, n); p += n; continue; } - else if (*p == '/') { + else if (c == '/') { char c = '\\'; rb_str_buf_cat(str, &c, 1); rb_str_buf_cat(str, p, 1); } - else if (ismbchar(p, pend, 
enc)) { - rb_str_buf_cat(str, p, mbclen(p, pend, enc)); - p += mbclen(p, pend, enc); + else if (c == -1) { + int l = mbclen(p, pend, enc); + rb_str_buf_cat(str, p, l); + p += l; continue; } - else if (rb_enc_isprint(*p, enc)) { + else if (rb_enc_isprint(c, enc)) { rb_str_buf_cat(str, p, 1); } - else if (!rb_enc_isspace(*p, enc)) { + else if (!rb_enc_isspace(c, enc)) { char b[8]; - sprintf(b, "\\%03o", *p & 0377); + sprintf(b, "\\%03o", c); rb_str_buf_cat(str, b, 4); } else { @@ -1377,6 +1381,7 @@ char *chbuf = ALLOCA_N(char, chmaxlen); int chlen = 0; int byte; + int l; memset(chbuf, 0, chmaxlen); @@ -1386,7 +1391,8 @@ } chbuf[chlen++] = byte; - while (chlen < chmaxlen && chlen != mbclen(chbuf, chbuf+chlen, enc)) { + while (chlen < chmaxlen && + MBCLEN_NEEDMORE(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) { byte = read_escaped_byte(&p, end, err); if (byte == -1) { return -1; @@ -1394,11 +1400,11 @@ chbuf[chlen++] = byte; } - if (chlen != mbclen(chbuf, chbuf+chlen, enc)) { + l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc); + if (MBCLEN_INVALID(l)) { strcpy(err, "invalid multibyte escape"); return -1; } - if (1 < chlen || (chbuf[0] & 0x80)) { rb_str_buf_cat(buf, chbuf, chlen); @@ -1515,13 +1521,12 @@ char smallbuf[2]; while (p < end) { - int chlen = mbclen(p, end, enc); + int chlen = rb_enc_precise_mbclen(p, end, enc); + if (!MBCLEN_CHARFOUND(chlen)) { + strcpy(err, "invalid multibyte character"); + return -1; + } if (1 < chlen || (*p & 0x80)) { - if (end < p + chlen) { - strcpy(err, "too short multibyte character"); - return -1; - } - /* xxx: validate the non-ascii character */ rb_str_buf_cat(buf, p, chlen); p += chlen; if (*encp == 0) @@ -2093,8 +2098,8 @@ s = RSTRING_PTR(str); send = s + RSTRING_LEN(str); for (; s < send; s++) { - c = *s; - if (ismbchar(s, send, enc)) { + c = rb_enc_get_ascii(s, send, enc); + if (c == -1) { int n = mbclen(s, send, enc); while (n-- && s < send) @@ -2129,8 +2134,8 @@ t += s - RSTRING_PTR(str); for (; s < send; s++) { - c = *s; 
- if (ismbchar(s, send, enc)) { + c = rb_enc_get_ascii(s, send, enc); + if (c == -1) { int n = mbclen(s, send, enc); while (n-- && s < send) @@ -2397,13 +2402,14 @@ e = s + RSTRING_LEN(str); while (s < e) { + int c = rb_enc_get_ascii(s, e, enc); char *ss = s++; - if (ismbchar(ss, e, enc)) { + if (c == -1) { s += mbclen(ss, e, enc) - 1; continue; } - if (*ss != '\\' || s == e) continue; + if (c != '\\' || s == e) continue; if (!val) { val = rb_str_buf_new(ss-p); Index: ChangeLog =================================================================== --- ChangeLog (revision 14130) +++ ChangeLog (revision 14131) @@ -1,3 +1,42 @@ +Sat Dec 8 11:06:29 2007 Tanaka Akira <akr@f...> + + * encoding.c (rb_enc_mbclen): make it never fail. + (rb_enc_nth): don't check the return value of rb_enc_mbclen. + (rb_enc_strlen): ditto. + (rb_enc_precise_mbclen): return needmore(1) if e <= p. + (rb_enc_get_ascii): new function for extracting ASCII character. + + * include/ruby/encoding.h (rb_enc_get_ascii): declared. + + * include/ruby/regex.h (ismbchar): removed. + + * re.c (rb_reg_expr_str): use rb_enc_get_ascii. + (unescape_escaped_nonascii): use rb_enc_precise_mbclen to determine + the termination of escaped non-ASCII character. + (unescape_nonascii): use rb_enc_precise_mbclen. + (rb_reg_quote): use rb_enc_get_ascii. + (rb_reg_regsub): use rb_enc_get_ascii. + + * string.c (rb_str_reverse) don't check the return value of + rb_enc_mbclen. + (rb_str_split_m): don't call rb_enc_mbclen with e <= p. + + * parse.y (is_identchar): use ISASCII. + (parser_ismbchar): removed. + (parser_precise_mbclen): new macro. + (parser_isascii): new macro. + (parser_tokadd_mbchar): use parser_precise_mbclen to check invalid + character precisely. + (parser_tokadd_string): use parser_isascii. + (parser_yylex): ditto. + (is_special_global_name): don't call is_identchar with e <= p. + (rb_enc_symname_p): ditto. 
+ + [ruby-dev:32455] + + * ext/tk/sample/tkextlib/vu/canvSticker2.rb: remove coding cookie + because the encoding is not UTF-8. [ruby-dev:32475] + Fri Dec 7 20:21:35 2007 GOTOU Yuuzou <gotoyuzo@n...> * ext/openssl/lib/net/ftptls.rb, ext/openssl/lib/net/telnets.rb: Index: string.c =================================================================== --- string.c (revision 14130) +++ string.c (revision 14131) @@ -2725,9 +2725,6 @@ while (s < e) { int clen = rb_enc_mbclen(s, e, enc); - if (clen == 0) { - rb_raise(rb_eArgError, "invalid mbstring sequence"); - } p -= clen; memcpy(p, s, clen); s += clen; @@ -4079,7 +4076,10 @@ beg = start; } else { - start += rb_enc_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc); + if (RSTRING_PTR(str)+start == RSTRING_END(str)) + start++; + else + start += rb_enc_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc); last_null = 1; continue; } Index: parse.y =================================================================== --- parse.y (revision 14130) +++ parse.y (revision 14131) @@ -4583,10 +4583,12 @@ #endif #define parser_mbclen() mbclen((lex_p-1),lex_pend,parser->enc) -#define is_identchar(p,e,enc) (rb_enc_isalnum(*p,enc) || (*p) == '_' || ismbchar(p,e,enc)) -#define parser_ismbchar() ismbchar((lex_p-1), lex_pend, parser->enc) +#define parser_precise_mbclen() rb_enc_precise_mbclen((lex_p-1),lex_pend,parser->enc) +#define is_identchar(p,e,enc) (rb_enc_isalnum(*p,enc) || (*p) == '_' || !ISASCII(*p)) #define parser_is_identchar() (!parser->eofp && is_identchar((lex_p-1),lex_pend,parser->enc)) +#define parser_isascii() ISASCII(*(lex_p-1)) + static int parser_yyerror(struct parser_params *parser, const char *msg) { @@ -5305,8 +5307,8 @@ static int parser_tokadd_mbchar(struct parser_params *parser, int c) { - int len = parser_mbclen(); - if (len <= 0 || lex_p + len - 1 > lex_pend) { + int len = parser_precise_mbclen(); + if (!MBCLEN_CHARFOUND(len)) { compile_error(PARSER_ARG "illegal multibyte char"); return -1; } @@ -5414,7 +5416,7 @@ 
} } } - else if (parser_ismbchar()) { + else if (!parser_isascii()) { has_nonascii = 1; if (enc != *encp) { mixed_error(enc, *encp); @@ -6306,7 +6308,7 @@ } newtok(); enc = parser->enc; - if (parser_ismbchar()) { + if (!parser_isascii()) { if (tokadd_mbchar(c) == -1) return 0; } else if ((rb_enc_isalnum(c, parser->enc) || c == '_') && @@ -6889,7 +6891,7 @@ } else { term = nextc(); - if (rb_enc_isalnum(term, parser->enc) || parser_ismbchar()) { + if (rb_enc_isalnum(term, parser->enc) || !parser_isascii()) { yyerror("unknown type of %string"); return 0; } @@ -8693,7 +8695,7 @@ break; case '-': ++m; - if (is_identchar(m, e, enc)) { + if (m < e && is_identchar(m, e, enc)) { if (!ISASCII(*m)) mb = 1; m += rb_enc_mbclen(m, e, enc); } @@ -8776,9 +8778,9 @@ default: localid = !rb_enc_isupper(*m, enc); id: - if (*m != '_' && !rb_enc_isalpha(*m, enc) && !ismbchar(m, e, enc)) + if (m >= e || (*m != '_' && !rb_enc_isalpha(*m, enc) && ISASCII(*m))) return Qfalse; - while (is_identchar(m, e, enc)) m += rb_enc_mbclen(m, e, enc); + while (m < e && is_identchar(m, e, enc)) m += rb_enc_mbclen(m, e, enc); if (localid) { switch (*m) { case '!': case '?': case '=': ++m; Index: ext/tk/sample/tkextlib/vu/canvSticker2.rb =================================================================== --- ext/tk/sample/tkextlib/vu/canvSticker2.rb (revision 14130) +++ ext/tk/sample/tkextlib/vu/canvSticker2.rb (revision 14131) @@ -1,5 +1,4 @@ #!/usr/bin/env ruby -# -*- coding: utf-8 -*- require 'tk' require 'tkextlib/vu/charts' Index: test/ruby/test_m17n.rb =================================================================== --- test/ruby/test_m17n.rb (revision 14130) +++ test/ruby/test_m17n.rb (revision 14131) @@ -77,8 +77,8 @@ assert_raise(SyntaxError) { eval('/\xc2/u') } assert_raise(SyntaxError) { eval('/\xe0\x80/u') } assert_raise(SyntaxError) { eval('/\xf0\x80\x80/u') } - #assert_raise(SyntaxError) { eval('/\xf8\x80\x80\x80/u') } - #assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') } + 
assert_raise(SyntaxError) { eval('/\xf8\x80\x80\x80/u') } + assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') } # raw 8bit assert_raise(SyntaxError) { eval("/\xfe/e") } @@ -87,7 +87,7 @@ # invalid suffix assert_raise(SyntaxError) { eval('/\xc2\xff/u') } assert_raise(SyntaxError) { eval('/\xc2 /u') } - #assert_raise(SyntaxError) { eval('/\xc2\x20/u') } + assert_raise(SyntaxError) { eval('/\xc2\x20/u') } end def assert_regexp_generic_encoding(r) Index: test/ruby/test_regexp.rb =================================================================== --- test/ruby/test_regexp.rb (revision 14130) +++ test/ruby/test_regexp.rb (revision 14131) @@ -20,7 +20,7 @@ def test_yoshidam_net_20041111_2 assert_raise(RegexpError) do - s = "[\xFF-\xFF]" + s = "[\xFF-\xFF]".force_encoding("utf-8") Regexp.new(s, nil, "u") end end @@ -42,8 +42,8 @@ assert_equal :ok, begin Regexp.union( "a", - Regexp.new("\x80".force_encoding("euc-jp")), - Regexp.new("\x80".force_encoding("utf-8"))) + Regexp.new("\xc2\xa1".force_encoding("euc-jp")), + Regexp.new("\xc2\xa1".force_encoding("utf-8"))) :ng rescue ArgumentError :ok -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml