ruby-changes:2027
From: ko1@a...
Date: 26 Sep 2007 18:39:26 +0900
Subject: [ruby-changes:2027] nobu - Ruby:r13518 (trunk): * encoding.c (rb_enc_check): check for ASCII-compatibilities.
nobu 2007-09-26 18:39:08 +0900 (Wed, 26 Sep 2007) New Revision: 13518 Modified files: trunk/ChangeLog trunk/encoding.c trunk/include/ruby/encoding.h trunk/include/ruby/intern.h trunk/insns.def trunk/parse.y trunk/string.c trunk/vm.c Log: * encoding.c (rb_enc_check): check for ASCII-compatibilities. * parse.y (parser_tokadd_string, parser_parse_string, parser_here_document, parser_yylex): set encoding to US-ASCII. * parse.y (rb_enc_symname_p): check if valid with encoding. * parse.y (rb_intern3): let symbols have encoding. * string.c (rb_str_hash): add encoding index. * string.c (rb_str_comparable, rb_str_equal, rb_str_eql): check if compatible encoding. * string.c (sym_inspect): made encoding aware. * insns.def (opt_eq): compare with encoding. * include/ruby/encoding.h (rb_enc_asciicompat): check if ASCII compatible. * include/ruby/encoding.h (rb_enc_get_index): added prototype. * include/ruby/intern.h (rb_str_comparable, rb_str_equal): ditto. http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/string.c?r1=13518&r2=13517 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/parse.y?r1=13518&r2=13517 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/ChangeLog?r1=13518&r2=13517 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/include/ruby/encoding.h?r1=13518&r2=13517 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/vm.c?r1=13518&r2=13517 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/encoding.c?r1=13518&r2=13517 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/insns.def?r1=13518&r2=13517 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/include/ruby/intern.h?r1=13518&r2=13517 Index: encoding.c =================================================================== --- encoding.c (revision 13517) +++ encoding.c (revision 13518) @@ -194,19 +194,15 @@ if (idx1 == 0) { enc = rb_enc_from_index(idx2); -#if 0 - if (m17n_asciicompat(enc)) { + if (rb_enc_asciicompat(enc)) { return enc; } -#endif } else if (idx2 == 0) { enc = rb_enc_from_index(idx1); -#if 0 - if (m17n_asciicompat(enc)) { + if (rb_enc_asciicompat(enc)) { return enc; } -#endif } rb_raise(rb_eArgError, "character encodings differ"); } Index: include/ruby/intern.h =================================================================== --- include/ruby/intern.h (revision 13517) +++ include/ruby/intern.h (revision 13518) @@ -516,7 +516,9 @@ VALUE rb_str_concat(VALUE, VALUE); int rb_memhash(const void *ptr, long len); int rb_str_hash(VALUE); +int rb_str_comparable(VALUE, VALUE); int rb_str_cmp(VALUE, VALUE); +VALUE rb_str_equal(VALUE str1, VALUE str2); void rb_str_update(VALUE, long, long, VALUE); VALUE rb_str_inspect(VALUE); VALUE rb_str_dump(VALUE); Index: include/ruby/encoding.h =================================================================== --- include/ruby/encoding.h (revision 13517) +++ include/ruby/encoding.h (revision 13518) @@ -27,6 +27,7 @@ typedef OnigEncodingType rb_encoding; int rb_enc_to_index(rb_encoding*); +int rb_enc_get_index(VALUE obj); rb_encoding* rb_enc_get(VALUE); rb_encoding* rb_enc_check(VALUE,VALUE); void rb_enc_associate(VALUE, rb_encoding*); @@ -73,8 +74,11 @@ #define rb_enc_isspace(c,enc) ONIGENC_IS_CODE_SPACE(enc,c) #define rb_enc_isdigit(c,enc) ONIGENC_IS_CODE_DIGIT(enc,c) +#define rb_enc_asciicompat(enc) (rb_enc_mbminlen(enc)==1) + int rb_enc_toupper(int c, rb_encoding *enc); int rb_enc_tolower(int c, rb_encoding *enc); ID rb_intern3(const char*, long, rb_encoding*); +int rb_enc_symname_p(const char*, rb_encoding*); #endif /* RUBY_ENCODING_H */ Index: insns.def =================================================================== --- insns.def (revision 13517) +++ insns.def (revision 13518) @@ -1700,13 +1700,14 @@ if (str1 == str2) { val = Qtrue; } - else if (RSTRING_LEN(str1) == RSTRING_LEN(str2) && + else if (!ENCODING_GET(str1) && !ENCODING_GET(str2) && + RSTRING_LEN(str1) == RSTRING_LEN(str2) && rb_memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), RSTRING_LEN(str1)) == 0) { val = Qtrue; } else { - val = Qfalse; + val = rb_str_equal(str1, str2); } } else { Index: ChangeLog =================================================================== --- ChangeLog (revision 13517) +++ ChangeLog (revision 13518) @@ -1,3 +1,30 @@ +Wed Sep 26 18:38:41 2007 Nobuyoshi Nakada <nobu@r...> + + * encoding.c (rb_enc_check): check for ASCII-compatibilities. + + * parse.y (parser_tokadd_string, parser_parse_string, + parser_here_document, parser_yylex): set encoding to US-ASCII. + + * parse.y (rb_enc_symname_p): check if valid with encoding. + + * parse.y (rb_intern3): let symbols have encoding. + + * string.c (rb_str_hash): add encoding index. + + * string.c (rb_str_comparable, rb_str_equal, rb_str_eql): check if + compatible encoding. + + * string.c (sym_inspect): made encoding aware. + + * insns.def (opt_eq): compare with encoding. + + * include/ruby/encoding.h (rb_enc_asciicompat): check if ASCII + compatible. + + * include/ruby/encoding.h (rb_enc_get_index): added prototype. + + * include/ruby/intern.h (rb_str_comparable, rb_str_equal): ditto. + Wed Sep 26 15:01:16 2007 Nobuyoshi Nakada <nobu@r...> * eval_method.ci (rb_get_alloc_func): cast to suppress a warning. Index: string.c =================================================================== --- string.c (revision 13517) +++ string.c (revision 13518) @@ -1129,7 +1129,8 @@ int rb_str_hash(VALUE str) { - return rb_memhash(RSTRING_PTR(str), RSTRING_LEN(str)); + return hash((const void *)RSTRING_PTR(str), RSTRING_LEN(str), + rb_enc_get_index(str)); } /* @@ -1148,7 +1149,33 @@ #define lesser(a,b) (((a)>(b))?(b):(a)) +static int +is_ascii_string(VALUE str) +{ + long i; + + for (i = 0; i < RSTRING_LEN(str); ++i) { + int c = (unsigned char)RSTRING_PTR(str)[i]; + if (!ISASCII(c)) return Qfalse; + } + return Qtrue; +} + int +rb_str_comparable(VALUE str1, VALUE str2) +{ + int idx1 = rb_enc_get_index(str1); + int idx2 = rb_enc_get_index(str2); + + if (idx1 == idx2) return Qtrue; + if (!rb_enc_asciicompat(rb_enc_from_index(idx1))) return Qfalse; + if (!rb_enc_asciicompat(rb_enc_from_index(idx2))) return Qfalse; + if (!is_ascii_string(str1)) return Qfalse; + if (!is_ascii_string(str2)) return Qfalse; + return Qtrue; +} + +int rb_str_cmp(VALUE str1, VALUE str2) { long len; @@ -1176,7 +1203,7 @@ * <code><=></code> <i>obj</i> returns zero. */ -static VALUE +VALUE rb_str_equal(VALUE str1, VALUE str2) { if (str1 == str2) return Qtrue; @@ -1186,7 +1213,7 @@ } return rb_equal(str2, str1); } - rb_enc_check(str1, str2); /* need weak check */ + if (!rb_str_comparable(str1, str2)) return Qfalse; if (RSTRING_LEN(str1) == RSTRING_LEN(str2) && rb_str_cmp(str1, str2) == 0) { return Qtrue; @@ -1207,6 +1234,9 @@ if (TYPE(str2) != T_STRING || RSTRING_LEN(str1) != RSTRING_LEN(str2)) return Qfalse; + if (rb_enc_get_index(str1) != rb_enc_get_index(str2)) + return Qfalse; + if (memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), lesser(RSTRING_LEN(str1), RSTRING_LEN(str2))) == 0) return Qtrue; @@ -5126,13 +5156,15 @@ { VALUE str, klass = Qundef; ID id = SYM2ID(sym); + rb_encoding *enc; sym = rb_id2str(id); - str = rb_str_new(0, RSTRING_LEN(sym)+1); + enc = rb_enc_get(sym); + str = rb_enc_str_new(0, RSTRING_LEN(sym)+1, enc); RSTRING_PTR(str)[0] = ':'; memcpy(RSTRING_PTR(str)+1, RSTRING_PTR(sym), RSTRING_LEN(sym)); if (RSTRING_LEN(sym) != strlen(RSTRING_PTR(sym)) || - !rb_symname_p(RSTRING_PTR(sym))) { + !rb_enc_symname_p(RSTRING_PTR(sym), enc)) { str = rb_str_dump(str); strncpy(RSTRING_PTR(str), ":\"", 2); } Index: parse.y =================================================================== --- parse.y (revision 13517) +++ parse.y (revision 13518) @@ -262,6 +262,8 @@ #define STR_NEW(p,n) rb_enc_str_new((p),(n),parser->enc) #define STR_NEW2(p) rb_enc_str_new((p),strlen(p),parser->enc) +#define STR_NEW3(p,n,m) rb_enc_str_new((p),(n), STR_ENC(m)) +#define STR_ENC(m) ((m)?parser->enc:rb_enc_from_index(0)) #ifdef YYMALLOC void *rb_parser_malloc(struct parser_params *, size_t); @@ -3886,7 +3888,7 @@ yyerror("empty symbol literal"); break; } - $$->nd_lit = ID2SYM(rb_intern2(RSTRING_PTR(lit), RSTRING_LEN(lit))); + $$->nd_lit = ID2SYM(rb_intern_str(lit)); nd_set_type($$, NODE_LIT); break; default: @@ -4478,7 +4480,7 @@ # define yylval (*((YYSTYPE*)(parser->parser_yylval))) static int parser_regx_options(struct parser_params*); -static int parser_tokadd_string(struct parser_params*,int,int,int,long*); +static int parser_tokadd_string(struct parser_params*,int,int,int,long*,int*); static int parser_parse_string(struct parser_params*,NODE*); static int parser_here_document(struct parser_params*,NODE*); @@ -4489,7 +4491,7 @@ # define read_escape() parser_read_escape(parser) # define tokadd_escape(t) parser_tokadd_escape(parser, t) # define regx_options() parser_regx_options(parser) -# define tokadd_string(f,t,p,n) parser_tokadd_string(parser,f,t,p,n) +# define tokadd_string(f,t,p,n,m) parser_tokadd_string(parser,f,t,p,n,m) # define parse_string(n) parser_parse_string(parser,n) # define here_document(n) parser_here_document(parser,n) # define heredoc_identifier() parser_heredoc_identifier(parser) @@ -5150,15 +5152,24 @@ rb_gc_force_recycle(str); } +static void +parser_tokadd_mbchar(struct parser_params *parser, int c) +{ + int len = parser_mbclen(); + do { + tokadd(c); + } while (--len > 0 && (c = nextc()) != -1); +} + +#define tokadd_mbchar(c) parser_tokadd_mbchar(parser, c) + static int parser_tokadd_string(struct parser_params *parser, - int func, int term, int paren, long *nest) + int func, int term, int paren, long *nest, int *mb) { int c; - unsigned char uc; while ((c = nextc()) != -1) { - uc = (unsigned char)c; if (paren && c == paren) { ++*nest; } @@ -5210,12 +5221,9 @@ } } else if (parser_ismbchar()) { - int i, len = parser_mbclen()-1; - - for (i = 0; i < len; i++) { - tokadd(c); - c = nextc(); - } + tokadd_mbchar(c); + if (mb) *mb = 1; + continue; } else if ((func & STR_FUNC_QWORDS) && ISSPACE(c)) { pushback(c); @@ -5240,7 +5248,7 @@ int func = quote->nd_func; int term = nd_term(quote); int paren = nd_paren(quote); - int c, space = 0; + int c, space = 0, mb = 0; if (func == -1) return tSTRING_END; c = nextc(); @@ -5274,7 +5282,7 @@ tokadd('#'); } pushback(c); - if (tokadd_string(func, term, paren, "e->nd_nest) == -1) { + if (tokadd_string(func, term, paren, "e->nd_nest, &mb) == -1) { if (func & STR_FUNC_REGEXP) { ruby_sourceline = nd_line(quote); compile_error(PARSER_ARG "unterminated regexp meets end of file"); @@ -5288,7 +5296,7 @@ } tokfix(); - set_yylval_str(STR_NEW(tok(), toklen())); + set_yylval_str(STR_NEW3(tok(), toklen(), mb)); return tSTRING_CONTENT; } @@ -5451,6 +5459,7 @@ } while (!whole_match_p(eos, len, indent)); } else { + int mb = 0; newtok(); if (c == '#') { switch (c = nextc()) { @@ -5465,15 +5474,15 @@ } do { pushback(c); - if ((c = tokadd_string(func, '\n', 0, NULL)) == -1) goto error; + if ((c = tokadd_string(func, '\n', 0, NULL, &mb)) == -1) goto error; if (c != '\n') { - set_yylval_str(STR_NEW(tok(), toklen())); + set_yylval_str(STR_NEW3(tok(), toklen(), mb)); return tSTRING_CONTENT; } tokadd(nextc()); if ((c = nextc()) == -1) goto error; } while (!whole_match_p(eos, len, indent)); - str = STR_NEW(tok(), toklen()); + str = STR_NEW3(tok(), toklen(), mb); } heredoc_restore(lex_strterm); lex_strterm = NEW_STRTERM(-1, 0, 0); @@ -5687,6 +5696,7 @@ int space_seen = 0; int cmd_state; enum lex_state_e last_state; + int mb; #ifdef RIPPER int fallthru = Qfalse; #endif @@ -6005,13 +6015,7 @@ } newtok(); if (parser_ismbchar()) { - int i, len = parser_mbclen()-1; - - tokadd(c); - for (i = 0; i < len; i++) { - c = nextc(); - tokadd(c); - } + tokadd_mbchar(c); } else if ((rb_enc_isalnum(c, parser->enc) || c == '_') && lex_p < lex_pend && is_identchar(lex_p, lex_pend, parser->enc)) { @@ -6696,7 +6700,7 @@ tokadd(c); c = nextc(); if (parser_is_identchar()) { - tokadd(c); + tokadd_mbchar(c); } else { pushback(c); @@ -6794,15 +6798,10 @@ break; } + mb = 0; do { - int i, len; - tokadd(c); - - len = parser_mbclen()-1; - for (i = 0; i < len; i++) { - c = nextc(); - tokadd(c); - } + if (!ISASCII(c)) mb = 1; + tokadd_mbchar(c); c = nextc(); } while (parser_is_identchar()); if ((c == '!' || c == '?') && !peek('=')) { @@ -6854,7 +6853,7 @@ } } - if (lex_state != EXPR_DOT) { + if (!mb && lex_state != EXPR_DOT) { const struct kwtable *kw; /* See if it is a reserved word. */ @@ -6896,7 +6895,7 @@ if (peek(':') && !(lex_p + 1 < lex_pend && lex_p[1] == ':')) { lex_state = EXPR_BEG; nextc(); - set_yylval_id(rb_intern(tok())); + set_yylval_id(rb_intern3(tok(), toklen(), STR_ENC(mb))); return tLABEL; } } @@ -6915,7 +6914,7 @@ } } { - ID ident = rb_intern(tok()); + ID ident = rb_intern3(tok(), toklen(), STR_ENC(mb)); set_yylval_id(ident); if (last_state != EXPR_DOT && is_local_id(ident) && lvar_defined(ident)) { @@ -8371,10 +8370,15 @@ int rb_symname_p(const char *name) { + return rb_enc_symname_p(name, rb_enc_from_index(0)); +} + +int +rb_enc_symname_p(const char *name, rb_encoding *enc) +{ const char *m = name; const char *e = m + strlen(m); int localid = Qfalse; - rb_encoding *enc = rb_enc_from_index(0); if (!m) return Qfalse; switch (*m) { @@ -8458,8 +8462,10 @@ fake_str.as.heap.len = len; fake_str.as.heap.ptr = (char *)name; fake_str.as.heap.aux.capa = len; + str = (VALUE)&fake_str; + rb_enc_associate(str, enc); - if (st_lookup(global_symbols.sym_id, (st_data_t)&fake_str, (st_data_t *)&id)) + if (st_lookup(global_symbols.sym_id, str, (st_data_t *)&id)) return id; last = len-1; @@ -8520,7 +8526,7 @@ new_id: id |= ++global_symbols.last_id << ID_SCOPE_SHIFT; id_register: - str = rb_str_new(name, len); + str = rb_enc_str_new(name, len, enc); OBJ_FREEZE(str); st_add_direct(global_symbols.sym_id, (st_data_t)str, id); st_add_direct(global_symbols.id_str, id, (st_data_t)str); Index: vm.c =================================================================== --- vm.c (revision 13517) +++ vm.c (revision 13518) @@ -12,6 +12,7 @@ #include "ruby/ruby.h" #include "ruby/node.h" #include "ruby/st.h" +#include "ruby/encoding.h" #include "gc.h" #include "insnhelper.h" -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml