ruby-changes:12816
From: naruse <ko1@a...>
Date: Sun, 16 Aug 2009 01:01:50 +0900 (JST)
Subject: [ruby-changes:12816] Ruby:r24544 (trunk): \d, \s and \w are now non-Unicode classes.
naruse 2009-08-16 01:01:33 +0900 (Sun, 16 Aug 2009) New Revision: 24544 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=rev&revision=24544 Log: \d, \s and \w are now non Unicode class. [ruby-dev:39026] * include/ruby/oniguruma.h (ONIGENC_CTYPE_SPECIAL_MASK): added. (ONIGENC_CTYPE_D): ditto. (ONIGENC_CTYPE_S): ditto. (ONIGENC_CTYPE_W): ditto. * regparse.c: \d, \s and \w are now non Unicode class. [ruby-dev:39026] (fetch_token_in_cc): use ONIGENC_CTYPE_[DSW] for \d/\s/\w. (fetch_token): ditto. (add_ctype_to_cc): add routines for ONIGENC_CTYPE_[DSW]. (parse_exp): ditto. * test/ruby/test_regexp.rb (TestRegexp#test_char_class): add tests for above. Modified files: trunk/ChangeLog trunk/include/ruby/oniguruma.h trunk/regparse.c trunk/test/ruby/test_regexp.rb Index: regparse.c =================================================================== --- regparse.c (revision 24543) +++ regparse.c (revision 24544) @@ -2974,32 +2974,32 @@ switch (c) { case 'w': tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_WORD; + tok->u.prop.ctype = ONIGENC_CTYPE_W; tok->u.prop.not = 0; break; case 'W': tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_WORD; + tok->u.prop.ctype = ONIGENC_CTYPE_W; tok->u.prop.not = 1; break; case 'd': tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; + tok->u.prop.ctype = ONIGENC_CTYPE_D; tok->u.prop.not = 0; break; case 'D': tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; + tok->u.prop.ctype = ONIGENC_CTYPE_D; tok->u.prop.not = 1; break; case 's': tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; + tok->u.prop.ctype = ONIGENC_CTYPE_S; tok->u.prop.not = 0; break; case 'S': tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; + tok->u.prop.ctype = ONIGENC_CTYPE_S; tok->u.prop.not = 1; break; case 'h': @@ -3261,14 +3261,14 @@ case 'w': if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_WORD; + tok->u.prop.ctype = ONIGENC_CTYPE_W; tok->u.prop.not = 0; break; case 'W': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_WORD; + tok->u.prop.ctype = ONIGENC_CTYPE_W; tok->u.prop.not = 1; break; @@ -3301,28 +3301,28 @@ case 's': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; + tok->u.prop.ctype = ONIGENC_CTYPE_S; tok->u.prop.not = 0; break; case 'S': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; + tok->u.prop.ctype = ONIGENC_CTYPE_S; tok->u.prop.not = 1; break; case 'd': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; + tok->u.prop.ctype = ONIGENC_CTYPE_D; tok->u.prop.not = 0; break; case 'D': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; tok->type = TK_CHAR_TYPE; - tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; + tok->u.prop.ctype = ONIGENC_CTYPE_D; tok->u.prop.not = 1; break; @@ -3864,6 +3864,28 @@ OnigCodePoint sb_out; OnigEncoding enc = env->enc; + switch (ctype) { + case ONIGENC_CTYPE_D: + case ONIGENC_CTYPE_S: + case ONIGENC_CTYPE_W: + ctype ^= ONIGENC_CTYPE_SPECIAL_MASK; + if (not != 0) { + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (! 
ONIGENC_IS_ASCII_CODE_CTYPE((OnigCodePoint )c, ctype)) + BITSET_SET_BIT_CHKDUP(cc->bs, c); + } + ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); + } + else { + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (ONIGENC_IS_ASCII_CODE_CTYPE((OnigCodePoint )c, ctype)) + BITSET_SET_BIT_CHKDUP(cc->bs, c); + } + } + return 0; + break; + } + r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges); if (r == 0) { return add_ctype_to_cc_by_range(cc, ctype, not, env, sb_out, ranges); @@ -5212,6 +5234,19 @@ case TK_CHAR_TYPE: { switch (tok->u.prop.ctype) { + case ONIGENC_CTYPE_D: + case ONIGENC_CTYPE_S: + case ONIGENC_CTYPE_W: + { + CClassNode* cc; + *np = node_new_cclass(); + CHECK_NULL_RETURN_MEMERR(*np); + cc = NCCLASS(*np); + add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env); + if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); + } + break; + case ONIGENC_CTYPE_WORD: *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not); CHECK_NULL_RETURN_MEMERR(*np); Index: include/ruby/oniguruma.h =================================================================== --- include/ruby/oniguruma.h (revision 24543) +++ include/ruby/oniguruma.h (revision 24544) @@ -200,6 +200,14 @@ #define ONIGENC_CTYPE_ALNUM 13 /* alpha || digit */ #define ONIGENC_CTYPE_ASCII 14 #define ONIGENC_MAX_STD_CTYPE ONIGENC_CTYPE_ASCII +#define ONIGENC_CTYPE_SPECIAL_MASK 128 +#define ONIGENC_CTYPE_S /* [\t\n\v\f\r\s] */ \ + ONIGENC_CTYPE_SPECIAL_MASK | ONIGENC_CTYPE_SPACE +#define ONIGENC_CTYPE_D /* [0-9] */ \ + ONIGENC_CTYPE_SPECIAL_MASK | ONIGENC_CTYPE_DIGIT +#define ONIGENC_CTYPE_W /* [0-9A-Za-z_] */ \ + ONIGENC_CTYPE_SPECIAL_MASK | ONIGENC_CTYPE_WORD +#define ONIGENC_CTYPE_SPECIAL_P(ctype) ((ctype) & ONIGENC_CTYPE_SPECIAL_MASK) #define onig_enc_len(enc,p,e) ONIGENC_MBC_ENC_LEN(enc, p, e) Index: ChangeLog =================================================================== --- ChangeLog (revision 24543) +++ ChangeLog (revision 24544) @@ -1,3 +1,21 @@ +Sun Aug 16 00:30:33 2009 NARUSE, Yui <naruse@r...> + + * 
include/ruby/oniguruma.h + (ONIGENC_CTYPE_SPECIAL_MASK): added. + (ONIGENC_CTYPE_D): ditto. + (ONIGENC_CTYPE_S): ditto. + (ONIGENC_CTYPE_W): ditto. + + * regparse.c: \d, \s and \w are now non Unicode class. + [ruby-dev:39026] + (fetch_token_in_cc): use ONIGENC_CTYPE_[DSW] for \d/\s/\w. + (fetch_token): ditto. + (add_ctype_to_cc): add routines for ONIGENC_CTYPE_[DSW]. + (parse_exp): ditto. + + * test/ruby/test_regexp.rb (TestRegexp#test_char_class): + add tests for above. + Sat Aug 15 10:39:53 2009 Nobuyoshi Nakada <nobu@r...> * parse.y (fname, string_dvar, sym, dsym, f_arglist): removed Index: test/ruby/test_regexp.rb =================================================================== --- test/ruby/test_regexp.rb (revision 24543) +++ test/ruby/test_regexp.rb (revision 24544) @@ -665,6 +665,13 @@ check(/\A[[^b-c]&&[^e]&&a-f]\z/, %w(a d f), %w(b c e g 0)) check(/\A[\n\r\t]\z/, ["\n", "\r", "\t"]) failcheck('[9-1]') + + assert_match(/\A\d+\z/, "0123456789") + assert_no_match(/\d/, "\uff10\uff11\uff12\uff13\uff14\uff15\uff16\uff17\uff18\uff19") + assert_match(/\A\w+\z/, "09azAZ_") + assert_no_match(/\w/, "\uff10\uff19\uff41\uff5a\uff21\uff3a") + assert_match(/\A\s+\z/, "\r\n\v\f\r\s") + assert_no_match(/\s/, "\u0085") end def test_posix_bracket -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml/