ruby-changes:3779
From: ko1@a...
Date: Sun, 27 Jan 2008 17:21:42 +0900 (JST)
Subject: [ruby-changes:3779] akr - Ruby:r15268 (trunk): * string.c (rb_str_succ): don't increment/decrement codepoint.
akr 2008-01-27 17:21:24 +0900 (Sun, 27 Jan 2008) New Revision: 15268 Modified files: trunk/ChangeLog trunk/string.c trunk/test/ruby/test_m17n_comb.rb Log: * string.c (rb_str_succ): don't increment/decrement codepoint. http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/test/ruby/test_m17n_comb.rb?r1=15268&r2=15267&diff_format=u http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/string.c?r1=15268&r2=15267&diff_format=u http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/ChangeLog?r1=15268&r2=15267&diff_format=u Index: ChangeLog =================================================================== --- ChangeLog (revision 15267) +++ ChangeLog (revision 15268) @@ -1,3 +1,7 @@ +Sun Jan 27 17:20:10 2008 Tanaka Akira <akr@f...> + + * string.c (rb_str_succ): don't increment/decrement codepoint. + Sun Jan 27 16:03:42 2008 NARUSE, Yui <naruse@r...> * lib/irb/ruby-lex.rb (RubyLex#buf_input): use chars.to_a. Index: string.c =================================================================== --- string.c (revision 15267) +++ string.c (revision 15268) @@ -2000,74 +2000,143 @@ return result; } -static int -succ_char(char *s) +enum neighbor_char { + NEIGHBOR_NOT_CHAR, + NEIGHBOR_FOUND, + NEIGHBOR_WRAPPED +}; + +static enum neighbor_char +enc_succ_char(char *p, int len, rb_encoding *enc) { - char c = *s; + int i, l; + while (1) { + for (i = len-1; 0 <= i; i--) { + int c; + c = ++((unsigned char*)p)[i]; + if (c != 0) + break; + } + if (i < 0) + return NEIGHBOR_WRAPPED; + l = rb_enc_precise_mbclen(p, p+len, enc); + if (MBCLEN_CHARFOUND(l)) { + if (l == len) { + return NEIGHBOR_FOUND; + } + else { + memset(p+l, '\xff', len-l); + } + } + if (MBCLEN_INVALID(l) && i < len-1) { + int len2, l2; + for (len2 = len-1; 0 < len2; len2--) { + l2 = rb_enc_precise_mbclen(p, p+len2, enc); + if (!MBCLEN_INVALID(l2)) + break; + } + memset(p+len2+1, '\xff', len-(len2+1)); + } + } +} - /* numerics */ - if ('0' <= c && c < '9') (*s)++; - else if (c == '9') { - *s = '0'; - return '1'; +static enum neighbor_char +enc_pred_char(char *p, int len, rb_encoding *enc) +{ + int i, l; + while (1) { + for (i = len-1; 0 <= i; i--) { + int c; + c = --((unsigned char*)p)[i]; + if (c != 0xff) + break; + } + if (i < 0) + return NEIGHBOR_WRAPPED; + l = rb_enc_precise_mbclen(p, p+len, enc); + if (MBCLEN_CHARFOUND(l)) { + if (l == len) { + return NEIGHBOR_FOUND; + } + else { + memset(p+l, '\0', len-l); + } + } + if (MBCLEN_INVALID(l) && i < len-1) { + int len2, l2; + for (len2 = len-1; 0 < len2; len2--) { + l2 = rb_enc_precise_mbclen(p, p+len2, enc); + if (!MBCLEN_INVALID(l2)) + break; + } + memset(p+len2+1, '\0', len-(len2+1)); + } } - /* small alphabets */ - else if ('a' <= c && c < 'z') (*s)++; - else if (c == 'z') { - return *s = 'a'; - } - /* capital alphabets */ - else if ('A' <= c && c < 'Z') (*s)++; - else if (c == 'Z') { - return *s = 'A'; - } - return 0; } /* - overwrite +s+ by succeeding letter of +c+ in +enc+ and returns - carried-out letter. assuming each ranges are successive, and mbclen + overwrite +p+ by succeeding letter in +enc+ and returns + NEIGHBOR_FOUND or NEIGHBOR_WRAPPED. + When NEIGHBOR_WRAPPED, carried-out letter is stored into carry. + assuming each ranges are successive, and mbclen never change in each ranges. + NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one + character. */ -static int -enc_succ_char(unsigned int c, char *s, rb_encoding *enc) +static enum neighbor_char +enc_succ_alnum_char(char *p, int len, rb_encoding *enc, char *carry) { - unsigned int cs; + enum neighbor_char ret; + int c; + int ctype; + int range; + char save[ONIGENC_CODE_TO_MBC_MAXLEN]; - /* numerics */ - if (rb_enc_isdigit(c, enc)) { - cs = c++; - if (rb_enc_isdigit(c, enc)) { - rb_enc_mbcput(c, s, enc); - return 0; - } - do c = cs--; while (rb_enc_isdigit(cs, enc)); - rb_enc_mbcput(c, s, enc); - return ++c; + c = rb_enc_mbc_to_codepoint(p, p+len, enc); + if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc)) + ctype = ONIGENC_CTYPE_DIGIT; + else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc)) + ctype = ONIGENC_CTYPE_ALPHA; + else + return NEIGHBOR_NOT_CHAR; + + MEMCPY(save, p, char, len); + ret = enc_succ_char(p, len, enc); + if (ret == NEIGHBOR_FOUND) { + c = rb_enc_mbc_to_codepoint(p, p+len, enc); + if (rb_enc_isctype(c, ctype, enc)) + return NEIGHBOR_FOUND; } - /* small alphabets */ - if (rb_enc_islower(c, enc)) { - cs = c++; - if (rb_enc_islower(c, enc)) { - rb_enc_mbcput(c, s, enc); - return 0; - } - do c = cs--; while (rb_enc_islower(cs, enc)); - rb_enc_mbcput(c, s, enc); - return c; + MEMCPY(p, save, char, len); + range = 1; + while (1) { + MEMCPY(save, p, char, len); + ret = enc_pred_char(p, len, enc); + if (ret == NEIGHBOR_FOUND) { + c = rb_enc_mbc_to_codepoint(p, p+len, enc); + if (!rb_enc_isctype(c, ctype, enc)) { + MEMCPY(p, save, char, len); + break; + } + } + else { + MEMCPY(p, save, char, len); + break; + } + range++; } - /* capital alphabets */ - if (rb_enc_isupper(c, enc)) { - cs = c++; - if (rb_enc_isupper(c, enc)) { - rb_enc_mbcput(c, s, enc); - return 0; - } - do c = cs--; while (rb_enc_isupper(cs, enc)); - rb_enc_mbcput(c, s, enc); - return c; + if (range == 1) { + return NEIGHBOR_NOT_CHAR; } - return -1; + + if (ctype != ONIGENC_CTYPE_DIGIT) { + MEMCPY(carry, p, char, len); + return NEIGHBOR_WRAPPED; + } + + MEMCPY(carry, p, char, len); + enc_succ_char(carry, len, enc); + return NEIGHBOR_WRAPPED; } @@ -2103,9 +2172,9 @@ VALUE str; char *sbeg, *s, *e; int c = -1; - unsigned int cc = 0; - long n = 0, o = 0, l; + long l; char carry[ONIGENC_CODE_TO_MBC_MAXLEN]; + int carry_pos, carry_len; str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig)); rb_enc_copy(str, orig); @@ -2117,41 +2186,45 @@ s = e = sbeg + RSTRING_LEN(str); while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) { + enum neighbor_char neighbor; if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; - cc = rb_enc_mbc_to_codepoint(s, e, enc); - if (rb_enc_isalnum(cc, enc)) { - if (rb_enc_isascii(cc, enc)) { - if ((c = succ_char(s)) == 0) break; - } - else { - if ((c = enc_succ_char(cc, s, enc)) == 0) break; - } - n = s - sbeg; - } + neighbor = enc_succ_alnum_char(s, l, enc, carry); + if (neighbor == NEIGHBOR_NOT_CHAR) + continue; + if (neighbor == NEIGHBOR_FOUND) + return str; + c = 1; + carry_pos = s - sbeg; + carry_len = l; } if (c == -1) { /* str contains no alnum */ - c = '\001'; + carry[0] = '\001'; + carry_len = 1; s = e; while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) { - int limit = 256; - if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; - cc = rb_enc_mbc_to_codepoint(s, e, enc); - while ((l = rb_enc_mbcput(++cc, carry, enc)) < 0 && --limit); - if (l > 0) { - if (l == (o = e - s)) goto overlay; - n = s - sbeg; - goto insert; - } + enum neighbor_char neighbor; + if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; + neighbor = enc_succ_char(s, l, enc); + if (neighbor == NEIGHBOR_FOUND) + return str; + if (rb_enc_precise_mbclen(s, s+l, enc) != l) { + /* wrapped to \0...\0. search next valid char. */ + enc_succ_char(s, l, enc); + } + c = 1; + carry_pos = s - sbeg; } + if (c == -1) { + c = 1; + carry_pos = 0; + } } - if (!s && (l = rb_enc_mbcput(c, carry, enc)) > 0) { - insert: - RESIZE_CAPA(str, RSTRING_LEN(str) + l - o); - s = RSTRING_PTR(str) + n; - memmove(s + l, s + o, RSTRING_LEN(str) - n - o); - overlay: - memmove(s, carry, l); - STR_SET_LEN(str, RSTRING_LEN(str) + l - o); + if (!s && c == 1) { + RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len); + s = RSTRING_PTR(str) + carry_pos; + memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos); + memmove(s, carry, carry_len); + STR_SET_LEN(str, RSTRING_LEN(str) + carry_len); RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; } Index: test/ruby/test_m17n_comb.rb =================================================================== --- test/ruby/test_m17n_comb.rb (revision 15267) +++ test/ruby/test_m17n_comb.rb (revision 15268) @@ -1349,10 +1349,6 @@ end def test_str_succ - starts = [ - e("\xA1\xA1"), - e("\xFE\xFE") - ] STRINGS.each {|s0| next if s0.empty? s = s0.dup @@ -1360,11 +1356,16 @@ h = {} n.times {|i| if h[s] - assert(false, "#{encdump s} cycle with succ! #{i-h[s]} times") + assert(false, "#{encdump s} cycle with succ #{i-h[s]} times") end h[s] = i - assert_operator(s.length, :<=, s0.length + Math.log2(i+1) + 1, "#{encdump s0} succ! #{i} times => #{encdump s}") - s.succ! + assert_operator(s.length, :<=, s0.length + Math.log2(i+1) + 1, "#{encdump s0} succ #{i} times => #{encdump s}") + #puts encdump(s) + t = s.succ + if s.valid_encoding? + assert(t.valid_encoding?, "#{encdump s}.succ.valid_encoding?") + end + s = t } } end -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml/