ruby-changes:3417
From: ko1@a...
Date: 6 Jan 2008 18:25:27 +0900
Subject: [ruby-changes:3417] akr - Ruby:r14910 (trunk): * include/ruby/encoding.h (rb_enc_str_buf_cat): declared.
akr 2008-01-06 18:25:09 +0900 (Sun, 06 Jan 2008) New Revision: 14910 Modified files: trunk/ChangeLog trunk/include/ruby/encoding.h trunk/re.c trunk/string.c trunk/test/ruby/test_m17n_comb.rb Log: * include/ruby/encoding.h (rb_enc_str_buf_cat): declared. * string.c (coderange_scan): extracted from rb_enc_str_coderange. (rb_enc_str_coderange): use coderange_scan. (rb_str_shared_replace): copy encoding and coderange. (rb_enc_str_buf_cat): new function for linear complexity string accumulation with encoding. (rb_str_sub_bang): don't conflict substituted part and replacement. (str_gsub): use rb_enc_str_buf_cat. (rb_str_clear): clear coderange. * re.c (rb_reg_regsub): use rb_enc_str_buf_cat. http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/test/ruby/test_m17n_comb.rb?r1=14910&r2=14909&diff_format=u http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/string.c?r1=14910&r2=14909&diff_format=u http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/ChangeLog?r1=14910&r2=14909&diff_format=u http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/include/ruby/encoding.h?r1=14910&r2=14909&diff_format=u http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/re.c?r1=14910&r2=14909&diff_format=u Index: include/ruby/encoding.h =================================================================== --- include/ruby/encoding.h (revision 14909) +++ include/ruby/encoding.h (revision 14910) @@ -65,6 +65,7 @@ long rb_enc_strlen(const char*, const char*, rb_encoding*); char* rb_enc_nth(const char*, const char*, int, rb_encoding*); VALUE rb_obj_encoding(VALUE); +VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc); /* index -> rb_encoding */ rb_encoding* rb_enc_from_index(int idx); Index: re.c =================================================================== --- re.c (revision 14909) +++ re.c (revision 14910) @@ -2795,17 +2795,18 @@ VALUE val = 0; char *p, *s, *e; int no, clen; - rb_encoding *enc = rb_enc_check(str, src); + rb_encoding *str_enc = rb_enc_get(str); + rb_encoding *src_enc = rb_enc_get(src); p = s = RSTRING_PTR(str); e = s + RSTRING_LEN(str); while (s < e) { - int c = rb_enc_ascget(s, e, &clen, enc); + int c = rb_enc_ascget(s, e, &clen, str_enc); char *ss; if (c == -1) { - s += mbclen(s, e, enc); + s += mbclen(s, e, str_enc); continue; } ss = s; @@ -2816,12 +2817,12 @@ if (!val) { val = rb_str_buf_new(ss-p); } - rb_str_buf_cat(val, p, ss-p); + rb_enc_str_buf_cat(val, p, ss-p, str_enc); - c = rb_enc_ascget(s, e, &clen, enc); + c = rb_enc_ascget(s, e, &clen, str_enc); if (c == -1) { - s += mbclen(s, e, enc); - rb_str_buf_cat(val, ss, s-ss); + s += mbclen(s, e, str_enc); + rb_enc_str_buf_cat(val, ss, s-ss, str_enc); continue; } s += clen; @@ -2839,14 +2840,14 @@ break; case 'k': - if (s < e && rb_enc_ascget(s, e, &clen, enc) == '<') { + if (s < e && rb_enc_ascget(s, e, &clen, str_enc) == '<') { char *name, *name_end; name_end = name = s + clen; while (name_end < e) { - c = rb_enc_ascget(name_end, e, &clen, enc); + c = rb_enc_ascget(name_end, e, &clen, str_enc); if (c == '>') break; - name_end += c == -1 ? mbclen(name_end, e, enc) : clen; + name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen; } if (name_end < e) { no = name_to_backref_number(regs, regexp, name, name_end); @@ -2858,7 +2859,7 @@ } } - rb_str_buf_cat(val, ss, s-ss); + rb_enc_str_buf_cat(val, ss, s-ss, str_enc); continue; case '0': @@ -2867,11 +2868,11 @@ break; case '`': - rb_str_buf_cat(val, RSTRING_PTR(src), BEG(0)); + rb_enc_str_buf_cat(val, RSTRING_PTR(src), BEG(0), src_enc); continue; case '\'': - rb_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0)); + rb_enc_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc); continue; case '+': @@ -2881,26 +2882,25 @@ break; case '\\': - rb_str_buf_cat(val, s-clen, clen); + rb_enc_str_buf_cat(val, s-clen, clen, str_enc); continue; default: - rb_str_buf_cat(val, ss, s-ss); + rb_enc_str_buf_cat(val, ss, s-ss, str_enc); continue; } if (no >= 0) { if (no >= regs->num_regs) continue; if (BEG(no) == -1) continue; - rb_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no)); + rb_enc_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc); } } if (!val) return str; if (p < e) { - rb_str_buf_cat(val, p, e-p); + rb_enc_str_buf_cat(val, p, e-p, str_enc); } - rb_enc_associate(val, enc); return val; } Index: ChangeLog =================================================================== --- ChangeLog (revision 14909) +++ ChangeLog (revision 14910) @@ -1,3 +1,18 @@ +Sun Jan 6 18:19:12 2008 Tanaka Akira <akr@f...> + + * include/ruby/encoding.h (rb_enc_str_buf_cat): declared. + + * string.c (coderange_scan): extracted from rb_enc_str_coderange. + (rb_enc_str_coderange): use coderange_scan. + (rb_str_shared_replace): copy encoding and coderange. + (rb_enc_str_buf_cat): new function for linear complexity string + accumulation with encoding. + (rb_str_sub_bang): don't conflict substituted part and replacement. + (str_gsub): use rb_enc_str_buf_cat. + (rb_str_clear): clear coderange. + + * re.c (rb_reg_regsub): use rb_enc_str_buf_cat. + Sun Jan 6 17:55:44 2008 Technorama Ltd. <oss-ruby@t...> * lib/securerandom.rb: Add Win32 support. Index: string.c =================================================================== --- string.c (revision 14909) +++ string.c (revision 14910) @@ -114,6 +114,31 @@ VALUE rb_fs; +static int +coderange_scan(const char *p, long len, rb_encoding *enc) +{ + const char *e = p + len; + int cr; + + cr = rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID; + while (p < e) { + int ret = rb_enc_precise_mbclen(p, e, enc); + int len = MBCLEN_CHARFOUND(ret); + + if (len) { + if (len != 1 || !ISASCII((unsigned char)*p)) { + cr = ENC_CODERANGE_VALID; + } + p += len; + } + else { + cr = ENC_CODERANGE_BROKEN; + break; + } + } + return cr; +} + int rb_enc_str_coderange(VALUE str) { @@ -121,26 +146,7 @@ if (cr == ENC_CODERANGE_UNKNOWN) { rb_encoding *enc = rb_enc_get(str); - - const char *p = RSTRING_PTR(str); - const char *e = p + RSTRING_LEN(str); - - cr = rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID; - while (p < e) { - int ret = rb_enc_precise_mbclen(p, e, enc); - int len = MBCLEN_CHARFOUND(ret); - - if (len) { - if (len != 1 || !rb_enc_isascii((unsigned char)*p, enc)) { - cr = ENC_CODERANGE_VALID; - } - p += len; - } - else { - cr = ENC_CODERANGE_BROKEN; - break; - } - } + cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc); ENC_CODERANGE_SET(str, cr); } return cr; @@ -404,13 +410,19 @@ void rb_str_shared_replace(VALUE str, VALUE str2) { + rb_encoding *enc; + int cr; if (str == str2) return; + enc = rb_enc_get(str2); + cr = ENC_CODERANGE(str2); rb_str_modify(str); if (OBJ_TAINTED(str2)) OBJ_TAINT(str); if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) { STR_SET_EMBED(str); memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1); STR_SET_EMBED_LEN(str, RSTRING_LEN(str2)); + rb_enc_associate(str, enc); + ENC_CODERANGE_SET(str, cr); return; } if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) { @@ -431,6 +443,8 @@ RSTRING(str2)->as.heap.len = 0; RSTRING(str2)->as.heap.aux.capa = 0; STR_UNSET_NOCAPA(str2); + rb_enc_associate(str, enc); + ENC_CODERANGE_SET(str, cr); } static ID id_to_s; @@ -1032,6 +1046,94 @@ } VALUE +rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc) +{ + long capa, total; + + rb_encoding *str_enc = rb_enc_get(str); + rb_encoding *res_enc; + int str_cr, ptr_cr, res_cr; + int str_a8 = rb_enc_to_index(str_enc) == 0; + int ptr_a8 = rb_enc_to_index(ptr_enc) == 0; + + str_cr = ENC_CODERANGE(str); + ptr_cr = coderange_scan(ptr, len, ptr_enc); + + if (str_cr == ENC_CODERANGE_UNKNOWN) { + if (str_a8 ? !ptr_a8 + : (str_enc != ptr_enc && ptr_cr != ENC_CODERANGE_7BIT)) { + str_cr = rb_enc_str_coderange(str); + } + } + + if (str_enc != ptr_enc && + str_cr != ENC_CODERANGE_7BIT && + ptr_cr != ENC_CODERANGE_7BIT) { + rb_raise(rb_eArgError, "append incompatible encoding strings: %s and %s", + rb_enc_name(str_enc), + rb_enc_name(ptr_enc)); + } + + if (str_cr == ENC_CODERANGE_UNKNOWN) { + res_enc = str_enc; + res_cr = ENC_CODERANGE_UNKNOWN; + } + else if (str_cr == ENC_CODERANGE_7BIT) { + if (ptr_cr == ENC_CODERANGE_7BIT) { + res_enc = !str_a8 ? str_enc : ptr_enc; + res_cr = ENC_CODERANGE_7BIT; + } + else { + res_enc = ptr_enc; + res_cr = ptr_cr; + } + } + else if (str_cr == ENC_CODERANGE_VALID) { + res_enc = str_enc; + res_cr = str_cr; + } + else { /* str_cr == ENC_CODERANGE_BROKEN */ + res_enc = str_enc; + res_cr = str_cr; + if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN; + } + + if (len < 0) { + rb_raise(rb_eArgError, "negative string size (or size too big)"); + } + rb_str_modify(str); + if (len == 0) { + rb_enc_associate(str, res_enc); + ENC_CODERANGE_SET(str, res_cr); + return str; + } + if (STR_ASSOC_P(str)) { + FL_UNSET(str, STR_ASSOC); + capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str); + } + else if (STR_EMBED_P(str)) { + capa = RSTRING_EMBED_LEN_MAX; + } + else { + capa = RSTRING(str)->as.heap.aux.capa; + } + total = RSTRING_LEN(str)+len; + if (capa <= total) { + while (total > capa) { + capa = (capa + 1) * 2; + } + RESIZE_CAPA(str, capa); + } + memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len); + STR_SET_LEN(str, total); + RSTRING_PTR(str)[total] = '\0'; // sentinel + + rb_enc_associate(str, res_enc); + ENC_CODERANGE_SET(str, res_cr); + return str; +} + +VALUE rb_str_buf_append(VALUE str, VALUE str2) { rb_encoding *enc; @@ -2463,15 +2565,27 @@ rb_match_busy(match); repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match))); - enc = rb_enc_check(str, repl); str_mod_check(str, p, len); str_frozen_check(str); rb_backref_set(match); } else { repl = rb_reg_regsub(repl, str, regs, pat); - enc = rb_enc_check(str, repl); } + enc = rb_enc_compatible(str, repl); + if (!enc) { + rb_encoding *str_enc = rb_enc_get(str); + if (coderange_scan(RSTRING_PTR(str), BEG(0), str_enc) != + ENC_CODERANGE_7BIT || + coderange_scan(RSTRING_PTR(str)+END(0), + RSTRING_LEN(str)-END(0), str_enc) != + ENC_CODERANGE_7BIT) { + rb_raise(rb_eArgError, "character encodings differ: %s and %s", + rb_enc_name(str_enc), + rb_enc_name(rb_enc_get(repl))); + } + enc = rb_enc_get(repl); + } rb_str_modify(str); rb_enc_associate(str, enc); if (OBJ_TAINTED(repl)) tainted = 1; @@ -2548,9 +2662,8 @@ long beg, n; long offset, blen, slen, len; int iter = 0; - char *buf, *bp, *sp, *cp; + char *sp, *cp; int tainted = 0; - int cr; switch (argc) { case 1: @@ -2575,25 +2688,18 @@ } blen = RSTRING_LEN(str) + 30; /* len + margin */ - dest = str_new(0, 0, blen); - buf = RSTRING_PTR(dest); - bp = buf; - sp = cp = RSTRING_PTR(str); + dest = rb_str_buf_new(blen); + sp = RSTRING_PTR(str); slen = RSTRING_LEN(str); - cr = ENC_CODERANGE(str); + cp = sp; - rb_str_locktmp(dest); do { - rb_encoding *enc; - n++; match = rb_backref_get(); regs = RMATCH(match)->regs; if (iter) { - rb_match_busy(match); val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match))); - enc = rb_enc_check(str, val); str_mod_check(str, sp, slen); if (bang) str_frozen_check(str); if (val == dest) { /* paranoid check [ruby-dev:24827] */ @@ -2603,28 +2709,16 @@ } else { val = rb_reg_regsub(repl, str, regs, pat); - enc = rb_enc_check(str, val); } - rb_enc_associate(str, enc); + if (OBJ_TAINTED(val)) tainted = 1; - if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) { - int cr2 = ENC_CODERANGE(val); - if (cr2 == ENC_CODERANGE_UNKNOWN || cr2 > cr) cr = cr2; - } - len = (bp - buf) + (beg - offset) + RSTRING_LEN(val) + 3; - if (blen < len) { - while (blen < len) blen *= 2; - len = bp - buf; - RESIZE_CAPA(dest, blen); - STR_SET_LEN(dest, blen); - buf = RSTRING_PTR(dest); - bp = buf + len; - } + len = beg - offset; /* copy pre-match substr */ - memcpy(bp, cp, len); - bp += len; - memcpy(bp, RSTRING_PTR(val), RSTRING_LEN(val)); - bp += RSTRING_LEN(val); + rb_enc_str_buf_cat(dest, cp, len, rb_enc_get(str)); + + rb_enc_str_buf_cat(dest, RSTRING_PTR(val), RSTRING_LEN(val), rb_enc_get(val)); + RB_GC_GUARD(val); + offset = END(0); if (BEG(0) == END(0)) { /* @@ -2632,9 +2726,8 @@ * in order to prevent infinite loops. */ if (RSTRING_LEN(str) <= END(0)) break; - len = rb_enc_mbclen(RSTRING_PTR(str)+END(0), RSTRING_END(str), enc); - memcpy(bp, RSTRING_PTR(str)+END(0), len); - bp += len; + len = rb_enc_mbclen(RSTRING_PTR(str)+END(0), RSTRING_END(str), rb_enc_get(str)); + rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+END(0), len, rb_enc_get(str)); offset = END(0) + len; } cp = RSTRING_PTR(str) + offset; @@ -2642,39 +2735,18 @@ beg = rb_reg_search(pat, str, offset, 0); } while (beg >= 0); if (RSTRING_LEN(str) > offset) { - len = bp - buf; - if (blen - len < RSTRING_LEN(str) - offset) { - blen = len + RSTRING_LEN(str) - offset; - RESIZE_CAPA(dest, blen); - buf = RSTRING_PTR(dest); - bp = buf + len; - } - memcpy(bp, cp, RSTRING_LEN(str) - offset); - bp += RSTRING_LEN(str) - offset; + rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, rb_enc_get(str)); } rb_backref_set(match); - *bp = '\0'; - rb_str_unlocktmp(dest); if (bang) { - if (str_independent(str) && !STR_EMBED_P(str)) { - free(RSTRING_PTR(str)); - } - STR_SET_NOEMBED(str); - STR_UNSET_NOCAPA(str); - RSTRING(str)->as.heap.ptr = buf; - RSTRING(str)->as.heap.aux.capa = blen; - RSTRING(dest)->as.heap.ptr = 0; - RSTRING(dest)->as.heap.len = 0; + rb_str_shared_replace(str, dest); } else { RBASIC(dest)->klass = rb_obj_class(str); OBJ_INFECT(dest, str); - rb_enc_copy(dest, str); str = dest; } - STR_SET_LEN(str, bp - buf); - ENC_CODERANGE_SET(str, cr); if (tainted) OBJ_TAINT(str); return str; } @@ -2782,6 +2854,7 @@ OBJ_INFECT(str, str2); rb_enc_copy(str, str2); + ENC_CODERANGE_SET(str, ENC_CODERANGE(str2)); return str; } @@ -2805,6 +2878,7 @@ STR_SET_EMBED(str); STR_SET_EMBED_LEN(str, 0); RSTRING_PTR(str)[0] = 0; + ENC_CODERANGE_CLEAR(str); return str; } Index: test/ruby/test_m17n_comb.rb =================================================================== --- test/ruby/test_m17n_comb.rb (revision 14909) +++ test/ruby/test_m17n_comb.rb (revision 14910) @@ -335,7 +335,6 @@ end def test_str_aref_substr - combination(STRINGS, STRINGS) {|s1, s2| if s1.ascii_only? || s2.ascii_only? || s1.encoding == s2.encoding t = s1[s2] @@ -1359,7 +1358,7 @@ assert_equal(s1, doit.call) next end - if !str_enc_compatible?(s1, s3) + if !str_enc_compatible?(s1.gsub(r2, ''), s3) assert_raise(ArgumentError, desc) { doit.call } next end @@ -1413,7 +1412,7 @@ assert_equal([s1, nil], doit.call) next end - if !str_enc_compatible?(s1, s3) + if !str_enc_compatible?(s1.gsub(r2, ''), s3) assert_raise(ArgumentError, desc) { doit.call } next end -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml