[前][次][番号順一覧][スレッド一覧]

ruby-changes:7856

From: matz <ko1@a...>
Date: Tue, 16 Sep 2008 09:48:05 +0900 (JST)
Subject: [ruby-changes:7856] Ruby:r19377 (trunk): * string.c ():

matz	2008-09-16 09:47:20 +0900 (Tue, 16 Sep 2008)

  New Revision: 19377

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=rev&revision=19377

  Log:
    * string.c ():
    
    * string.c ():
    
    * string.c (single_byte_optimizable): make function inline.  based
      on a patch from Michael Selig <michael.selig at fs.com.au> in
      [ruby-core:18532].
    
    * string.c (str_modify_keep_cr): new function that acts like
      rb_str_modify(), but doesn't clear the coderange.
    
    * string.c (rb_str_casecmp): specialized for single byte strings.
    
    * string.c (rb_str_splice): preserve coderange.
    
    * string.c (rb_str_slice_bang, rb_str_reverse_bang,
      rb_str_upcase_bang, rb_str_downcase_bang, tr_trans,
      rb_str_capitalize_bang, rb_str_swapcase_bang,
      rb_str_delete_bang, rb_str_chop_bang, rb_str_chomp_bang,
      rb_str_lstrip_bang, rb_str_rstrip_bang): ditto.
    
    * string.c (rb_str_clear): preset coderange.
    
    * string.c (rb_str_split_m): specialized for splitting with a
      string.

  Modified files:
    trunk/ChangeLog
    trunk/string.c
    trunk/test/ruby/test_m17n_comb.rb

Index: ChangeLog
===================================================================
--- ChangeLog	(revision 19376)
+++ ChangeLog	(revision 19377)
@@ -18,6 +18,38 @@
 
 	* string.c (prefix_escape): ditto.
 
+Tue Sep 16 01:50:21 2008  Yukihiro Matsumoto  <matz@r...>
+
+	* string.c ():
+
+Tue Sep 16 01:49:44 2008  Yukihiro Matsumoto  <matz@r...>
+
+	* string.c ():
+
+Tue Sep 16 01:47:07 2008  Yukihiro Matsumoto  <matz@r...>
+
+	* string.c (single_byte_optimizable): make function inline.  based
+	  on a patch from Michael Selig <michael.selig at fs.com.au> in
+	  [ruby-core:18532].
+
+	* string.c (str_modify_keep_cr): new function that acts like
+	  rb_str_modify(), but doesn't clear the coderange.
+
+	* string.c (rb_str_casecmp): specialized for single byte strings.
+
+	* string.c (rb_str_splice): preserve coderange.
+
+	* string.c (rb_str_slice_bang, rb_str_reverse_bang,
+	  rb_str_upcase_bang, rb_str_downcase_bang, tr_trans,
+	  rb_str_capitalize_bang, rb_str_swapcase_bang,
+	  rb_str_delete_bang, rb_str_chop_bang, rb_str_chomp_bang,
+	  rb_str_lstrip_bang, rb_str_rstrip_bang): ditto.
+
+	* string.c (rb_str_clear): preset coderange.
+
+	* string.c (rb_str_split_m): specialized for splitting with a
+	  string.
+
 Tue Sep 16 00:57:56 2008  Tanaka Akira  <akr@f...>
 
 	* re.c (rb_reg_quote): use rb_enc_mbcput to generate ASCII
Index: string.c
===================================================================
--- string.c	(revision 19376)
+++ string.c	(revision 19377)
@@ -112,7 +112,7 @@
 
 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
 
-static int
+static inline int
 single_byte_optimizable(VALUE str)
 {
     rb_encoding *enc;
@@ -1059,6 +1059,17 @@
     ENC_CODERANGE_CLEAR(str);
 }
 
+/* As rb_str_modify(), but don't clear coderange */
+static void
+str_modify_keep_cr(VALUE str)
+{
+    if (!str_independent(str))
+	str_make_independent(str);
+    if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
+	/* Force re-scan later */
+	ENC_CODERANGE_CLEAR(str);
+}
+
 void
 rb_str_associate(VALUE str, VALUE add)
 {
@@ -1283,12 +1294,27 @@
     rb_encoding *enc = STR_ENC_GET(str);
     VALUE str2;
     char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
-    int singlebyte;
 
     if (len < 0) return Qnil;
     if (!RSTRING_LEN(str)) {
 	len = 0;
     }
+    if (single_byte_optimizable(str)) {
+	if (beg > RSTRING_LEN(str)) return Qnil;
+	if (beg < 0) {
+	    beg += RSTRING_LEN(str);
+	    if (beg < 0) return Qnil;
+	}
+	if (beg + len > RSTRING_LEN(str))
+	    len = RSTRING_LEN(str) - beg;
+	if (len <= 0) {
+	    len = 0;
+	    p = 0;
+	}
+	else
+	    p = s + beg;
+	goto sub;
+    }
     if (beg < 0) {
 	if (len > -beg) len = -beg;
 	if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
@@ -1309,7 +1335,6 @@
     else if (beg > 0 && beg > str_strlen(str, enc)) {
 	return Qnil;
     }
-    singlebyte = single_byte_optimizable(str);
     if (len == 0) {
 	p = 0;
     }
@@ -1320,17 +1345,24 @@
         len = str_utf8_offset(p, e, len);
     }
 #endif
-    else if ((p = str_nth(s, e, beg, enc, singlebyte)) == e) {
-	len = 0;
-    }
     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
-        if (len * rb_enc_mbmaxlen(enc) > e - p)
+	int char_sz = rb_enc_mbmaxlen(enc);
+
+	p = s + beg * char_sz;
+	if (p > e) {
+	    p = e;
+	    len = 0;
+	}
+        else if (len * char_sz > e - p)
             len = e - p;
         else
-	    len *= rb_enc_mbmaxlen(enc);
+	    len *= char_sz;
     }
+    else if ((p = str_nth(s, e, beg, enc, 0)) == e) {
+	len = 0;
+    }
     else {
-	len = str_offset(p, e, len, enc, singlebyte);
+	len = str_offset(p, e, len, enc, 0);
     }
   sub:
     if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
@@ -2067,19 +2099,33 @@
 
     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
-    while (p1 < p1end && p2 < p2end) {
-	unsigned int c1 = rb_enc_codepoint(p1, p1end, enc);
-	unsigned int c2 = rb_enc_codepoint(p2, p2end, enc);
+    if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
+	while (p1 < p1end && p2 < p2end) {
+	    if (*p1 != *p2) {
+		unsigned int c1 = rb_enc_toupper(*p1 & 0xff, enc);
+		unsigned int c2 = rb_enc_toupper(*p2 & 0xff, enc);
+		if (c1 > c2) return INT2FIX(1);
+		if (c1 < c2) return INT2FIX(-1);
+	    }
+	    p1++;
+	    p2++;
+	}
+    }
+    else {
+	while (p1 < p1end && p2 < p2end) {
+	    unsigned int c1 = rb_enc_codepoint(p1, p1end, enc);
+	    unsigned int c2 = rb_enc_codepoint(p2, p2end, enc);
 
-	if (c1 != c2) {
-	    c1 = rb_enc_toupper(c1, enc);
-	    c2 = rb_enc_toupper(c2, enc);
-	    if (c1 > c2) return INT2FIX(1);
-	    if (c1 < c2) return INT2FIX(-1);
+	    if (c1 != c2) {
+		c1 = rb_enc_toupper(c1, enc);
+		c2 = rb_enc_toupper(c2, enc);
+		if (c1 > c2) return INT2FIX(1);
+		if (c1 < c2) return INT2FIX(-1);
+	    }
+	    len = rb_enc_codelen(c1, enc);
+	    p1 += len;
+	    p2 += len;
 	}
-	len = rb_enc_codelen(c1, enc);
-	p1 += len;
-	p2 += len;
     }
     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
@@ -2897,11 +2943,11 @@
     char *p, *e;
     rb_encoding *enc;
     int singlebyte = single_byte_optimizable(str);
+    int cr;
 
     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
 
     StringValue(val);
-    rb_str_modify(str);
     enc = rb_enc_check(str, val);
     slen = str_strlen(str, enc);
 
@@ -2918,6 +2964,7 @@
     if (slen < len || slen < beg + len) {
 	len = slen - beg;
     }
+    str_modify_keep_cr(str);
     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
     if (!p) p = RSTRING_END(str);
     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
@@ -2927,6 +2974,9 @@
     len = e - p;		/* physical length */
     rb_str_splice_0(str, beg, len, val);
     rb_enc_associate(str, enc);
+    cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
+    if (cr != ENC_CODERANGE_BROKEN)
+	ENC_CODERANGE_SET(str, cr);
 }
 
 void
@@ -3117,7 +3167,7 @@
     for (i=0; i<argc; i++) {
 	buf[i] = argv[i];
     }
-    rb_str_modify(str);
+    str_modify_keep_cr(str);
     buf[i] = rb_str_new(0,0);
     result = rb_str_aref_m(argc, buf, str);
     if (!NIL_P(result)) {
@@ -3527,7 +3577,10 @@
     STR_SET_EMBED(str);
     STR_SET_EMBED_LEN(str, 0);
     RSTRING_PTR(str)[0] = 0;
-    ENC_CODERANGE_CLEAR(str);
+    if (rb_enc_asciicompat(STR_ENC_GET(str)))
+	ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
+    else
+	ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
     return str;
 }
 
@@ -3659,22 +3712,15 @@
     if (RSTRING_LEN(str) > 1) {
 	if (single_byte_optimizable(str)) {
 	    char *s, *e, c;
-	    int cr = ENC_CODERANGE(str);
-	    int single = 1;
 
-	    rb_str_modify(str);
+	    str_modify_keep_cr(str);
 	    s = RSTRING_PTR(str);
 	    e = RSTRING_END(str) - 1;
 	    while (s < e) {
 		c = *s;
-		if (*s & 0x80) single = 0;
 		*s++ = *e;
  		*e-- = c;
 	    }
-	    if (cr == ENC_CODERANGE_UNKNOWN && single) {
-		cr = ENC_CODERANGE_7BIT;
-	    }
-	    ENC_CODERANGE_SET(str, cr);
 	}
 	else {
 	    rb_str_shared_replace(str, rb_str_reverse(str));
@@ -4036,23 +4082,46 @@
     rb_encoding *enc;
     char *s, *send;
     int modify = 0;
-    int cr = ENC_CODERANGE(str);
 
-    rb_str_modify(str);
+    str_modify_keep_cr(str);
     enc = STR_ENC_GET(str);
     s = RSTRING_PTR(str); send = RSTRING_END(str);
-    while (s < send) {
-	unsigned int c = rb_enc_codepoint(s, send, enc);
+    if (single_byte_optimizable(str)) {
+	while (s < send) {
+	    unsigned int c = *(unsigned char*)s;
 
-	if (rb_enc_islower(c, enc)) {
-	    /* assuming toupper returns codepoint with same size */
-	    rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
-	    modify = 1;
+	    if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
+		*s = 'A' + (c - 'a');
+		modify = 1;
+	    }
+	    s++;
 	}
-	s += rb_enc_codelen(c, enc);
     }
+    else {
+	int ascompat = rb_enc_asciicompat(enc);
 
-    ENC_CODERANGE_SET(str, cr);
+	while (s < send) {
+	    unsigned int c;
+
+	    if (ascompat && (c = *(unsigned char*)s) < 0x80) {
+		if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
+		    *s = 'A' + (c - 'a');
+		    modify = 1;
+		}
+		s++;
+	    }
+	    else {
+		c = rb_enc_codepoint(s, send, enc);
+		if (rb_enc_islower(c, enc)) {
+		    /* assuming toupper returns codepoint with same size */
+		    rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
+		    modify = 1;
+		}
+		s += rb_enc_codelen(c, enc);
+	    }
+	}
+    }
+
     if (modify) return str;
     return Qnil;
 }
@@ -4094,23 +4163,46 @@
     rb_encoding *enc;
     char *s, *send;
     int modify = 0;
-    int cr = ENC_CODERANGE(str);
 
-    rb_str_modify(str);
+    str_modify_keep_cr(str);
     enc = STR_ENC_GET(str);
     s = RSTRING_PTR(str); send = RSTRING_END(str);
-    while (s < send) {
-	unsigned int c = rb_enc_codepoint(s, send, enc);
+    if (single_byte_optimizable(str)) {
+	while (s < send) {
+	    unsigned int c = *(unsigned char*)s;
 
-	if (rb_enc_isupper(c, enc)) {
-	    /* assuming toupper returns codepoint with same size */
-	    rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
-	    modify = 1;
+	    if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
+		*s = 'a' + (c - 'A');
+		modify = 1;
+	    }
+	    s++;
 	}
-	s += rb_enc_codelen(c, enc);
     }
+    else {
+	int ascompat = rb_enc_asciicompat(enc);
 
-    ENC_CODERANGE_SET(str, cr);
+	while (s < send) {
+	    unsigned int c;
+
+	    if (ascompat && (c = *(unsigned char*)s) < 0x80) {
+		if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
+		    *s = 'a' + (c - 'A');
+		    modify = 1;
+		}
+		s++;
+	    }
+	    else {
+		c = rb_enc_codepoint(s, send, enc);
+		if (rb_enc_isupper(c, enc)) {
+		    /* assuming toupper returns codepoint with same size */
+		    rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
+		    modify = 1;
+		}
+		s += rb_enc_codelen(c, enc);
+	    }
+	}
+    }
+
     if (modify) return str;
     return Qnil;
 }
@@ -4158,9 +4250,8 @@
     char *s, *send;
     int modify = 0;
     unsigned int c;
-    int cr = ENC_CODERANGE(str);
 
-    rb_str_modify(str);
+    str_modify_keep_cr(str);
     enc = STR_ENC_GET(str);
     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
     s = RSTRING_PTR(str); send = RSTRING_END(str);
@@ -4180,7 +4271,6 @@
 	s += rb_enc_codelen(c, enc);
     }
 
-    ENC_CODERANGE_SET(str, cr);
     if (modify) return str;
     return Qnil;
 }
@@ -4223,9 +4313,8 @@
     rb_encoding *enc;
     char *s, *send;
     int modify = 0;
-    int cr = ENC_CODERANGE(str);
 
-    rb_str_modify(str);
+    str_modify_keep_cr(str);
     enc = STR_ENC_GET(str);
     s = RSTRING_PTR(str); send = RSTRING_END(str);
     while (s < send) {
@@ -4237,14 +4326,13 @@
 	    modify = 1;
 	}
 	else if (rb_enc_islower(c, enc)) {
-	    /* assuming toupper returns codepoint with same size */
+	    /* assuming tolower returns codepoint with same size */
 	    rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
 	    modify = 1;
 	}
-	s += rb_enc_codelen(c, enc);
+	s += rb_enc_mbclen(s, send, enc);
     }
 
-    ENC_CODERANGE_SET(str, cr);
     if (modify) return str;
     return Qnil;
 }
@@ -4326,7 +4414,7 @@
     char *s, *send;
     VALUE hash = 0;
     int singlebyte = single_byte_optimizable(str);
-    int cr, cr1, cr2;
+    int cr;
 
     StringValue(src);
     StringValue(repl);
@@ -4336,11 +4424,6 @@
     }
 
     cr = ENC_CODERANGE(str);
-    cr1 = ENC_CODERANGE(src);
-    cr2 = ENC_CODERANGE(repl);
-    if (cr != cr1 || cr1 != cr2) {
-	cr = ENC_CODERANGE_UNKNOWN;
-    }
     e1 = rb_enc_check(str, src);
     e2 = rb_enc_check(str, repl);
     if (e1 == e2) {
@@ -4404,7 +4487,7 @@
 	}
     }
 
-    rb_str_modify(str);
+    str_modify_keep_cr(str);
     s = RSTRING_PTR(str); send = RSTRING_END(str);
     if (sflag) {
 	int clen, tlen, max = RSTRING_LEN(str);
@@ -4525,8 +4608,10 @@
 	RSTRING(str)->as.heap.aux.capa = max;
     }
     
-    ENC_CODERANGE_SET(str, cr);
     if (modify) {
+	cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(repl));
+	if (cr != ENC_CODERANGE_BROKEN)
+	    ENC_CODERANGE_SET(str, cr);
 	rb_enc_associate(str, enc);
 	return str;
     }
@@ -4665,11 +4750,9 @@
     char *s, *send, *t;
     VALUE del = 0, nodel = 0;
     int modify = 0;
-    int i;
-    int cr;
+    int i, ascompat;
 
     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
-    cr = ENC_CODERANGE(str);
     if (argc < 1) {
 	rb_raise(rb_eArgError, "wrong number of arguments");
     }
@@ -4681,27 +4764,41 @@
 	tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
     }
 
-    rb_str_modify(str);
+    str_modify_keep_cr(str);
+    ascompat = rb_enc_asciicompat(enc);
     s = t = RSTRING_PTR(str);
-    if (!s || RSTRING_LEN(str) == 0) return Qnil;
     send = RSTRING_END(str);
     while (s < send) {
-	unsigned int c = rb_enc_codepoint(s, send, enc);
-	int clen = rb_enc_codelen(c, enc);
+	unsigned int c;
+	int clen;
 
-	if (tr_find(c, squeez, del, nodel)) {
-	    modify = 1;
+	if (ascompat && (c = *(unsigned char*)s) < 0x80) {
+	    if (squeez[c]) {
+		modify = 1;
+	    }
+	    else {
+		if (t != s) *t = c;
+		t++;
+	    }
+	    s++;
 	}
 	else {
-	    if (t != s) rb_enc_mbcput(c, t, enc);
-	    t += clen;
+	    c = rb_enc_codepoint(s, send, enc);
+	    clen = rb_enc_codelen(c, enc);
+
+	    if (tr_find(c, squeez, del, nodel)) {
+		modify = 1;
+	    }
+	    else {
+		if (t != s) rb_enc_mbcput(c, t, enc);
+		t += clen;
+	    }
+	    s += clen;
 	}
-	s += clen;
     }
     *t = '\0';
     STR_SET_LEN(str, t - RSTRING_PTR(str));
 
-    ENC_CODERANGE_SET(str, cr);
     if (modify) return str;
     return Qnil;
 }
@@ -4748,7 +4845,6 @@
     int save, modify = 0;
     int i;
     int ascompat, singlebyte = single_byte_optimizable(str);
-    int cr = ENC_CODERANGE(str);
 
     if (argc == 0) {
 	enc = STR_ENC_GET(str);
@@ -4759,11 +4855,13 @@
 
 	    StringValue(s);
 	    enc = rb_enc_check(str, s);
+	    if (singlebyte && !single_byte_optimizable(s))
+		singlebyte = 0;
 	    tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
 	}
     }
 
-    rb_str_modify(str);
+    str_modify_keep_cr(str);
     s = t = RSTRING_PTR(str);
     if (!s || RSTRING_LEN(str) == 0) return Qnil;
     send = RSTRING_END(str);
@@ -4808,7 +4906,6 @@
 	modify = 1;
     }
 
-    ENC_CODERANGE_SET(str, cr);
     if (modify) return str;
     return Qnil;
 }
@@ -4991,7 +5088,7 @@
     rb_encoding *enc;
     VALUE spat;
     VALUE limit;
-    int awk_split = Qfalse;
+    enum {awk, string, regexp} split_type;
     long beg, end, i = 0;
     int lim = 0;
     VALUE result, tmp;
@@ -5013,37 +5110,41 @@
 	    spat = rb_fs;
 	    goto fs_set;
 	}
-	awk_split = Qtrue;
+	split_type = awk;
     }
     else {
       fs_set:
 	if (TYPE(spat) == T_STRING) {
 	    rb_encoding *enc2 = STR_ENC_GET(spat);
 
-	    if (rb_enc_mbminlen(enc2) == 1) {
+	    split_type = string;
+	    if (RSTRING_LEN(spat) == 0) {
+		/* Special case - split into chars */
+		spat = rb_reg_regcomp(spat);
+		split_type = regexp;
+	    }
+	    else if (rb_enc_mbminlen(enc2) == 1) {
 		if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
-		    awk_split = Qtrue;
+		    split_type = awk;
 		}
 	    }
 	    else {
 		int l;
 		if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
 		    RSTRING_LEN(spat) == l) {
-		    awk_split = Qtrue;
+		    split_type = awk;
 		}
 	    }
-	    if (!awk_split) {
-		spat = rb_reg_regcomp(rb_reg_quote(spat));
-	    }
 	}
 	else {
 	    spat = get_pat(spat, 1);
+	    split_type = regexp;
 	}
     }
 
     result = rb_ary_new();
     beg = 0;
-    if (awk_split) {
+    if (split_type == awk) {
 	char *ptr = RSTRING_PTR(str);
 	char *eptr = RSTRING_END(str);
 	char *bptr = ptr;
@@ -5077,6 +5178,33 @@
 	    }
 	}
     }
+    else if (split_type == string) {
+	char *ptr = RSTRING_PTR(str);
+	char *eptr = RSTRING_END(str);
+	char *sptr = RSTRING_PTR(spat);
+	int slen = RSTRING_LEN(spat);
+
+	if (is_broken_string(str)) {
+	    rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
+	}
+	if (is_broken_string(spat)) {
+	    rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
+	}
+	enc = rb_enc_check(str, spat);
+	while (ptr < eptr &&
+	       (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
+	    /* Check we are at the start of a char */
+	    char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
+	    if (t != ptr + end) {
+		ptr = t;
+		continue;
+	    }
+	    rb_ary_push(result, rb_str_substr(str, ptr - RSTRING_PTR(str), end));
+	    ptr += end + slen;
+	    if (!NIL_P(limit) && lim <= ++i) break;
+	}
+	beg = ptr - RSTRING_PTR(str);
+    }
     else {
 	long start = beg;
 	long idx;
@@ -5410,7 +5538,7 @@
 {
     if (RSTRING_LEN(str) > 0) {
 	long len;
-	rb_str_modify(str);
+	str_modify_keep_cr(str);
 	len = chopped_length(str);
 	STR_SET_LEN(str, len);
 	RSTRING_PTR(str)[len] = '\0';
@@ -5472,7 +5600,7 @@
 	rs = rb_rs;
 	if (rs == rb_default_rs) {
 	  smart_chomp:
-	    rb_str_modify(str);
+	    str_modify_keep_cr(str);
 	    enc = rb_enc_get(str);
 	    if (rb_enc_mbminlen(enc) > 1) {
 		pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
@@ -5524,7 +5652,7 @@
 		len--;
 	}
 	if (len < RSTRING_LEN(str)) {
-	    rb_str_modify(str);
+	    str_modify_keep_cr(str);
 	    STR_SET_LEN(str, len);
 	    RSTRING_PTR(str)[len] = '\0';
 	    return str;
@@ -5546,7 +5674,7 @@
 	 memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
 	if (rb_enc_left_char_head(p, pp, e, enc) != pp)
 	    return Qnil;
-	rb_str_modify(str);
+	str_modify_keep_cr(str);
 	STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
 	RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
 	return str;
@@ -5599,10 +5727,8 @@
 {
     rb_encoding *enc;
     char *s, *t, *e;
-    int cr = ENC_CODERANGE(str);
 
-    rb_str_modify(str);
-    ENC_CODERANGE_SET(str, cr);
+    str_modify_keep_cr(str);
     enc = STR_ENC_GET(str);
     s = RSTRING_PTR(str);
     if (!s || RSTRING_LEN(str) == 0) return Qnil;
@@ -5616,7 +5742,6 @@
     }
 
     if (s > RSTRING_PTR(str)) {
-	rb_str_modify(str);
 	STR_SET_LEN(str, t-s);
 	memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
 	RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
@@ -5663,11 +5788,7 @@
 {
     rb_encoding *enc;
     char *s, *t, *e;
-    int space_seen = Qfalse;
-    int cr = ENC_CODERANGE(str);
 
-    rb_str_modify(str);
-    ENC_CODERANGE_SET(str, cr);
     enc = STR_ENC_GET(str);
     s = RSTRING_PTR(str);
     if (!s || RSTRING_LEN(str) == 0) return Qnil;
@@ -5679,25 +5800,21 @@
       
 	/* remove trailing spaces */
 	while (s < t && rb_enc_isspace(*(t-1), enc)) t--;
-    } else {
-        while (s < e) {
-	    unsigned int cc = rb_enc_codepoint(s, e, enc);
+    }
+    else {
+	char *tp;
 
-	    if (!cc || rb_enc_isspace(cc, enc)) {
-	        if (!space_seen) t = s;
-		space_seen = Qtrue;
-	    }
-	    else {
-	        space_seen = Qfalse;
-	    }
-	    s += rb_enc_codelen(cc, enc);
+        while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
+	    if (!rb_enc_isspace(rb_enc_codepoint(tp, e, enc), enc)) break;
+	    t = tp;
 	}
-	if (!space_seen) t = s;
     }
+    if (t < e) {
+	int len = t-RSTRING_PTR(str);
 
-    if (t < e) {
-	STR_SET_LEN(str, t-RSTRING_PTR(str));
-	RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
+	str_modify_keep_cr(str);
+	STR_SET_LEN(str, len);
+	RSTRING_PTR(str)[len] = '\0';
 	return str;
     }
     return Qnil;
@@ -6094,7 +6211,7 @@
     const char *f = " ";
     long n, llen, rlen;
     volatile VALUE pad;
-    int singlebyte = 1;
+    int singlebyte = 1, cr;
 
     rb_scan_args(argc, argv, "11", &w, &pad);
     enc = STR_ENC_GET(str);
@@ -6115,6 +6232,7 @@
     n = width - len;
     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
     rlen = n - llen;
+    cr = ENC_CODERANGE(str);
     res = rb_str_new5(str, 0, RSTRING_LEN(str)+n*flen/fclen+2);
     p = RSTRING_PTR(res);
     while (llen) {
@@ -6160,6 +6278,10 @@
     OBJ_INFECT(res, str);
     if (!NIL_P(pad)) OBJ_INFECT(res, pad);
     rb_enc_associate(res, enc);
+    if (argc == 2)
+	cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
+    if (cr != ENC_CODERANGE_BROKEN)
+	ENC_CODERANGE_SET(res, cr);
     return res;
 }
 
Index: test/ruby/test_m17n_comb.rb
===================================================================
--- test/ruby/test_m17n_comb.rb	(revision 19376)
+++ test/ruby/test_m17n_comb.rb	(revision 19377)
@@ -1158,11 +1158,11 @@
   def test_str_split
     combination(STRINGS, STRINGS) {|s1, s2|
       if !s2.valid_encoding?
-        assert_raise(RegexpError) { s1.split(s2) }
+        assert_raise(ArgumentError, RegexpError) { s1.split(s2) }
         next
       end
       if !s1.ascii_only? && !s2.ascii_only? && s1.encoding != s2.encoding
-        assert_raise(ArgumentError) { s1.split(s2) }
+        assert_raise(ArgumentError, EncodingCompatibilityError) { s1.split(s2) }
         next
       end
       if !s1.valid_encoding?

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/

[前][次][番号順一覧][スレッド一覧]