[前][次][番号順一覧][スレッド一覧]

ruby-changes:11840

From: matz <ko1@a...>
Date: Wed, 20 May 2009 13:44:53 +0900 (JST)
Subject: [ruby-changes:11840] Ruby:r23495 (trunk): * encoding.c (rb_enc_fast_mbclen): faster mbclen for strings known

matz	2009-05-20 13:44:36 +0900 (Wed, 20 May 2009)

  New Revision: 23495

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=rev&revision=23495

  Log:
    * encoding.c (rb_enc_fast_mbclen): faster mbclen for strings known
      to be valid.
    * string.c (enc_strlen): coderange-aware version of
      rb_enc_strlen().  Uses rb_enc_fast_mbclen() if the coderange is
      7bit or valid.
    
    * string.c (str_gsub): use rb_enc_fast_mbclen().
    
    * string.c (rb_str_reverse, rb_str_split_m, rb_str_each_char,
      scan_once): ditto.

  Modified files:
    trunk/ChangeLog
    trunk/encoding.c
    trunk/include/ruby/encoding.h
    trunk/string.c

Index: encoding.c
===================================================================
--- encoding.c	(revision 23494)
+++ encoding.c	(revision 23495)
@@ -727,6 +727,12 @@
 }
 
 int
+rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
+{
+    return ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
+}
+
+int
 rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
 {
     int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
Index: include/ruby/encoding.h
===================================================================
--- include/ruby/encoding.h	(revision 23494)
+++ include/ruby/encoding.h	(revision 23495)
@@ -112,6 +112,9 @@
 /* -> mbclen (no error notification: 0 < ret <= e-p, no exception) */
 int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc);
 
+/* -> mbclen (only for valid encoding) */
+int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc);
+
 /* -> chlen, invalid or needmore */
 int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc);
 #define MBCLEN_CHARFOUND_P(ret)     ONIGENC_MBCLEN_CHARFOUND_P(ret)
Index: ChangeLog
===================================================================
--- ChangeLog	(revision 23494)
+++ ChangeLog	(revision 23495)
@@ -1,3 +1,17 @@
+Wed May 20 06:25:29 2009  Yukihiro Matsumoto  <matz@r...>
+
+	* encoding.c (rb_enc_fast_mbclen): faster mbclen for strings known
+	  to be valid.
+
+	* string.c (enc_strlen): coderange specified version of
+	  rb_enc_strlen().  use rb_enc_fast_mbclen() if coderange is 7bit
+	  or valid.
+
+	* string.c (str_gsub): use rb_enc_fast_mbclen().
+
+	* string.c (rb_str_reverse, rb_str_split_m, rb_str_each_char,
+	  scan_once): ditto.
+
 Wed May 20 06:20:05 2009  Yukihiro Matsumoto  <matz@r...>
 
 	* lib/tempfile.rb (Tempfile#unlink): close first for Windows.  a
Index: string.c
===================================================================
--- string.c	(revision 23494)
+++ string.c	(revision 23495)
@@ -851,8 +851,8 @@
     return str;
 }
 
-long
-rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
+static inline long
+enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
 {
     long c;
     const char *q;
@@ -862,17 +862,32 @@
     }
     else if (rb_enc_asciicompat(enc)) {
         c = 0;
-        while (p < e) {
-            if (ISASCII(*p)) {
-                q = search_nonascii(p, e);
-                if (!q)
-                    return c + (e - p);
-                c += q - p;
-                p = q;
-            }
-            p += rb_enc_mbclen(p, e, enc);
-            c++;
-        }
+	if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
+	    while (p < e) {
+		if (ISASCII(*p)) {
+		    q = search_nonascii(p, e);
+		    if (!q)
+			return c + (e - p);
+		    c += q - p;
+		    p = q;
+		}
+		p += rb_enc_fast_mbclen(p, e, enc);
+		c++;
+	    }
+	}
+	else {
+	    while (p < e) {
+		if (ISASCII(*p)) {
+		    q = search_nonascii(p, e);
+		    if (!q)
+			return c + (e - p);
+		    c += q - p;
+		    p = q;
+		}
+		p += rb_enc_mbclen(p, e, enc);
+		c++;
+	    }
+	}
         return c;
     }
 
@@ -883,6 +898,12 @@
 }
 
 long
+rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
+{
+    return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
+}
+
+long
 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
 {
     long c;
@@ -964,10 +985,12 @@
     if (!enc) enc = STR_ENC_GET(str);
     p = RSTRING_PTR(str);
     e = RSTRING_END(str);
+    cr = ENC_CODERANGE(str);
 #ifdef NONASCII_MASK
     if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
         enc == rb_utf8_encoding()) {
-        VALUE len = 0;
+
+	VALUE len = 0;
 	if ((int)sizeof(VALUE) * 2 < e - p) {
 	    const VALUE *s, *t;
 	    const VALUE lowbits = sizeof(VALUE) - 1;
@@ -1419,7 +1442,7 @@
         return pos;
     else {
 	char *p = RSTRING_PTR(str);
-        return rb_enc_strlen(p, p + pos, STR_ENC_GET(str));
+        return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
     }
 }
 
@@ -3721,7 +3744,7 @@
 	     * in order to prevent infinite loops.
 	     */
 	    if (RSTRING_LEN(str) <= end0) break;
-	    len = rb_enc_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
+	    len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
             rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
 	    offset = end0 + len;
 	}
@@ -3955,6 +3978,16 @@
 		*--p = *s++;
 	    }
 	}
+	else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
+	    while (s < e) {
+		int clen = rb_enc_fast_mbclen(s, e, enc);
+
+		if (clen > 1 || (*s & 0x80)) single = 0;
+		p -= clen;
+		memcpy(p, s, clen);
+		s += clen;
+	    }
+	}
 	else {
 	    while (s < e) {
 		int clen = rb_enc_mbclen(s, e, enc);
@@ -5610,16 +5643,16 @@
 		}
 		else if (last_null == 1) {
 		    rb_ary_push(result, rb_str_subseq(str, beg,
-						      rb_enc_mbclen(RSTRING_PTR(str)+beg,
-								    RSTRING_END(str),
-								    enc)));
+						      rb_enc_fast_mbclen(RSTRING_PTR(str)+beg,
+									 RSTRING_END(str),
+									 enc)));
 		    beg = start;
 		}
 		else {
                     if (RSTRING_PTR(str)+start == RSTRING_END(str))
                         start++;
                     else
-                        start += rb_enc_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc);
+                        start += rb_enc_fast_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc);
 		    last_null = 1;
 		    continue;
 		}
@@ -5889,9 +5922,19 @@
     ptr = RSTRING_PTR(str);
     len = RSTRING_LEN(str);
     enc = rb_enc_get(str);
-    for (i = 0; i < len; i += n) {
-	n = rb_enc_mbclen(ptr + i, ptr + len, enc);
-	rb_yield(rb_str_subseq(str, i, n));
+    switch (ENC_CODERANGE(str)) {
+      case ENC_CODERANGE_VALID:
+      case ENC_CODERANGE_7BIT:
+	for (i = 0; i < len; i += n) {
+	    n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
+	    rb_yield(rb_str_subseq(str, i, n));
+	}
+	break;
+      default:
+	for (i = 0; i < len; i += n) {
+	    n = rb_enc_mbclen(ptr + i, ptr + len, enc);
+	    rb_yield(rb_str_subseq(str, i, n));
+	}
     }
     return str;
 }
@@ -6340,8 +6383,8 @@
 	     * Always consume at least one character of the input string
 	     */
 	    if (RSTRING_LEN(str) > END(0))
-		*start = END(0)+rb_enc_mbclen(RSTRING_PTR(str)+END(0),
-					      RSTRING_END(str), enc);
+		*start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
+						   RSTRING_END(str), enc);
 	    else
 		*start = END(0)+1;
 	}

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/

[前][次][番号順一覧][スレッド一覧]