[前][次][番号順一覧][スレッド一覧]

ruby-changes:4205

From: ko1@a...
Date: Wed, 5 Mar 2008 22:55:24 +0900 (JST)
Subject: [ruby-changes:4205] naruse - Ruby:r15695 (trunk): * string.c (is_utf8_lead_byte, count_utf8_lead_bytes_with_ulong):

naruse	2008-03-05 22:54:36 +0900 (Wed, 05 Mar 2008)

  New Revision: 15695

  Modified files:
    trunk/ChangeLog
    trunk/string.c

  Log:
    * string.c (is_utf8_lead_byte, count_utf8_lead_bytes_with_ulong):
      defined for UTF-8 optimization.
    
    * string.c (str_strlen): use is_utf8_lead_byte and
      count_utf8_lead_bytes_with_ulong.
    
    * string.c (str_utf8_nth) ditto.

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/string.c?r1=15695&r2=15694&diff_format=u
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/ChangeLog?r1=15695&r2=15694&diff_format=u

Index: ChangeLog
===================================================================
--- ChangeLog	(revision 15694)
+++ ChangeLog	(revision 15695)
@@ -1,3 +1,13 @@
+Wed Mar 05 22:49:20 2008  NARUSE, Yui  <naruse@r...>
+
+	* string.c (is_utf8_lead_byte, count_utf8_lead_bytes_with_ulong):
+	  defined for UTF-8 optimization.
+
+	* string.c (str_strlen): use is_utf8_lead_byte and
+	  count_utf8_lead_bytes_with_ulong.
+
+	* string.c (str_utf8_nth) ditto.
+
 Wed Mar  5 17:53:01 2008  Nobuyoshi Nakada  <nobu@r...>
 
 	* file.c (rb_file_flock): returns false on EAGAIN if non-blocking.
Index: string.c
===================================================================
--- string.c	(revision 15694)
+++ string.c	(revision 15695)
@@ -755,6 +755,24 @@
     return c;
 }
 
+#ifdef NONASCII_MASK
+#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
+static inline const long
+count_utf8_lead_bytes_with_ulong(const unsigned long *s)
+{
+    unsigned long d = *s;
+    d |= ~(d>>1);
+    d >>= 6;
+    d &= NONASCII_MASK >> 3;
+    d += (d>>8);
+    d += (d>>16);
+#if NONASCII_MASK == 0x8080808080808080UL
+    d += (d>>32);
+#endif
+    return (long)(d&0xF);
+}
+#endif
+
 static long
 str_strlen(VALUE str, rb_encoding *enc)
 {
@@ -774,26 +792,19 @@
 	    const VALUE lowbits = sizeof(unsigned long) - 1;
 	    s = (const unsigned long*)(~lowbits & ((VALUE)p + lowbits));
 	    t = (const unsigned long*)(~lowbits & (VALUE)e);
-	    for (len=0; p<(const char *)s; p++) {
-		if (((*p)&0xC0) != 0x80) len++;
+	    while (p < (const char *)s) {
+		if (is_utf8_lead_byte(*p)) len++;
+		p++;
 	    }
 	    while (s < t) {
-		unsigned long d = *s;
-		d = ~d | (d<<1);
-		d &= NONASCII_MASK;
-		d >>= 7;
-		d += (d>>8);
-		d += (d>>16);
-#if NONASCII_MASK == 0x8080808080808080UL
-		d = d + (d>>32);
-#endif
-		len += (long)(d&0xF);
+		len += count_utf8_lead_bytes_with_ulong(s);
 		s++;
 	    }
-	    p = (const char *)t;
+	    p = (const char *)s;
 	}
-	for (; p<e; p++) {
-	    if (((*p)&0xC0) != 0x80) len++;
+	while (p < e) {
+	    if (is_utf8_lead_byte(*p)) len++;
+	    p++;
 	}
 	return len;
     }
@@ -1162,33 +1173,22 @@
 	const VALUE lowbits = sizeof(unsigned long) - 1;
 	s = (const unsigned long*)(~lowbits & ((VALUE)p + lowbits));
 	t = (const unsigned long*)(~lowbits & (VALUE)e);
-	for (; p<(const char *)s && 0<nth; p++) {
-	    if (((*p)&0xC0) != 0x80) nth--;
+	while (p < (const char *)s) {
+	    if (is_utf8_lead_byte(*p)) nth--;
+	    p++;
 	}
 	while (s < t) {
-	    unsigned long d = *s++;
-	    d = ~d | (d<<1);
-	    d &= NONASCII_MASK;
-	    d >>= 7;
-	    d += (d>>8);
-	    d += (d>>16);
-#if NONASCII_MASK == 0x8080808080808080UL
-	    d += (d>>32);
-#endif
-	    nth -= (long)(d&0xF);
-	    if (nth < 8) {
-		t = s;
-		break;
-	    }
+	    nth -= count_utf8_lead_bytes_with_ulong(s);
+	    if (nth < sizeof(long)) break;
+	    s++;
 	}
-	p = (char *)t;
+	p = (char *)s;
     }
     if (0 < nth) {
 	while (p < e) {
-	    if (((*p)&0xC0) != 0x80) {
+	    if (is_utf8_lead_byte(*p)) {
 		nth--;
-		if (nth < 0)
-		    break;
+		if (nth < 0) break;
 	    }
 	    p++;
 	}

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/

[前][次][番号順一覧][スレッド一覧]