[前][次][番号順一覧][スレッド一覧]

ruby-changes:33536

From: naruse <ko1@a...>
Date: Fri, 18 Apr 2014 15:42:56 +0900 (JST)
Subject: [ruby-changes:33536] naruse:r45617 (trunk): * string.c (enc_strlen): move UTF-8 optimization from str_strlen to

naruse	2014-04-18 15:42:51 +0900 (Fri, 18 Apr 2014)

  New Revision: 45617

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=rev&revision=45617

  Log:
    * string.c (enc_strlen): move UTF-8 optimization from str_strlen to
      enc_strlen.

  Modified files:
    trunk/ChangeLog
    trunk/string.c
Index: ChangeLog
===================================================================
--- ChangeLog	(revision 45616)
+++ ChangeLog	(revision 45617)
@@ -1,3 +1,8 @@ https://github.com/ruby/ruby/blob/trunk/ChangeLog#L1
+Fri Apr 18 14:21:21 2014  NARUSE, Yui  <naruse@r...>
+
+	* string.c (enc_strlen): move UTF-8 optimization from str_strlen to
+	  enc_strlen.
+
 Fri Apr 18 08:50:18 2014  Nobuyoshi Nakada  <nobu@r...>
 
 	* configure.in (rb_cv_getcwd_malloc): check if getcwd allocates
Index: string.c
===================================================================
--- string.c	(revision 45616)
+++ string.c	(revision 45617)
@@ -1075,6 +1075,41 @@ rb_str_init(int argc, VALUE *argv, VALUE https://github.com/ruby/ruby/blob/trunk/string.c#L1075
     return str;
 }
 
+#ifdef NONASCII_MASK
+#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
+
+/*
+ * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
+ * bit representation. (see http://en.wikipedia.org/wiki/UTF-8)
+ * Therefore, following pseudo code can detect UTF-8 leading byte.
+ *
+ * if (!(byte & 0x80))
+ *   byte |= 0x40;          // turn on bit6
+ * return ((byte>>6) & 1);  // bit6 represent it's leading byte or not.
+ *
+ * This function calculate every bytes in the argument word `s'
+ * using the above logic concurrently. and gather every bytes result.
+ */
+static inline VALUE
+count_utf8_lead_bytes_with_word(const VALUE *s)
+{
+    VALUE d = *s;
+
+    /* Transform into bit0 represent UTF-8 leading or not. */
+    d |= ~(d>>1);
+    d >>= 6;
+    d &= NONASCII_MASK >> 7;
+
+    /* Gather every bytes. */
+    d += (d>>8);
+    d += (d>>16);
+#if SIZEOF_VALUE == 8
+    d += (d>>32);
+#endif
+    return (d&0xF);
+}
+#endif
+
 static inline long
 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
 {
@@ -1084,6 +1119,31 @@ enc_strlen(const char *p, const char *e, https://github.com/ruby/ruby/blob/trunk/string.c#L1119
     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
     }
+#ifdef NONASCII_MASK
+    else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
+	VALUE len = 0;
+	if ((int)sizeof(VALUE) * 2 < e - p) {
+	    const VALUE *s, *t;
+	    const VALUE lowbits = sizeof(VALUE) - 1;
+	    s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
+	    t = (const VALUE*)(~lowbits & (VALUE)e);
+	    while (p < (const char *)s) {
+		if (is_utf8_lead_byte(*p)) len++;
+		p++;
+	    }
+	    while (s < t) {
+		len += count_utf8_lead_bytes_with_word(s);
+		s++;
+	    }
+	    p = (const char *)s;
+	}
+	while (p < e) {
+	    if (is_utf8_lead_byte(*p)) len++;
+	    p++;
+	}
+	return (long)len;
+    }
+#endif
     else if (rb_enc_asciicompat(enc)) {
         c = 0;
 	if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
@@ -1183,41 +1243,7 @@ rb_enc_strlen_cr(const char *p, const ch https://github.com/ruby/ruby/blob/trunk/string.c#L1243
     return c;
 }
 
-#ifdef NONASCII_MASK
-#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
-
-/*
- * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
- * bit representation. (see http://en.wikipedia.org/wiki/UTF-8)
- * Therefore, following pseudo code can detect UTF-8 leading byte.
- *
- * if (!(byte & 0x80))
- *   byte |= 0x40;          // turn on bit6
- * return ((byte>>6) & 1);  // bit6 represent it's leading byte or not.
- *
- * This function calculate every bytes in the argument word `s'
- * using the above logic concurrently. and gather every bytes result.
- */
-static inline VALUE
-count_utf8_lead_bytes_with_word(const VALUE *s)
-{
-    VALUE d = *s;
-
-    /* Transform into bit0 represent UTF-8 leading or not. */
-    d |= ~(d>>1);
-    d >>= 6;
-    d &= NONASCII_MASK >> 7;
-
-    /* Gather every bytes. */
-    d += (d>>8);
-    d += (d>>16);
-#if SIZEOF_VALUE == 8
-    d += (d>>32);
-#endif
-    return (d&0xF);
-}
-#endif
-
+/* enc must be compatible with str's enc */
 static long
 str_strlen(VALUE str, rb_encoding *enc)
 {
@@ -1230,33 +1256,7 @@ str_strlen(VALUE str, rb_encoding *enc) https://github.com/ruby/ruby/blob/trunk/string.c#L1256
     p = RSTRING_PTR(str);
     e = RSTRING_END(str);
     cr = ENC_CODERANGE(str);
-#ifdef NONASCII_MASK
-    if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
-        enc == rb_utf8_encoding()) {
 
-	VALUE len = 0;
-	if ((int)sizeof(VALUE) * 2 < e - p) {
-	    const VALUE *s, *t;
-	    const VALUE lowbits = sizeof(VALUE) - 1;
-	    s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
-	    t = (const VALUE*)(~lowbits & (VALUE)e);
-	    while (p < (const char *)s) {
-		if (is_utf8_lead_byte(*p)) len++;
-		p++;
-	    }
-	    while (s < t) {
-		len += count_utf8_lead_bytes_with_word(s);
-		s++;
-	    }
-	    p = (const char *)s;
-	}
-	while (p < e) {
-	    if (is_utf8_lead_byte(*p)) len++;
-	    p++;
-	}
-	return (long)len;
-    }
-#endif
     n = rb_enc_strlen_cr(p, e, enc, &cr);
     if (cr) {
         ENC_CODERANGE_SET(str, cr);

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/

[前][次][番号順一覧][スレッド一覧]