[前][次][番号順一覧][スレッド一覧]

ruby-changes:55374

From: nobu <ko1@a...>
Date: Wed, 17 Apr 2019 14:34:53 +0900 (JST)
Subject: [ruby-changes:55374] nobu:r67582 (trunk): string.c: improve splitting into chars

nobu	2019-04-17 14:34:46 +0900 (Wed, 17 Apr 2019)

  New Revision: 67582

  https://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=revision&revision=67582

  Log:
    string.c: improve splitting into chars
    
    * string.c (rb_str_split_m): speed up splitting into chars when
      the separator is an empty string, by not going through a regexp.
    
        Comparison:
                               to_chars-1
                  built-ruby:   1273527.6 i/s
                compare-ruby:    189423.3 i/s - 6.72x  slower
    
                              to_chars-10
                  built-ruby:    120993.5 i/s
                compare-ruby:     37075.8 i/s - 3.26x  slower
    
                             to_chars-100
                  built-ruby:     15646.4 i/s
                compare-ruby:      4012.1 i/s - 3.90x  slower
    
                            to_chars-1000
                  built-ruby:      1295.1 i/s
                compare-ruby:       408.5 i/s - 3.17x  slower

  Added files:
    trunk/benchmark/string_split.yml
  Modified files:
    trunk/string.c
Index: string.c
===================================================================
--- string.c	(revision 67581)
+++ string.c	(revision 67582)
@@ -7759,7 +7759,7 @@ rb_str_split_m(int argc, VALUE *argv, VA https://github.com/ruby/ruby/blob/trunk/string.c#L7759
     rb_encoding *enc;
     VALUE spat;
     VALUE limit;
-    enum {awk, string, regexp} split_type;
+    enum {awk, string, regexp, chars} split_type;
     long beg, end, i = 0, empty_count = -1;
     int lim = 0;
     VALUE result, tmp;
@@ -7801,8 +7801,7 @@ rb_str_split_m(int argc, VALUE *argv, VA https://github.com/ruby/ruby/blob/trunk/string.c#L7801
 	    split_type = string;
 	    if (RSTRING_LEN(spat) == 0) {
 		/* Special case - split into chars */
-		spat = rb_reg_regcomp(spat);
-		split_type = regexp;
+                split_type = chars;
 	    }
 	    else if (rb_enc_asciicompat(enc2) == 1) {
 		if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') {
@@ -7823,9 +7822,9 @@ rb_str_split_m(int argc, VALUE *argv, VA https://github.com/ruby/ruby/blob/trunk/string.c#L7822
 
     if (result) result = rb_ary_new();
     beg = 0;
+    char *ptr = RSTRING_PTR(str);
+    char *eptr = RSTRING_END(str);
     if (split_type == awk) {
-	char *ptr = RSTRING_PTR(str);
-	char *eptr = RSTRING_END(str);
 	char *bptr = ptr;
 	int skip = 1;
 	unsigned int c;
@@ -7884,10 +7883,8 @@ rb_str_split_m(int argc, VALUE *argv, VA https://github.com/ruby/ruby/blob/trunk/string.c#L7883
 	}
     }
     else if (split_type == string) {
-	char *ptr = RSTRING_PTR(str);
 	char *str_start = ptr;
 	char *substr_start = ptr;
-	char *eptr = RSTRING_END(str);
 	char *sptr = RSTRING_PTR(spat);
 	long slen = RSTRING_LEN(spat);
 
@@ -7908,8 +7905,21 @@ rb_str_split_m(int argc, VALUE *argv, VA https://github.com/ruby/ruby/blob/trunk/string.c#L7905
 	}
 	beg = ptr - str_start;
     }
+    else if (split_type == chars) {
+        char *str_start = ptr;
+        int n;
+
+        mustnot_broken(str);
+        enc = rb_enc_get(str);
+        while (ptr < eptr &&
+               (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
+            SPLIT_STR(ptr - str_start, n);
+            ptr += n;
+            if (!NIL_P(limit) && lim <= ++i) break;
+        }
+        beg = ptr - str_start;
+    }
     else {
-	char *ptr = RSTRING_PTR(str);
 	long len = RSTRING_LEN(str);
 	long start = beg;
 	long idx;
@@ -7924,14 +7934,14 @@ rb_str_split_m(int argc, VALUE *argv, VA https://github.com/ruby/ruby/blob/trunk/string.c#L7934
 		    break;
 		}
 		else if (last_null == 1) {
-		    SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, ptr+len, enc));
+                    SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
 		    beg = start;
 		}
 		else {
                     if (start == len)
                         start++;
                     else
-                        start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
+                        start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
 		    last_null = 1;
 		    continue;
 		}
Index: benchmark/string_split.yml
===================================================================
--- benchmark/string_split.yml	(nonexistent)
+++ benchmark/string_split.yml	(revision 67582)
@@ -0,0 +1,7 @@ https://github.com/ruby/ruby/blob/trunk/benchmark/string_split.yml#L1
+prelude: |
+  str0 = [*0..9].join("")
+benchmark:
+  to_chars-1: str0.split('')
+  to_chars-10: (str0 * 10).split('')
+  to_chars-100: (str0 * 100).split('')
+  to_chars-1000: (str0 * 1000).split('')

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/

[前][次][番号順一覧][スレッド一覧]