ruby-changes:61214

https://git.ruby-lang.org/ruby.git/commit/?id=693f7ab315

From 693f7ab31578bf23d165f022b60da3a32055ceea Mon Sep 17 00:00:00 2001
From: Nobuyoshi Nakada <nobu@r...>
Date: Tue, 12 May 2020 15:50:15 +0900
Subject: Optimize String#split

Optimized `String#split` when the pattern is `/ /` (a regexp consisting of
a single space) by treating it as a simple string split.  [ruby-core:98272]

|               |compare-ruby|built-ruby|
|:--------------|-----------:|---------:|
|re_space-1     |    432.786k|    1.539M|
|               |           -|     3.56x|
|re_space-10    |     76.231k|  191.547k|
|               |           -|     2.51x|
|re_space-100   |      8.152k|   19.557k|
|               |           -|     2.40x|
|re_space-1000  |     837.405|    2.022k|
|               |           -|     2.41x|

ruby-core:98272: https://bugs.ruby-lang.org/issues/15771#change-85511

diff --git a/benchmark/string_split.yml b/benchmark/string_split.yml
index 84ffe8f..ac6ed0d 100644
--- a/benchmark/string_split.yml
+++ b/benchmark/string_split.yml
@@ -1,7 +1,18 @@ https://github.com/ruby/ruby/blob/trunk/benchmark/string_split.yml#L1
 prelude: |
-  str0 = [*0..9].join("")
+  str1 = [*0..5].join(" ") + " "
+  str10 = str1 * 10
+  str100 = str10 * 10
+  str1000 = str100 * 10
 benchmark:
-  to_chars-1: str0.split('')
-  to_chars-10: (str0 * 10).split('')
-  to_chars-100: (str0 * 100).split('')
-  to_chars-1000: (str0 * 1000).split('')
+  to_chars-1: str1.split('')
+  to_chars-10: str10.split('')
+  to_chars-100: str100.split('')
+  to_chars-1000: str1000.split('')
+  to_words-1: str1.split(' ')
+  to_words-10: str10.split(' ')
+  to_words-100: str100.split(' ')
+  to_words-1000: str1000.split(' ')
+  re_space-1: str1.split(/ /)
+  re_space-10: str10.split(/ /)
+  re_space-100: str100.split(/ /)
+  re_space-1000: str1000.split(/ /)
diff --git a/string.c b/string.c
index f084f5e..792eb76 100644
--- a/string.c
+++ b/string.c
@@ -7926,6 +7926,35 @@ split_string(VALUE result, VALUE str, long beg, long len, long empty_count) https://github.com/ruby/ruby/blob/trunk/string.c#L7926
     return empty_count;
 }
 
+typedef enum {
+    SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
+} split_type_t;
+
+static split_type_t
+literal_split_pattern(VALUE spat, split_type_t default_type)
+{
+    rb_encoding *enc = STR_ENC_GET(spat);
+    const char *ptr;
+    long len;
+    RSTRING_GETMEM(spat, ptr, len);
+    if (len == 0) {
+        /* Special case - split into chars */
+        return SPLIT_TYPE_CHARS;
+    }
+    else if (rb_enc_asciicompat(enc)) {
+        if (len == 1 && ptr[0] == ' ') {
+            return SPLIT_TYPE_AWK;
+        }
+    }
+    else {
+        int l;
+        if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
+            return SPLIT_TYPE_AWK;
+        }
+    }
+    return default_type;
+}
+
 /*
  *  call-seq:
  *     str.split(pattern=nil, [limit])                -> an_array
@@ -7987,7 +8016,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L8016
     rb_encoding *enc;
     VALUE spat;
     VALUE limit;
-    enum {awk, string, regexp, chars} split_type;
+    split_type_t split_type;
     long beg, end, i = 0, empty_count = -1;
     int lim = 0;
     VALUE result, tmp;
@@ -8011,12 +8040,12 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L8040
     if (NIL_P(limit) && !lim) empty_count = 0;
 
     enc = STR_ENC_GET(str);
-    split_type = regexp;
+    split_type = SPLIT_TYPE_REGEXP;
     if (!NIL_P(spat)) {
 	spat = get_pat_quoted(spat, 0);
     }
     else if (NIL_P(spat = rb_fs)) {
-	split_type = awk;
+	split_type = SPLIT_TYPE_AWK;
     }
     else if (!(spat = rb_fs_check(spat))) {
 	rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
@@ -8024,28 +8053,25 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L8053
     else {
         rb_warn("$; is set to non-nil value");
     }
-    if (split_type != awk) {
-	if (BUILTIN_TYPE(spat) == T_STRING) {
-	    rb_encoding *enc2 = STR_ENC_GET(spat);
+    if (split_type != SPLIT_TYPE_AWK) {
+        switch (BUILTIN_TYPE(spat)) {
+          case T_REGEXP:
+            rb_reg_options(spat); /* check if uninitialized */
+            tmp = RREGEXP_SRC(spat);
+            split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
+            if (split_type == SPLIT_TYPE_AWK) {
+                spat = tmp;
+                split_type = SPLIT_TYPE_STRING;
+            }
+            break;
 
+          case T_STRING:
 	    mustnot_broken(spat);
-	    split_type = string;
-	    if (RSTRING_LEN(spat) == 0) {
-		/* Special case - split into chars */
-                split_type = chars;
-	    }
-	    else if (rb_enc_asciicompat(enc2) == 1) {
-		if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') {
-		    split_type = awk;
-		}
-	    }
-	    else {
-		int l;
-		if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
-		    RSTRING_LEN(spat) == l) {
-		    split_type = awk;
-		}
-	    }
+            split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
+            break;
+
+          default:
+            UNREACHABLE_RETURN(Qnil);
 	}
     }
 
@@ -8055,7 +8081,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L8081
     beg = 0;
     char *ptr = RSTRING_PTR(str);
     char *eptr = RSTRING_END(str);
-    if (split_type == awk) {
+    if (split_type == SPLIT_TYPE_AWK) {
 	char *bptr = ptr;
 	int skip = 1;
 	unsigned int c;
@@ -8113,7 +8139,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L8139
 	    }
 	}
     }
-    else if (split_type == string) {
+    else if (split_type == SPLIT_TYPE_STRING) {
 	char *str_start = ptr;
 	char *substr_start = ptr;
 	char *sptr = RSTRING_PTR(spat);
@@ -8136,7 +8162,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L8162
 	}
 	beg = ptr - str_start;
     }
-    else if (split_type == chars) {
+    else if (split_type == SPLIT_TYPE_CHARS) {
         char *str_start = ptr;
         int n;
 
-- 
cgit v0.10.2


--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/