ruby-changes:61214
From: Nobuyoshi <ko1@a...>
Date: Tue, 12 May 2020 19:59:34 +0900 (JST)
Subject: [ruby-changes:61214] 693f7ab315 (master): Optimize String#split
https://git.ruby-lang.org/ruby.git/commit/?id=693f7ab315

From 693f7ab31578bf23d165f022b60da3a32055ceea Mon Sep 17 00:00:00 2001
From: Nobuyoshi Nakada <nobu@r...>
Date: Tue, 12 May 2020 15:50:15 +0900
Subject: Optimize String#split

Optimized `String#split` with `/ /` (single space regexp) as
simple string splitting. [ruby-core:98272]

|               |compare-ruby|built-ruby|
|:--------------|-----------:|---------:|
|re_space-1     |    432.786k|    1.539M|
|               |           -|     3.56x|
|re_space-10    |     76.231k|  191.547k|
|               |           -|     2.51x|
|re_space-100   |      8.152k|   19.557k|
|               |           -|     2.40x|
|re_space-1000  |     837.405|    2.022k|
|               |           -|     2.41x|

ruby-core:98272: https://bugs.ruby-lang.org/issues/15771#change-85511

diff --git a/benchmark/string_split.yml b/benchmark/string_split.yml
index 84ffe8f..ac6ed0d 100644
--- a/benchmark/string_split.yml
+++ b/benchmark/string_split.yml
@@ -1,7 +1,18 @@ https://github.com/ruby/ruby/blob/trunk/benchmark/string_split.yml#L1
 prelude: |
-  str0 = [*0..9].join("")
+  str1 = [*0..5].join(" ") + " "
+  str10 = str1 * 10
+  str100 = str10 * 10
+  str1000 = str100 * 10
 benchmark:
-  to_chars-1: str0.split('')
-  to_chars-10: (str0 * 10).split('')
-  to_chars-100: (str0 * 100).split('')
-  to_chars-1000: (str0 * 1000).split('')
+  to_chars-1: str1.split('')
+  to_chars-10: str10.split('')
+  to_chars-100: str100.split('')
+  to_chars-1000: str1000.split('')
+  to_words-1: str1.split(' ')
+  to_words-10: str10.split(' ')
+  to_words-100: str100.split(' ')
+  to_words-1000: str1000.split(' ')
+  re_space-1: str1.split(/ /)
+  re_space-10: str10.split(/ /)
+  re_space-100: str100.split(/ /)
+  re_space-1000: str1000.split(/ /)
diff --git a/string.c b/string.c
index f084f5e..792eb76 100644
--- a/string.c
+++ b/string.c
@@ -7926,6 +7926,35 @@ split_string(VALUE result, VALUE str, long beg, long len, long empty_count) https://github.com/ruby/ruby/blob/trunk/string.c#L7926
     return empty_count;
 }
 
+typedef enum {
+    SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
+} split_type_t;
+
+static split_type_t
+literal_split_pattern(VALUE spat, split_type_t default_type)
+{
+    rb_encoding *enc = STR_ENC_GET(spat);
+    const char *ptr;
+    long len;
+    RSTRING_GETMEM(spat, ptr, len);
+    if (len == 0) {
+        /* Special case - split into chars */
+        return SPLIT_TYPE_CHARS;
+    }
+    else if (rb_enc_asciicompat(enc)) {
+        if (len == 1 && ptr[0] == ' ') {
+            return SPLIT_TYPE_AWK;
+        }
+    }
+    else {
+        int l;
+        if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
+            return SPLIT_TYPE_AWK;
+        }
+    }
+    return default_type;
+}
+
 /*
  * call-seq:
  * str.split(pattern=nil, [limit]) -> an_array
@@ -7987,7 +8016,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L8016
     rb_encoding *enc;
     VALUE spat;
     VALUE limit;
-    enum {awk, string, regexp, chars} split_type;
+    split_type_t split_type;
     long beg, end, i = 0, empty_count = -1;
     int lim = 0;
     VALUE result, tmp;
@@ -8011,12 +8040,12 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L8040
     if (NIL_P(limit) && !lim) empty_count = 0;
 
     enc = STR_ENC_GET(str);
-    split_type = regexp;
+    split_type = SPLIT_TYPE_REGEXP;
     if (!NIL_P(spat)) {
         spat = get_pat_quoted(spat, 0);
     }
     else if (NIL_P(spat = rb_fs)) {
-        split_type = awk;
+        split_type = SPLIT_TYPE_AWK;
     }
     else if (!(spat = rb_fs_check(spat))) {
         rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
@@ -8024,28 +8053,25 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L8053
     else {
         rb_warn("$; is set to non-nil value");
     }
-    if (split_type != awk) {
-        if (BUILTIN_TYPE(spat) == T_STRING) {
-            rb_encoding *enc2 = STR_ENC_GET(spat);
+    if (split_type != SPLIT_TYPE_AWK) {
+        switch (BUILTIN_TYPE(spat)) {
+          case T_REGEXP:
+            rb_reg_options(spat); /* check if uninitialized */
+            tmp = RREGEXP_SRC(spat);
+            split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
+            if (split_type == SPLIT_TYPE_AWK) {
+                spat = tmp;
+                split_type = SPLIT_TYPE_STRING;
+            }
+            break;
 
+          case T_STRING:
             mustnot_broken(spat);
-            split_type = string;
-            if (RSTRING_LEN(spat) == 0) {
-                /* Special case - split into chars */
-                split_type = chars;
-            }
-            else if (rb_enc_asciicompat(enc2) == 1) {
-                if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') {
-                    split_type = awk;
-                }
-            }
-            else {
-                int l;
-                if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
-                    RSTRING_LEN(spat) == l) {
-                    split_type = awk;
-                }
-            }
+            split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
+            break;
+
+          default:
+            UNREACHABLE_RETURN(Qnil);
         }
     }
 
@@ -8055,7 +8081,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L8081
     beg = 0;
     char *ptr = RSTRING_PTR(str);
     char *eptr = RSTRING_END(str);
-    if (split_type == awk) {
+    if (split_type == SPLIT_TYPE_AWK) {
         char *bptr = ptr;
         int skip = 1;
         unsigned int c;
@@ -8113,7 +8139,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L8139
             }
         }
     }
-    else if (split_type == string) {
+    else if (split_type == SPLIT_TYPE_STRING) {
         char *str_start = ptr;
         char *substr_start = ptr;
         char *sptr = RSTRING_PTR(spat);
@@ -8136,7 +8162,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L8162
         }
         beg = ptr - str_start;
     }
-    else if (split_type == chars) {
+    else if (split_type == SPLIT_TYPE_CHARS) {
         char *str_start = ptr;
         int n;
 
--
cgit v0.10.2

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/