ruby-changes:55374
From: nobu <ko1@a...>
Date: Wed, 17 Apr 2019 14:34:53 +0900 (JST)
Subject: [ruby-changes:55374] nobu:r67582 (trunk): string.c: improve splitting into chars
nobu 2019-04-17 14:34:46 +0900 (Wed, 17 Apr 2019)

  New Revision: 67582

  https://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=revision&revision=67582

  Log:
    string.c: improve splitting into chars

    * string.c (rb_str_split_m): improve splitting into chars by an
      empty string, without a regexp.

    Comparison:
      to_chars-1
        built-ruby: 1273527.6 i/s
        compare-ruby: 189423.3 i/s - 6.72x slower
      to_chars-10
        built-ruby: 120993.5 i/s
        compare-ruby: 37075.8 i/s - 3.26x slower
      to_chars-100
        built-ruby: 15646.4 i/s
        compare-ruby: 4012.1 i/s - 3.90x slower
      to_chars-1000
        built-ruby: 1295.1 i/s
        compare-ruby: 408.5 i/s - 3.17x slower

  Added files:
    trunk/benchmark/string_split.yml
  Modified files:
    trunk/string.c

Index: string.c
===================================================================
--- string.c (revision 67581)
+++ string.c (revision 67582)
@@ -7759,7 +7759,7 @@ rb_str_split_m(int argc, VALUE *argv, VA https://github.com/ruby/ruby/blob/trunk/string.c#L7759
     rb_encoding *enc;
     VALUE spat;
     VALUE limit;
-    enum {awk, string, regexp} split_type;
+    enum {awk, string, regexp, chars} split_type;
     long beg, end, i = 0, empty_count = -1;
     int lim = 0;
     VALUE result, tmp;
@@ -7801,8 +7801,7 @@ rb_str_split_m(int argc, VALUE *argv, VA https://github.com/ruby/ruby/blob/trunk/string.c#L7801
             split_type = string;
             if (RSTRING_LEN(spat) == 0) {
                 /* Special case - split into chars */
-                spat = rb_reg_regcomp(spat);
-                split_type = regexp;
+                split_type = chars;
             }
             else if (rb_enc_asciicompat(enc2) == 1) {
                 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') {
@@ -7823,9 +7822,9 @@ rb_str_split_m(int argc, VALUE *argv, VA https://github.com/ruby/ruby/blob/trunk/string.c#L7822
 
     if (result) result = rb_ary_new();
     beg = 0;
+    char *ptr = RSTRING_PTR(str);
+    char *eptr = RSTRING_END(str);
     if (split_type == awk) {
-        char *ptr = RSTRING_PTR(str);
-        char *eptr = RSTRING_END(str);
         char *bptr = ptr;
         int skip = 1;
         unsigned int c;
@@ -7884,10 +7883,8 @@ rb_str_split_m(int argc, VALUE *argv, VA https://github.com/ruby/ruby/blob/trunk/string.c#L7883
         }
     }
     else if (split_type == string) {
-        char *ptr = RSTRING_PTR(str);
         char *str_start = ptr;
         char *substr_start = ptr;
-        char *eptr = RSTRING_END(str);
         char *sptr = RSTRING_PTR(spat);
         long slen = RSTRING_LEN(spat);
 
@@ -7908,8 +7905,21 @@ rb_str_split_m(int argc, VALUE *argv, VA https://github.com/ruby/ruby/blob/trunk/string.c#L7905
         }
         beg = ptr - str_start;
     }
+    else if (split_type == chars) {
+        char *str_start = ptr;
+        int n;
+
+        mustnot_broken(str);
+        enc = rb_enc_get(str);
+        while (ptr < eptr &&
+               (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
+            SPLIT_STR(ptr - str_start, n);
+            ptr += n;
+            if (!NIL_P(limit) && lim <= ++i) break;
+        }
+        beg = ptr - str_start;
+    }
     else {
-        char *ptr = RSTRING_PTR(str);
         long len = RSTRING_LEN(str);
         long start = beg;
         long idx;
@@ -7924,14 +7934,14 @@ rb_str_split_m(int argc, VALUE *argv, VA https://github.com/ruby/ruby/blob/trunk/string.c#L7934
                     break;
                 }
                 else if (last_null == 1) {
-                    SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, ptr+len, enc));
+                    SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
                     beg = start;
                 }
                 else {
                     if (start == len)
                         start++;
                     else
-                        start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
+                        start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
                     last_null = 1;
                     continue;
                 }
Index: benchmark/string_split.yml
===================================================================
--- benchmark/string_split.yml (nonexistent)
+++ benchmark/string_split.yml (revision 67582)
@@ -0,0 +1,7 @@ https://github.com/ruby/ruby/blob/trunk/benchmark/string_split.yml#L1
+prelude: |
+  str0 = [*0..9].join("")
+benchmark:
+  to_chars-1: str0.split('')
+  to_chars-10: (str0 * 10).split('')
+  to_chars-100: (str0 * 100).split('')
+  to_chars-1000: (str0 * 1000).split('')

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/