ruby-changes:55652
From: Nobuyoshi <ko1@a...>
Date: Fri, 3 May 2019 23:59:58 +0900 (JST)
Subject: [ruby-changes:55652] Nobuyoshi Nakada: 77440e949b (trunk): Improve performance of case-conversion methods
https://git.ruby-lang.org/ruby.git/commit/?id=77440e949b From 77440e949bd69e6ed86d70026d238521adb8319a Mon Sep 17 00:00:00 2001 From: Nobuyoshi Nakada <nobu@r...> Date: Fri, 3 May 2019 22:37:51 +0900 Subject: Improve performance of case-conversion methods diff --git a/benchmark/string_capitalize.yml b/benchmark/string_capitalize.yml new file mode 100644 index 0000000..7d23fd3 --- /dev/null +++ b/benchmark/string_capitalize.yml @@ -0,0 +1,10 @@ https://github.com/ruby/ruby/blob/trunk/benchmark/string_capitalize.yml#L1 +prelude: | + str1 = [*"a".."m",*"N".."Z",*"0".."9"].join("") + str10 = str1 * 10 + str100 = str10 * 10 + str1000 = str100 * 10 +benchmark: + capitalize-1: str1.capitalize + capitalize-10: str10.capitalize + capitalize-100: str100.capitalize + capitalize-1000: str1000.capitalize diff --git a/benchmark/string_downcase.yml b/benchmark/string_downcase.yml new file mode 100644 index 0000000..a31c3ac --- /dev/null +++ b/benchmark/string_downcase.yml @@ -0,0 +1,10 @@ https://github.com/ruby/ruby/blob/trunk/benchmark/string_downcase.yml#L1 +prelude: | + str1 = [*"A".."Z",*"0".."9"].join("") + str10 = str1 * 10 + str100 = str10 * 10 + str1000 = str100 * 10 +benchmark: + downcase-1: str1.upcase + downcase-10: str10.upcase + downcase-100: str100.upcase + downcase-1000: str1000.upcase diff --git a/benchmark/string_swapcase.yml b/benchmark/string_swapcase.yml new file mode 100644 index 0000000..afaae3f --- /dev/null +++ b/benchmark/string_swapcase.yml @@ -0,0 +1,10 @@ https://github.com/ruby/ruby/blob/trunk/benchmark/string_swapcase.yml#L1 +prelude: | + str1 = [*"A".."M",*"n".."z",*"0".."9"].join("") + str10 = str1 * 10 + str100 = str10 * 10 + str1000 = str100 * 10 +benchmark: + swapcase-1: str1.swapcase + swapcase-10: str10.swapcase + swapcase-100: str100.swapcase + swapcase-1000: str1000.swapcase diff --git a/benchmark/string_upcase.yml b/benchmark/string_upcase.yml new file mode 100644 index 0000000..456d213 --- /dev/null +++ b/benchmark/string_upcase.yml @@ -0,0 +1,10 @@ https://github.com/ruby/ruby/blob/trunk/benchmark/string_upcase.yml#L1 +prelude: | + str1 = [*"a".."z",*"0".."9"].join("") + str10 = str1 * 10 + str100 = str10 * 10 + str1000 = str100 * 10 +benchmark: + upcase-1: str1.upcase + upcase-10: str10.upcase + upcase-100: str100.upcase + upcase-1000: str1000.upcase diff --git a/string.c b/string.c index 2febe52..76d8c56 100644 --- a/string.c +++ b/string.c @@ -6408,6 +6408,14 @@ rb_str_check_dummy_enc(rb_encoding *enc) https://github.com/ruby/ruby/blob/trunk/string.c#L6408 } } +static rb_encoding * +str_true_enc(VALUE str) +{ + rb_encoding *enc = STR_ENC_GET(str); + rb_str_check_dummy_enc(enc); + return enc; +} + static OnigCaseFoldType check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags) { @@ -6448,6 +6456,14 @@ check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags) https://github.com/ruby/ruby/blob/trunk/string.c#L6456 return flags; } +static inline bool +case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str) +{ + if ((flags & ONIGENC_CASE_ASCII_ONLY) && (rb_enc_mbmaxlen(enc) == 1)) + return true; + return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT; +} + /* 16 should be long enough to absorb any kind of single character length increase */ #define CASE_MAPPING_ADDITIONAL_LENGTH 20 #ifndef CASEMAP_DEBUG @@ -6484,7 +6500,7 @@ rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc) https://github.com/ruby/ruby/blob/trunk/string.c#L6500 { VALUE target; - OnigUChar *source_current, *source_end; + const OnigUChar *source_current, *source_end; int target_length = 0; VALUE buffer_anchor; mapping_buffer *current_buffer = 0; @@ -6554,21 +6570,30 @@ rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc) https://github.com/ruby/ruby/blob/trunk/string.c#L6570 return target; } -static void -rb_str_ascii_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc) +static VALUE +rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc) { - OnigUChar *source_current, *source_end; + const OnigUChar *source_current, *source_end; + OnigUChar *target_current, *target_end; long old_length = RSTRING_LEN(source); int length_or_invalid; - if (old_length == 0) return; + if (old_length == 0) return Qnil; source_current = (OnigUChar*)RSTRING_PTR(source); source_end = (OnigUChar*)RSTRING_END(source); + if (source == target) { + target_current = (OnigUChar*)source_current; + target_end = (OnigUChar*)source_end; + } + else { + target_current = (OnigUChar*)RSTRING_PTR(target); + target_end = (OnigUChar*)RSTRING_END(target); + } length_or_invalid = onigenc_ascii_only_case_map(flags, - (const OnigUChar**)&source_current, source_end, - source_current, source_end, enc); + &source_current, source_end, + target_current, target_end, enc); if (length_or_invalid < 0) rb_raise(rb_eArgError, "input string invalid"); if (CASEMAP_DEBUG && length_or_invalid != old_length) { @@ -6577,6 +6602,29 @@ rb_str_ascii_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc) https://github.com/ruby/ruby/blob/trunk/string.c#L6602 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap" "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid); } + + OBJ_INFECT_RAW(target, source); + str_enc_copy(target, source); + + return target; +} + +static bool +upcase_single(VALUE str) +{ + char *s = RSTRING_PTR(str), *send = RSTRING_END(str); + bool modified = false; + + while (s < send) { + unsigned int c = *(unsigned char*)s; + + if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') { + *s = 'A' + (c - 'a'); + modified = true; + } + s++; + } + return modified; } /* @@ -6598,24 +6646,13 @@ rb_str_upcase_bang(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L6646 flags = check_case_options(argc, argv, flags); str_modify_keep_cr(str); - enc = STR_ENC_GET(str); - rb_str_check_dummy_enc(enc); - if (((flags&ONIGENC_CASE_ASCII_ONLY) && (rb_enc_mbmaxlen(enc)==1)) - || (!(flags&ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str)==ENC_CODERANGE_7BIT)) { - char *s = RSTRING_PTR(str), *send = RSTRING_END(str); - - while (s < send) { - unsigned int c = *(unsigned char*)s; - - if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') { - *s = 'A' + (c - 'a'); - flags |= ONIGENC_CASE_MODIFIED; - } - s++; - } + enc = str_true_enc(str); + if (case_option_single_p(flags, enc, str)) { + if (upcase_single(str)) + flags |= ONIGENC_CASE_MODIFIED; } else if (flags&ONIGENC_CASE_ASCII_ONLY) - rb_str_ascii_casemap(str, &flags, enc); + rb_str_ascii_casemap(str, str, &flags, enc); else str_shared_replace(str, rb_str_casemap(str, &flags, enc)); @@ -6640,9 +6677,46 @@ rb_str_upcase_bang(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L6677 static VALUE rb_str_upcase(int argc, VALUE *argv, VALUE str) { - str = rb_str_dup(str); - rb_str_upcase_bang(argc, argv, str); - return str; + rb_encoding *enc; + OnigCaseFoldType flags = ONIGENC_CASE_UPCASE; + VALUE ret; + + flags = check_case_options(argc, argv, flags); + enc = str_true_enc(str); + if (case_option_single_p(flags, enc, str)) { + ret = rb_str_new_with_class(str, RSTRING_PTR(str), RSTRING_LEN(str)); + OBJ_INFECT_RAW(ret, str); + str_enc_copy(ret, str); + upcase_single(ret); + } + else if (flags&ONIGENC_CASE_ASCII_ONLY) { + ret = rb_str_new_with_class(str, 0, RSTRING_LEN(str)); + rb_str_ascii_casemap(str, ret, &flags, enc); + } + else { + ret = rb_str_casemap(str, &flags, enc); + } + + return ret; +} + +static bool +downcase_single(VALUE str) +{ + char *s = RSTRING_PTR(str), *send = RSTRING_END(str); + bool modified = false; + + while (s < send) { + unsigned int c = *(unsigned char*)s; + + if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') { + *s = 'a' + (c - 'A'); + modified = true; + } + s++; + } + + return modified; } /* @@ -6664,24 +6738,13 @@ rb_str_downcase_bang(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L6738 flags = check_case_options(argc, argv, flags); str_modify_keep_cr(str); - enc = STR_ENC_GET(str); - rb_str_check_dummy_enc(enc); - if (((flags&ONIGENC_CASE_ASCII_ONLY) && (rb_enc_mbmaxlen(enc)==1)) - || (!(flags&ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str)==ENC_CODERANGE_7BIT)) { - char *s = RSTRING_PTR(str), *send = RSTRING_END(str); - - while (s < send) { - unsigned int c = *(unsigned char*)s; - - if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') { - *s = 'a' + (c - 'A'); - flags |= ONIGENC_CASE_MODIFIED; - } - s++; - } + enc = str_true_enc(str); + if (case_option_single_p(flags, enc, str)) { + if (downcase_single(str)) + flags |= ONIGENC_CASE_MODIFIED; } else if (flags&ONIGENC_CASE_ASCII_ONLY) - rb_str_ascii_casemap(str, &flags, enc); + rb_str_ascii_casemap(str, str, &flags, enc); else str_shared_replace(str, rb_str_casemap(str, &flags, enc)); @@ -6743,9 +6806,27 @@ rb_str_downcase_bang(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L6806 static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str) { - str = rb_str_dup(str); - rb_str_downcase_bang(argc, argv, str); - return str; + rb_encoding *enc; + OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE; + VALUE ret; + + flags = check_case_options(argc, argv, flag (... truncated) -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml/