ruby-changes:56007
From: Takashi <ko1@a...>
Date: Wed, 5 Jun 2019 10:13:28 +0900 (JST)
Subject: [ruby-changes:56007] Takashi Kokubun: 8d81e59aa7 (trunk): Optimize CGI.escapeHTML by reducing buffer extension
https://git.ruby-lang.org/ruby.git/commit/?id=8d81e59aa7 From 8d81e59aa7a62652caf85f9c8db371703668c149 Mon Sep 17 00:00:00 2001 From: Takashi Kokubun <takashikkbn@g...> Date: Tue, 4 Jun 2019 19:58:39 +0900 Subject: Optimize CGI.escapeHTML by reducing buffer extension and switch-case branches. Buffer allocation optimization using `ALLOCA_N` would be the main benefit of patch. It eliminates the O(N) buffer extensions. It also reduces the number of branches using escape table like https://mattn.kaoriya.net/software/lang/c/20160817011915.htm. Closes: https://github.com/ruby/ruby/pull/2226 Co-authored-by: Nobuyoshi Nakada <nobu@r...> Co-authored-by: Yasuhiro MATSUMOTO <mattn.jp@g...> diff --git a/benchmark/cgi_escape_html.yml b/benchmark/cgi_escape_html.yml new file mode 100644 index 0000000..af6abd0 --- /dev/null +++ b/benchmark/cgi_escape_html.yml @@ -0,0 +1,40 @@ https://github.com/ruby/ruby/blob/trunk/benchmark/cgi_escape_html.yml#L1 +prelude: require 'cgi/escape' +benchmark: + - name: escape_html_blank + prelude: str = "" + script: CGI.escapeHTML(str) + loop_count: 20000000 + - name: escape_html_short_none + prelude: str = "abcde" + script: CGI.escapeHTML(str) + loop_count: 20000000 + - name: escape_html_short_one + prelude: str = "abcd<" + script: CGI.escapeHTML(str) + loop_count: 20000000 + - name: escape_html_short_all + prelude: str = "'&\"<>" + script: CGI.escapeHTML(str) + loop_count: 5000000 + - name: escape_html_long_none + prelude: str = "abcde" * 300 + script: CGI.escapeHTML(str) + loop_count: 1000000 + - name: escape_html_long_all + prelude: str = "'&\"<>" * 10 + script: CGI.escapeHTML(str) + loop_count: 1000000 + - name: escape_html_real + prelude: | # http://example.com/ + str = <<~HTML + <body> + <div> + <h1>Example Domain</h1> + <p>This domain is established to be used for illustrative examples in documents. 
You may use this + domain in examples without prior coordination or asking for permission.</p> + <p><a href="http://www.iana.org/domains/example">More information...</a></p> + </div> + </body> + HTML + script: CGI.escapeHTML(str) + loop_count: 1000000 diff --git a/ext/cgi/escape/escape.c b/ext/cgi/escape/escape.c index ced1b18..9b64c35 100644 --- a/ext/cgi/escape/escape.c +++ b/ext/cgi/escape/escape.c @@ -11,27 +11,20 @@ RUBY_EXTERN const signed char ruby_digit36_to_number_table[]; https://github.com/ruby/ruby/blob/trunk/ext/cgi/escape/escape.c#L11 static VALUE rb_cCGI, rb_mUtil, rb_mEscape; static ID id_accept_charset; -static void -html_escaped_cat(VALUE str, char c) -{ - switch (c) { - case '\'': - rb_str_cat_cstr(str, "&#39;"); - break; - case '&': - rb_str_cat_cstr(str, "&amp;"); - break; - case '"': - rb_str_cat_cstr(str, "&quot;"); - break; - case '<': - rb_str_cat_cstr(str, "&lt;"); - break; - case '>': - rb_str_cat_cstr(str, "&gt;"); - break; - } -} +#define HTML_ESCAPE_MAX_LEN 6 + +static const struct { + uint8_t len; + char str[HTML_ESCAPE_MAX_LEN+1]; +} html_escape_table[UCHAR_MAX+1] = { +#define HTML_ESCAPE(c, str) [c] = {rb_strlen_lit(str), str} + HTML_ESCAPE('\'', "&#39;"), + HTML_ESCAPE('&', "&amp;"), + HTML_ESCAPE('"', "&quot;"), + HTML_ESCAPE('<', "&lt;"), + HTML_ESCAPE('>', "&gt;"), +#undef HTML_ESCAPE +}; static inline void preserve_original_state(VALUE orig, VALUE dest) @@ -44,36 +37,27 @@ preserve_original_state(VALUE orig, VALUE dest) https://github.com/ruby/ruby/blob/trunk/ext/cgi/escape/escape.c#L37 static VALUE optimized_escape_html(VALUE str) { - long i, len, beg = 0; - VALUE dest = 0; - const char *cstr; - - len = RSTRING_LEN(str); - cstr = RSTRING_PTR(str); - - for (i = 0; i < len; i++) { - switch (cstr[i]) { - case '\'': - case '&': - case '"': - case '<': - case '>': - if (!dest) { - dest = rb_str_buf_new(len); - } - - rb_str_cat(dest, cstr + beg, i - beg); - beg = i + 1; - - html_escaped_cat(dest, cstr[i]); - break; - } + const char *cstr = RSTRING_PTR(str); + const char 
*end = cstr + RSTRING_LEN(str); + char *buf = ALLOCA_N(char, RSTRING_LEN(str) * HTML_ESCAPE_MAX_LEN); + + char *dest = buf; + while (cstr < end) { + const unsigned char c = *cstr++; + uint8_t len = html_escape_table[c].len; + if (len) { + memcpy(dest, html_escape_table[c].str, len); + dest += len; + } + else { + *dest++ = c; + } } - if (dest) { - rb_str_cat(dest, cstr + beg, len - beg); - preserve_original_state(str, dest); - return dest; + if (RSTRING_LEN(str) < (dest - buf)) { + VALUE escaped = rb_str_new(buf, dest - buf); + preserve_original_state(str, escaped); + return escaped; } else { return rb_str_dup(str); -- cgit v0.10.2 -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml/