ruby-changes:56007

https://git.ruby-lang.org/ruby.git/commit/?id=8d81e59aa7

From 8d81e59aa7a62652caf85f9c8db371703668c149 Mon Sep 17 00:00:00 2001
From: Takashi Kokubun <takashikkbn@g...>
Date: Tue, 4 Jun 2019 19:58:39 +0900
Subject: Optimize CGI.escapeHTML by reducing buffer extension

and switch-case branches.

Buffer allocation optimization using `ALLOCA_N` is the main
benefit of this patch. It eliminates the O(N) buffer extensions.

It also reduces the number of branches by using an escape table, as in
https://mattn.kaoriya.net/software/lang/c/20160817011915.htm.

Closes: https://github.com/ruby/ruby/pull/2226

Co-authored-by: Nobuyoshi Nakada <nobu@r...>
Co-authored-by: Yasuhiro MATSUMOTO <mattn.jp@g...>

diff --git a/benchmark/cgi_escape_html.yml b/benchmark/cgi_escape_html.yml
new file mode 100644
index 0000000..af6abd0
--- /dev/null
+++ b/benchmark/cgi_escape_html.yml
@@ -0,0 +1,40 @@ https://github.com/ruby/ruby/blob/trunk/benchmark/cgi_escape_html.yml#L1
+prelude: require 'cgi/escape'
+benchmark:
+  - name: escape_html_blank
+    prelude: str = ""
+    script: CGI.escapeHTML(str)
+    loop_count: 20000000
+  - name: escape_html_short_none
+    prelude: str = "abcde"
+    script: CGI.escapeHTML(str)
+    loop_count: 20000000
+  - name: escape_html_short_one
+    prelude: str = "abcd<"
+    script: CGI.escapeHTML(str)
+    loop_count: 20000000
+  - name: escape_html_short_all
+    prelude: str = "'&\"<>"
+    script: CGI.escapeHTML(str)
+    loop_count: 5000000
+  - name: escape_html_long_none
+    prelude: str = "abcde" * 300
+    script: CGI.escapeHTML(str)
+    loop_count: 1000000
+  - name: escape_html_long_all
+    prelude: str = "'&\"<>" * 10
+    script: CGI.escapeHTML(str)
+    loop_count: 1000000
+  - name: escape_html_real
+    prelude: | # http://example.com/
+      str = <<~HTML
+        <body>
+        <div>
+            <h1>Example Domain</h1>
+            <p>This domain is established to be used for illustrative examples in documents. You may use this
+            domain in examples without prior coordination or asking for permission.</p>
+            <p><a href="http://www.iana.org/domains/example">More information...</a></p>
+        </div>
+        </body>
+      HTML
+    script: CGI.escapeHTML(str)
+    loop_count: 1000000
diff --git a/ext/cgi/escape/escape.c b/ext/cgi/escape/escape.c
index ced1b18..9b64c35 100644
--- a/ext/cgi/escape/escape.c
+++ b/ext/cgi/escape/escape.c
@@ -11,27 +11,20 @@ RUBY_EXTERN const signed char ruby_digit36_to_number_table[]; https://github.com/ruby/ruby/blob/trunk/ext/cgi/escape/escape.c#L11
 static VALUE rb_cCGI, rb_mUtil, rb_mEscape;
 static ID id_accept_charset;
 
-static void
-html_escaped_cat(VALUE str, char c)
-{
-    switch (c) {
-      case '\'':
-	rb_str_cat_cstr(str, "&#39;");
-	break;
-      case '&':
-	rb_str_cat_cstr(str, "&amp;");
-	break;
-      case '"':
-	rb_str_cat_cstr(str, "&quot;");
-	break;
-      case '<':
-	rb_str_cat_cstr(str, "&lt;");
-	break;
-      case '>':
-	rb_str_cat_cstr(str, "&gt;");
-	break;
-    }
-}
+#define HTML_ESCAPE_MAX_LEN 6
+
+static const struct {
+    uint8_t len;
+    char str[HTML_ESCAPE_MAX_LEN+1];
+} html_escape_table[UCHAR_MAX+1] = {
+#define HTML_ESCAPE(c, str) [c] = {rb_strlen_lit(str), str}
+    HTML_ESCAPE('\'', "&#39;"),
+    HTML_ESCAPE('&', "&amp;"),
+    HTML_ESCAPE('"', "&quot;"),
+    HTML_ESCAPE('<', "&lt;"),
+    HTML_ESCAPE('>', "&gt;"),
+#undef HTML_ESCAPE
+};
 
 static inline void
 preserve_original_state(VALUE orig, VALUE dest)
@@ -44,36 +37,27 @@ preserve_original_state(VALUE orig, VALUE dest) https://github.com/ruby/ruby/blob/trunk/ext/cgi/escape/escape.c#L37
 static VALUE
 optimized_escape_html(VALUE str)
 {
-    long i, len, beg = 0;
-    VALUE dest = 0;
-    const char *cstr;
-
-    len  = RSTRING_LEN(str);
-    cstr = RSTRING_PTR(str);
-
-    for (i = 0; i < len; i++) {
-	switch (cstr[i]) {
-	  case '\'':
-	  case '&':
-	  case '"':
-	  case '<':
-	  case '>':
-	    if (!dest) {
-		dest = rb_str_buf_new(len);
-	    }
-
-	    rb_str_cat(dest, cstr + beg, i - beg);
-	    beg = i + 1;
-
-	    html_escaped_cat(dest, cstr[i]);
-	    break;
-	}
+    const char *cstr = RSTRING_PTR(str);
+    const char *end = cstr + RSTRING_LEN(str);
+    char *buf = ALLOCA_N(char, RSTRING_LEN(str) * HTML_ESCAPE_MAX_LEN);
+
+    char *dest = buf;
+    while (cstr < end) {
+        const unsigned char c = *cstr++;
+        uint8_t len = html_escape_table[c].len;
+        if (len) {
+            memcpy(dest, html_escape_table[c].str, len);
+            dest += len;
+        }
+        else {
+            *dest++ = c;
+        }
     }
 
-    if (dest) {
-	rb_str_cat(dest, cstr + beg, len - beg);
-	preserve_original_state(str, dest);
-	return dest;
+    if (RSTRING_LEN(str) < (dest - buf)) {
+        VALUE escaped = rb_str_new(buf, dest - buf);
+        preserve_original_state(str, escaped);
+        return escaped;
     }
     else {
 	return rb_str_dup(str);
-- 
cgit v0.10.2


--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/