[前][次][番号順一覧][スレッド一覧]

ruby-changes:56022

From: Takashi <ko1@a...>
Date: Wed, 5 Jun 2019 21:14:06 +0900 (JST)
Subject: [ruby-changes:56022] Takashi Kokubun: 0a29dc87e6 (trunk): Optimize CGI.escapeHTML by reducing buffer extension

https://git.ruby-lang.org/ruby.git/commit/?id=0a29dc87e6

From 0a29dc87e62c701db56816cb430daf07a4f02bea Mon Sep 17 00:00:00 2001
From: Takashi Kokubun <takashikkbn@g...>
Date: Wed, 5 Jun 2019 19:28:51 +0900
Subject: Optimize CGI.escapeHTML by reducing buffer extension

and switch-case branches.

Buffer allocation optimization using `ALLOCV_N` is the main
benefit of this patch. It eliminates the O(N) buffer extensions
caused by repeated `rb_str_cat` calls.

It also reduces the number of branches by using an escape table, as in
https://mattn.kaoriya.net/software/lang/c/20160817011915.htm.

Closes: https://github.com/ruby/ruby/pull/2226

Co-authored-by: Nobuyoshi Nakada <nobu@r...>
Co-authored-by: Yasuhiro MATSUMOTO <mattn.jp@g...>

diff --git a/benchmark/cgi_escape_html.yml b/benchmark/cgi_escape_html.yml
new file mode 100644
index 0000000..af6abd0
--- /dev/null
+++ b/benchmark/cgi_escape_html.yml
@@ -0,0 +1,40 @@ https://github.com/ruby/ruby/blob/trunk/benchmark/cgi_escape_html.yml#L1
+prelude: require 'cgi/escape'
+benchmark:
+  - name: escape_html_blank
+    prelude: str = ""
+    script: CGI.escapeHTML(str)
+    loop_count: 20000000
+  - name: escape_html_short_none
+    prelude: str = "abcde"
+    script: CGI.escapeHTML(str)
+    loop_count: 20000000
+  - name: escape_html_short_one
+    prelude: str = "abcd<"
+    script: CGI.escapeHTML(str)
+    loop_count: 20000000
+  - name: escape_html_short_all
+    prelude: str = "'&\"<>"
+    script: CGI.escapeHTML(str)
+    loop_count: 5000000
+  - name: escape_html_long_none
+    prelude: str = "abcde" * 300
+    script: CGI.escapeHTML(str)
+    loop_count: 1000000
+  - name: escape_html_long_all
+    prelude: str = "'&\"<>" * 10
+    script: CGI.escapeHTML(str)
+    loop_count: 1000000
+  - name: escape_html_real
+    prelude: | # http://example.com/
+      str = <<~HTML
+        <body>
+        <div>
+            <h1>Example Domain</h1>
+            <p>This domain is established to be used for illustrative examples in documents. You may use this
+            domain in examples without prior coordination or asking for permission.</p>
+            <p><a href="http://www.iana.org/domains/example">More information...</a></p>
+        </div>
+        </body>
+      HTML
+    script: CGI.escapeHTML(str)
+    loop_count: 1000000
diff --git a/ext/cgi/escape/escape.c b/ext/cgi/escape/escape.c
index 78d196d..76d8f0d 100644
--- a/ext/cgi/escape/escape.c
+++ b/ext/cgi/escape/escape.c
@@ -11,27 +11,20 @@ RUBY_EXTERN const signed char ruby_digit36_to_number_table[]; https://github.com/ruby/ruby/blob/trunk/ext/cgi/escape/escape.c#L11
 static VALUE rb_cCGI, rb_mUtil, rb_mEscape;
 static ID id_accept_charset;
 
-static void
-html_escaped_cat(VALUE str, char c)
-{
-    switch (c) {
-      case '\'':
-        rb_str_cat_cstr(str, "&#39;");
-        break;
-      case '&':
-        rb_str_cat_cstr(str, "&amp;");
-        break;
-      case '"':
-        rb_str_cat_cstr(str, "&quot;");
-        break;
-      case '<':
-        rb_str_cat_cstr(str, "&lt;");
-        break;
-      case '>':
-        rb_str_cat_cstr(str, "&gt;");
-        break;
-    }
-}
+#define HTML_ESCAPE_MAX_LEN 6
+
+static const struct {
+    uint8_t len;
+    char str[HTML_ESCAPE_MAX_LEN+1];
+} html_escape_table[UCHAR_MAX+1] = {
+#define HTML_ESCAPE(c, str) [c] = {rb_strlen_lit(str), str}
+    HTML_ESCAPE('\'', "&#39;"),
+    HTML_ESCAPE('&', "&amp;"),
+    HTML_ESCAPE('"', "&quot;"),
+    HTML_ESCAPE('<', "&lt;"),
+    HTML_ESCAPE('>', "&gt;"),
+#undef HTML_ESCAPE
+};
 
 static inline void
 preserve_original_state(VALUE orig, VALUE dest)
@@ -44,40 +37,34 @@ preserve_original_state(VALUE orig, VALUE dest) https://github.com/ruby/ruby/blob/trunk/ext/cgi/escape/escape.c#L37
 static VALUE
 optimized_escape_html(VALUE str)
 {
-    long i, len, beg = 0;
-    VALUE dest = 0;
-    const char *cstr;
-
-    len  = RSTRING_LEN(str);
-    cstr = RSTRING_PTR(str);
-
-    for (i = 0; i < len; i++) {
-        switch (cstr[i]) {
-          case '\'':
-          case '&':
-          case '"':
-          case '<':
-          case '>':
-            if (!dest) {
-                dest = rb_str_buf_new(len);
-            }
-
-            rb_str_cat(dest, cstr + beg, i - beg);
-            beg = i + 1;
-
-            html_escaped_cat(dest, cstr[i]);
-            break;
+    VALUE vbuf;
+    char *buf = ALLOCV_N(char, vbuf, RSTRING_LEN(str) * HTML_ESCAPE_MAX_LEN);
+    const char *cstr = RSTRING_PTR(str);
+    const char *end = cstr + RSTRING_LEN(str);
+
+    char *dest = buf;
+    while (cstr < end) {
+        const unsigned char c = *cstr++;
+        uint8_t len = html_escape_table[c].len;
+        if (len) {
+            memcpy(dest, html_escape_table[c].str, len);
+            dest += len;
+        }
+        else {
+            *dest++ = c;
         }
     }
 
-    if (dest) {
-        rb_str_cat(dest, cstr + beg, len - beg);
-        preserve_original_state(str, dest);
-        return dest;
+    VALUE escaped;
+    if (RSTRING_LEN(str) < (dest - buf)) {
+        escaped = rb_str_new(buf, dest - buf);
+        preserve_original_state(str, escaped);
     }
     else {
-	return rb_str_dup(str);
+        escaped = rb_str_dup(str);
     }
+    ALLOCV_END(vbuf);
+    return escaped;
 }
 
 static VALUE
-- 
cgit v0.10.2


--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/

[前][次][番号順一覧][スレッド一覧]