ruby-changes:41810
From: nobu <ko1@a...>
Date: Sun, 21 Feb 2016 13:57:02 +0900 (JST)
Subject: [ruby-changes:41810] nobu:r53884 (trunk): cgi/escape: Optimize CGI.unescapeHTML
nobu 2016-02-21 13:57:35 +0900 (Sun, 21 Feb 2016) New Revision: 53884 https://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=revision&revision=53884 Log: cgi/escape: Optimize CGI.unescapeHTML * cgi/escape/escape.c: Optimize CGI.unescapeHTML performance by C ext for ASCII-compatible encodings. [Fix GH-1242] Modified files: trunk/ChangeLog trunk/ext/cgi/escape/escape.c Index: ext/cgi/escape/escape.c =================================================================== --- ext/cgi/escape/escape.c (revision 53883) +++ ext/cgi/escape/escape.c (revision 53884) @@ -1,6 +1,7 @@ https://github.com/ruby/ruby/blob/trunk/ext/cgi/escape/escape.c#L1 #include "ruby.h" #include "ruby/encoding.h" +RUBY_EXTERN unsigned long ruby_scan_digits(const char *str, ssize_t len, int base, size_t *retlen, int *overflow); RUBY_EXTERN const char ruby_hexdigits[]; #define lower_hexdigits (ruby_hexdigits+0) #define upper_hexdigits (ruby_hexdigits+16) @@ -76,6 +77,113 @@ optimized_escape_html(VALUE str) https://github.com/ruby/ruby/blob/trunk/ext/cgi/escape/escape.c#L77 } } +static VALUE +optimized_unescape_html(VALUE str) +{ + enum {UNICODE_MAX = 0x10ffff}; + rb_encoding *enc = rb_enc_get(str); + unsigned long charlimit = (strcasecmp(rb_enc_name(enc), "UTF-8") == 0 ? UNICODE_MAX : + strcasecmp(rb_enc_name(enc), "ISO-8859-1") == 0 ? 256 : + 128); + long i, len, beg = 0; + size_t clen, plen; + int overflow; + const char *cstr; + char buf[6]; + VALUE dest = 0; + + len = RSTRING_LEN(str); + cstr = RSTRING_PTR(str); + + for (i = 0; i < len; i++) { + unsigned long cc; + char c = cstr[i]; + if (c != '&') continue; + plen = i - beg; + if (++i >= len) break; + c = (unsigned char)cstr[i]; + switch (c) { + case 'a': + ++i; + if (len - i >= 4 && memcmp(&cstr[i], "pos;", 4) == 0) { + c = '\''; + i += 3; + } + else if (len - i >= 3 && memcmp(&cstr[i], "mp;", 3) == 0) { + c = '&'; + i += 2; + } + else continue; + break; + case 'q': + ++i; + if (len - i >= 4 && memcmp(&cstr[i], "uot;", 4) == 0) { + c = '"'; + i += 3; + } + else continue; + break; + case 'g': + ++i; + if (len - i >= 2 && memcmp(&cstr[i], "t;", 2) == 0) { + c = '>'; + i += 1; + } + else continue; + break; + case 'l': + ++i; + if (len - i >= 2 && memcmp(&cstr[i], "t;", 2) == 0) { + c = '<'; + i += 1; + } + else continue; + break; + case '#': + if (len - ++i >= 2 && ISDIGIT(cstr[i])) { + cc = ruby_scan_digits(&cstr[i], len-i, 10, &clen, &overflow); + } + else if ((cstr[i] == 'x' || cstr[i] == 'X') && len - ++i >= 2 && ISXDIGIT(cstr[i])) { + cc = ruby_scan_digits(&cstr[i], len-i, 16, &clen, &overflow); + } + else continue; + i += clen; + if (overflow || cc >= charlimit || cstr[i] != ';') continue; + if (!dest) { + dest = rb_str_buf_new(len); + } + rb_str_cat(dest, cstr + beg, plen); + if (charlimit > 256) { + rb_str_cat(dest, buf, rb_enc_mbcput((OnigCodePoint)cc, buf, enc)); + } + else { + c = (unsigned char)cc; + rb_str_cat(dest, &c, 1); + } + beg = i + 1; + continue; + default: + --i; + continue; + } + if (!dest) { + dest = rb_str_buf_new(len); + } + rb_str_cat(dest, cstr + beg, plen); + rb_str_cat(dest, &c, 1); + beg = i + 1; + } + + if (dest) { + rb_str_cat(dest, cstr + beg, len - beg); + preserve_original_state(str, dest); + return dest; + } + else { + return rb_str_dup(str); + } +} + static int url_unreserved_char(unsigned char c) { @@ -159,6 +267,26 @@ cgiesc_escape_html(VALUE self, VALUE str https://github.com/ruby/ruby/blob/trunk/ext/cgi/escape/escape.c#L267 /* * call-seq: + * CGI.unescapeHTML(string) -> string + * + * Returns HTML-unescaped string. + * + */ +static VALUE +cgiesc_unescape_html(VALUE self, VALUE str) +{ + StringValue(str); + + if (rb_enc_str_asciicompat_p(str)) { + return optimized_unescape_html(str); + } + else { + return rb_call_super(1, &str); + } +} + +/* + * call-seq: * CGI.escape(string) -> string * * Returns URL-escaped string. @@ -184,6 +312,7 @@ Init_escape(void) https://github.com/ruby/ruby/blob/trunk/ext/cgi/escape/escape.c#L312 rb_mEscape = rb_define_module_under(rb_cCGI, "Escape"); rb_mUtil = rb_define_module_under(rb_cCGI, "Util"); rb_define_method(rb_mEscape, "escapeHTML", cgiesc_escape_html, 1); + rb_define_method(rb_mEscape, "unescapeHTML", cgiesc_unescape_html, 1); rb_define_method(rb_mEscape, "escape", cgiesc_escape, 1); rb_prepend_module(rb_mUtil, rb_mEscape); rb_extend_object(rb_cCGI, rb_mEscape); Index: ChangeLog =================================================================== --- ChangeLog (revision 53883) +++ ChangeLog (revision 53884) @@ -1,3 +1,8 @@ https://github.com/ruby/ruby/blob/trunk/ChangeLog#L1 +Sun Feb 21 13:56:57 2016 Nobuyoshi Nakada <nobu@r...> + + * cgi/escape/escape.c: Optimize CGI.unescapeHTML performance by C + ext for ASCII-compatible encodings. [Fix GH-1242] + Sat Feb 20 15:38:16 2016 Eric Wong <e@8...> * doc/extension.rdoc: update paths for defs/ directory -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml/