ruby-changes:7279

naruse	2008-08-24 06:40:59 +0900 (Sun, 24 Aug 2008)

  New Revision: 18798

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=rev&revision=18798

  Log:
    * lib/cgi.rb (CGI::unescapeHTML): more encoding-aware unescaping.

  Modified files:
    trunk/ChangeLog
    trunk/lib/cgi.rb

Index: ChangeLog
===================================================================
--- ChangeLog	(revision 18797)
+++ ChangeLog	(revision 18798)
@@ -1,3 +1,7 @@
+Sun Aug 24 06:39:05 2008  NARUSE, Yui  <naruse@r...>
+
+	* lib/cgi.rb (CGI::unescapeHTML): more encoding sensible unescaping.
+
 Sun Aug 24 04:23:19 2008  NARUSE, Yui  <naruse@r...>
 
 	* encoding.c (enc_compatible_p): raise TypeError when argument is Encoding.
Index: lib/cgi.rb
===================================================================
--- lib/cgi.rb	(revision 18797)
+++ lib/cgi.rb	(revision 18798)
@@ -375,6 +375,19 @@
   #      # => "Usage: foo \"bar\" <baz>"
   def CGI::unescapeHTML(string)
     enc = string.encoding
+    if [Encoding::UTF_16BE, Encoding::UTF_16LE, Encoding::UTF_32BE, Encoding::UTF_32LE].include?(enc)
+      return string.gsub(Regexp.new('&(amp|quot|gt|lt|#[0-9]+|#x[0-9A-Fa-f]+);'.encode(enc))) do
+	case $1.encode("US-ASCII")
+	when 'amp'                 then '&'.encode(enc)
+	when 'quot'                then '"'.encode(enc)
+	when 'gt'                  then '>'.encode(enc)
+	when 'lt'                  then '<'.encode(enc)
+	when /\A#0*(\d+)\z/        then $1.to_i.chr(enc)
+	when /\A#x([0-9a-f]+)\z/i  then $1.hex.chr(enc)
+	end
+      end
+    end
+    asciicompat = Encoding.compatible?(string, "a")
     string.gsub(/&(amp|quot|gt|lt|\#[0-9]+|\#x[0-9A-Fa-f]+);/) do
       match = $1.dup
       case match
@@ -382,20 +395,24 @@
       when 'quot'                then '"'
       when 'gt'                  then '>'
       when 'lt'                  then '<'
-      when /\A#0*(\d+)\z/        then
-        if Integer($1) < 256
-          Integer($1).chr.force_encoding(enc)
-        else
-          "&##{$1};"
-        end
-      when /\A#x([0-9a-f]+)\z/i then
-        if $1.hex < 256
-          $1.hex.chr.force_encoding(enc)
-        else
-          "&#x#{$1};"
-        end
+      when /\A#0*(\d+)\z/
+	if enc == Encoding::UTF_8
+	  $1.to_i.chr(enc)
+	elsif $1.to_i < 128 && asciicompat
+	  $1.to_i.chr
+	else
+	  "&##{$1};"
+	end
+      when /\A#x([0-9a-f]+)\z/i
+	if enc == Encoding::UTF_8
+	  $1.hex.chr(enc)
+	elsif $1.hex < 128 && asciicompat
+	  $1.hex.chr
+	else
+	  "&#x#{$1};"
+	end
       else
-        "&#{match};"
+	"&#{match};"
       end
     end
   end

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/