[前][次][番号順一覧][スレッド一覧]

ruby-changes:71691

From: Jeremy <ko1@a...>
Date: Tue, 12 Apr 2022 00:17:52 +0900 (JST)
Subject: [ruby-changes:71691] ebb4378237 (master): [ruby/net-http] Add HTTP#response_body_encoding for setting response body encoding

https://git.ruby-lang.org/ruby.git/commit/?id=ebb4378237

From ebb4378237e572ce2e888136a613c7c051439f95 Mon Sep 17 00:00:00 2001
From: Jeremy Evans <code@j...>
Date: Mon, 11 Apr 2022 08:17:19 -0700
Subject: [ruby/net-http] Add HTTP#response_body_encoding for setting response
 body encoding

This allows for the ability to opt-in to a method to set the
encoding of response bodies.  By setting the accessor to a String
or Encoding instance, it will use the specified encoding.
Setting the value of true will try to detect the encoding of the
response body, either using the Content-Type header (assuming it
specifies charset) or by scanning for a <meta> tag in the document
that specifies the encoding.  The default is false in which case
no forcing of encoding will be done (same as before the patch).

Implements [Feature #2567]
Implements [Feature #15517]

https://github.com/ruby/net-http/commit/6233e6b7c1

Co-authored-by: Yui Naruse <naruse@r...>
---
 lib/net/http.rb                    |  14 +++
 lib/net/http/response.rb           | 159 +++++++++++++++++++++++++
 test/net/http/test_http.rb         |  54 +++++++++
 test/net/http/test_httpresponse.rb | 235 +++++++++++++++++++++++++++++++++++++
 4 files changed, 462 insertions(+)

diff --git a/lib/net/http.rb b/lib/net/http.rb
index 3fcf23b05c..5e64e38665 100644
--- a/lib/net/http.rb
+++ b/lib/net/http.rb
@@ -698,6 +698,7 @@ module Net   #:nodoc: https://github.com/ruby/ruby/blob/trunk/lib/net/http.rb#L698
       @continue_timeout = nil
       @max_retries = 1
       @debug_output = nil
+      @response_body_encoding = false
 
       @proxy_from_env = false
       @proxy_uri      = nil
@@ -745,6 +746,18 @@ module Net   #:nodoc: https://github.com/ruby/ruby/blob/trunk/lib/net/http.rb#L746
     # The local port used to establish the connection.
     attr_accessor :local_port
 
+    # Encoding applied to response bodies.  An Encoding is used as given;
+    # any other true value asks the response to detect the body encoding;
+    # false or nil leaves the body encoding untouched.
+    attr_reader :response_body_encoding
+
+    # Assigns the response body encoding.  A String is resolved to the
+    # matching Encoding with Encoding.find; other values are stored as given.
+    def response_body_encoding=(value)
+      @response_body_encoding =
+        value.is_a?(String) ? Encoding.find(value) : value
+    end
+
     attr_writer :proxy_from_env
     attr_writer :proxy_address
     attr_writer :proxy_port
@@ -1592,6 +1605,7 @@ module Net   #:nodoc: https://github.com/ruby/ruby/blob/trunk/lib/net/http.rb#L1605
           begin
             res = HTTPResponse.read_new(@socket)
             res.decode_content = req.decode_content
+            res.body_encoding = @response_body_encoding
           end while res.kind_of?(HTTPInformation)
 
           res.uri = req.uri
diff --git a/lib/net/http/response.rb b/lib/net/http/response.rb
index 08eaeb2cac..ecbfd42d2b 100644
--- a/lib/net/http/response.rb
+++ b/lib/net/http/response.rb
@@ -84,6 +84,7 @@ class Net::HTTPResponse https://github.com/ruby/ruby/blob/trunk/lib/net/http/response.rb#L84
     @read = false
     @uri  = nil
     @decode_content = false
+    @body_encoding = false
   end
 
   # The HTTP version supported by the server.
@@ -106,6 +107,18 @@ class Net::HTTPResponse https://github.com/ruby/ruby/blob/trunk/lib/net/http/response.rb#L107
   # Accept-Encoding header from the user.
   attr_accessor :decode_content
 
+  # Encoding applied to the body when it is read.  An Encoding is forced as
+  # given; any other true value triggers detection of a suitable encoding;
+  # false or nil leaves the body as read.
+  attr_reader :body_encoding
+
+  # Assigns the body encoding.  A String is looked up with Encoding.find;
+  # every other value is kept unchanged.
+  def body_encoding=(value)
+    resolved = value.is_a?(String) ? Encoding.find(value) : value
+    @body_encoding = resolved
+  end
+
   def inspect
     "#<#{self.class} #{@code} #{@message} readbody=#{@read}>"
   end
@@ -214,6 +227,17 @@ class Net::HTTPResponse https://github.com/ruby/ruby/blob/trunk/lib/net/http/response.rb#L227
     end
     @read = true
 
+    case enc = @body_encoding
+    when Encoding, false, nil
+      # Encoding: force given encoding
+      # false/nil: do not force encoding
+    else
+      # other value: detect encoding from body
+      enc = detect_encoding(@body)
+    end
+
+    @body.force_encoding(enc) if enc
+
     @body
   end
 
@@ -245,6 +269,141 @@ class Net::HTTPResponse https://github.com/ruby/ruby/blob/trunk/lib/net/http/response.rb#L269
 
   private
 
+  # :nodoc:
+  # Determines the encoding to apply to the response body str.
+  #
+  # Sources are tried in order and the first hit wins: the encoding
+  # argument, the charset parameter of the Content-Type header, a byte
+  # order mark, then a content-type specific probe (the XML declaration for
+  # XML types, sniff_encoding for text/html).
+  #
+  # Returns an Encoding or an encoding name String, or nil when nothing
+  # could be determined.
+  def detect_encoding(str, encoding=nil)
+    encoding ||= type_params['charset'] || check_bom(str)
+    return encoding if encoding
+
+    case content_type&.downcase
+    when %r{text/x(?:ht)?ml|application/(?:[^+]+\+)?xml}
+      # An XML declaration opens with "<?xml", so the "?" must be matched
+      # (and escaped); without it the declared encoding is never found.
+      /\A<\?xml[ \t\r\n]+
+        version[ \t\r\n]*=[ \t\r\n]*(?:"[0-9.]+"|'[0-9.]*')[ \t\r\n]+
+        encoding[ \t\r\n]*=[ \t\r\n]*
+        (?:"([A-Za-z][\-A-Za-z0-9._]*)"|'([A-Za-z][\-A-Za-z0-9._]*)')/x =~ str
+      # No encoding declaration: fall back to UTF-8.
+      $1 || $2 || Encoding::UTF_8
+    when %r{text/html.*}
+      sniff_encoding(str)
+    end
+  end
+
+  # :nodoc:
+  # Guesses the encoding of the HTML document str.  Returns the encoding
+  # declared by a <meta> tag, else US-ASCII or UTF-8 when the bytes are
+  # valid as such, else nil.  The encoding argument is accepted but unused.
+  def sniff_encoding(str, encoding=nil)
+    # the encoding sniffing algorithm
+    # http://www.w3.org/TR/html5/parsing.html#determining-the-character-encoding
+    declared = scanning_meta(str)
+    return declared if declared
+
+    # 6. last visited page or something -- not implemented
+    # 7. frequency -- reduced to checking what the bytes are valid as
+    return Encoding::US_ASCII if str.ascii_only?
+
+    # 8. implementation-defined or user-specified -- nil when not UTF-8
+    as_utf8 = str.dup.force_encoding(Encoding::UTF_8)
+    Encoding::UTF_8 if as_utf8.valid_encoding?
+  end
+
+  # :nodoc:
+  # Returns the Encoding indicated by a byte order mark at the start of str
+  # (UTF-16BE, UTF-16LE or UTF-8), or nil when str has no such mark.
+  def check_bom(str)
+    # Compare binary copies on both sides: String#== is false for non-ASCII
+    # strings whose encodings differ, so a binary body would never equal
+    # the BOM literals, which carry the source file's encoding.
+    case str.byteslice(0, 2).b
+    when "\xFE\xFF".b
+      return Encoding::UTF_16BE
+    when "\xFF\xFE".b
+      return Encoding::UTF_16LE
+    end
+    if "\xEF\xBB\xBF".b == str.byteslice(0, 3).b
+      return Encoding::UTF_8
+    end
+    nil
+  end
+
+  # :nodoc:
+  # Prescans str for <meta> elements that declare a character encoding,
+  # following the HTML encoding sniffing algorithm.  Returns the declared
+  # Encoding (UTF-16 variants are reported as UTF-8, since the tag was
+  # readable byte-wise), or nil when no usable declaration is found.
+  def scanning_meta(str)
+    require 'strscan'
+    ss = StringScanner.new(str)
+    # Examine every <meta> tag, not only the first one: the declaration is
+    # often preceded by other <meta> elements.  The tag name is matched
+    # case-insensitively and must be followed by whitespace or "/".
+    while ss.scan_until(/<meta[\t\n\f\r \/]/i)
+      seen = {} # attribute names already handled for this tag
+      got_pragma = false
+      need_pragma = nil
+      charset = nil
+
+      # step: Attributes
+      while attr = get_attribute(ss)
+        name, value = *attr
+        next if seen[name]
+        seen[name] = true
+        case name
+        when 'http-equiv'
+          got_pragma = true if value == 'content-type'
+        when 'content'
+          # Only a content attribute that actually yields an encoding, and
+          # only while no charset is known yet, makes the pragma mandatory.
+          if charset.nil? && (found = extracting_encodings_from_meta_elements(value))
+            charset = found
+            need_pragma = true
+          end
+        when 'charset'
+          need_pragma = false
+          charset = value
+        end
+      end
+
+      # step: Processing
+      next if need_pragma.nil?
+      next if need_pragma && !got_pragma
+
+      begin
+        enc = Encoding.find(charset)
+      rescue ArgumentError
+        next # unknown or empty encoding name; try the next <meta>
+      end
+      next unless enc
+
+      utf16 = [Encoding::UTF_16, Encoding::UTF_16BE, Encoding::UTF_16LE]
+      enc = Encoding::UTF_8 if utf16.include?(enc)
+      return enc # tentative
+    end
+    nil
+  end
+
+  # Reads the next attribute of the tag that the scanner ss is positioned in.
+  #
+  # Returns [name, value], both downcased (value is '' when the attribute
+  # has none), or nil once the tag's closing ">" or the end of the input is
+  # reached.
+  def get_attribute(ss)
+    ss.skip(/[\t\n\f\r \/]*/)
+    return nil if ss.eos? # unterminated tag: nothing more to read
+    if ss.peek(1) == '>'
+      ss.getch
+      return nil
+    end
+    # A leading "=" belongs to the name; any later "=" ends it.  At this
+    # point the pattern always consumes at least one character.
+    name = ss.scan(/=?[^=\t\n\f\r \/>]*/).downcase
+    ss.skip(/[\t\n\f\r ]*/)
+    # Without "=" the attribute has no value.  Nothing is consumed here, so
+    # the next call starts at the character that follows the name.
+    return [name, ''] unless ss.skip(/=/)
+    ss.skip(/[\t\n\f\r ]*/)
+    # The "*" quantifiers let empty and unterminated values scan as ''
+    # instead of nil.
+    value =
+      case ss.peek(1)
+      when '"'
+        ss.getch
+        quoted = ss.scan(/[^"]*/)
+        ss.getch # closing quote, if present
+        quoted
+      when "'"
+        ss.getch
+        quoted = ss.scan(/[^']*/)
+        ss.getch # closing quote, if present
+        quoted
+      when '>'
+        ''
+      else
+        ss.scan(/[^\t\n\f\r >]*/)
+      end
+    [name, value.downcase]
+  end
+
+  # Extracts the charset named in the value of a <meta> content attribute,
+  # e.g. "text/html; charset=utf-8".  Returns the name as a String, or nil
+  # when there is no charset or its quoting is unbalanced.
+  def extracting_encodings_from_meta_elements(value)
+    # http://dev.w3.org/html5/spec/fetching-resources.html#algorithm-for-extracting-an-encoding-from-a-meta-element
+    # Whitespace is allowed on both sides of "=".  A lone quote or the end of
+    # the string right after "=" matches without a capture, giving nil.
+    if /charset[\t\n\f\r ]*=[\t\n\f\r ]*(?:"([^"]*)"|'([^']*)'|["']|\z|([^\t\n\f\r ;]+))/i =~ value
+      return $1 || $2 || $3
+    end
+    return nil
+  end
+
   ##
   # Checks for a supported Content-Encoding header and yields an Inflate
   # wrapper for this response's socket when zlib is present.  If the
diff --git a/test/net/http/test_http.rb b/test/net/http/test_http.rb
index b5156078a4..4725a79147 100644
--- a/test/net/http/test_http.rb
+++ b/test/net/http/test_http.rb
@@ -1294,3 +1294,57 @@ class TestNetHTTPLocalBind < Test::Unit::TestCase https://github.com/ruby/ruby/blob/trunk/test/net/http/test_http.rb#L1294
   end
 end
 
+class TestNetHTTPForceEncoding < Test::Unit::TestCase
+  CONFIG = {
+    'host' => 'localhost',
+    'proxy_host' => nil,
+    'proxy_port' => nil,
+  }
+
+  include TestNetHTTPUtils
+
+  # Serves "hello\u1234" at /fe, with the given Content-Type when one is
+  # passed, and returns the response fetched by a client whose
+  # response_body_encoding is set to force_enc.
+  def fe_request(force_enc, content_type=nil)
+    @server.mount_proc('/fe') do |_req, res|
+      res['Content-Type'] = content_type if content_type
+      res.body = "hello\u1234"
+    end
+
+    client = Net::HTTP.new(config('host'), config('port'))
+    bind_addr = Addrinfo.tcp(config('host'), config('port'))
+    client.local_host = bind_addr.ip_address
+    assert_not_nil(client.local_host)
+    assert_nil(client.local_port)
+
+    client.response_body_encoding = force_enc
+    client.get('/fe')
+  end
+
+  # With the option set to false the body comes back as raw binary.
+  def test_response_body_encoding_false
+    body = fe_request(false).body
+    assert_equal("hello\u1234".b, body)
+    assert_equal(Encoding::ASCII_8BIT, body.encoding)
+  end
+
+  # Detection enabled but no Content-Type sent: the body stays binary.
+  def test_response_body_encoding_true_without_content_type
+    body = fe_request(true).body
+    assert_equal("hello\u1234".b, body)
+    assert_equal(Encoding::ASCII_8BIT, body.encoding)
+  end
+
+  # Detection enabled and a charset in Content-Type: the body is UTF-8.
+  def test_response_body_encoding_true_with_content_type
+    body = fe_request(true, 'text/html; charset=utf-8').body
+    assert_equal("hello\u1234", body)
+    assert_equal(Encoding::UTF_8, body.encoding)
+  end
+
+  # An encoding name given as a String is applied without any Content-Type.
+  def test_response_body_encoding_string_without_content_type
+    body = fe_request('utf-8').body
+    assert_equal("hello\u1234", body)
+    assert_equal(Encoding::UTF_8, body.encoding)
+  end
+
+  def test_response_ (... truncated)

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/

[前][次][番号順一覧][スレッド一覧]