ruby-changes:71283

https://git.ruby-lang.org/ruby.git/commit/?id=26ffda2fd2

From 26ffda2fd217651e73eb71e6da8f89eb17866f9d Mon Sep 17 00:00:00 2001
From: Burdette Lamar <BurdetteLamar@Y...>
Date: Fri, 25 Feb 2022 13:12:59 -0600
Subject: [DOC] Enhanced RDoc for some encoding methods (#5598)

In String, treats:

    #b
    #scrub
    #scrub!
    #unicode_normalize
    #unicode_normalize!
    #encode
    #encode!

Also adds a note to IO.new (suggested by @jeremyevans).
---
 io.c        |   7 ++++
 string.c    | 120 ++++++++++++++++++++++++++++++++++++++++--------------------
 transcode.c | 101 ++++++++++++++++++++++----------------------------
 3 files changed, 131 insertions(+), 97 deletions(-)

diff --git a/io.c b/io.c
index efe37ca835..19becbd181 100644
--- a/io.c
+++ b/io.c
@@ -8943,6 +8943,13 @@ rb_io_make_open_file(VALUE obj) https://github.com/ruby/ruby/blob/trunk/io.c#L8943
  *    fd = IO.sysopen(path) # => 3
  *    IO.new(fd)            # => #<IO:fd 3>
  *
+ *  The new \IO object does not inherit encoding
+ *  (because the integer file descriptor does not have an encoding):
+ *
+ *    fd = IO.sysopen('t.rus', 'rb')
+ *    io = IO.new(fd)
+ *    io.external_encoding # => #<Encoding:UTF-8> # Not ASCII-8BIT.
+ *
  *  Optional argument +mode+ (defaults to 'r') must specify a valid mode
  *  see IO@Modes:
  *
diff --git a/string.c b/string.c
index 986eee945c..0fdde85b17 100644
--- a/string.c
+++ b/string.c
@@ -6670,7 +6670,6 @@ rb_str_escape(VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L6670
  *  and with special characters escaped:
  *
  *    s = "foo\tbar\tbaz\n"
- *    # => "foo\tbar\tbaz\n"
  *    s.inspect
  *    # => "\"foo\\tbar\\tbaz\\n\""
  *
@@ -10963,9 +10962,22 @@ rb_str_force_encoding(VALUE str, VALUE enc) https://github.com/ruby/ruby/blob/trunk/string.c#L10962
 
 /*
  *  call-seq:
- *     str.b   -> str
+ *    b -> string
+ *
+ *  Returns a copy of +self+ that has ASCII-8BIT encoding;
+ *  the contents (bytes) of +self+ are not modified:
+ *
+ *    s = "\x99"
+ *    s.encoding   # => #<Encoding:UTF-8>
+ *    t = s.b      # => "\x99"
+ *    t.encoding   # => #<Encoding:ASCII-8BIT>
+ *
+ *    s = "\u4095"
+ *    s.encoding   # => #<Encoding:UTF-8>
+ *    s.bytes      # => [228, 130, 149]
+ *    t = s.b      # => "\xE4\x82\x95"
+ *    t.encoding   # => #<Encoding:ASCII-8BIT>
  *
- *  Returns a copied string whose encoding is ASCII-8BIT.
  */
 
 static VALUE
@@ -11341,17 +11353,38 @@ enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr) https://github.com/ruby/ruby/blob/trunk/string.c#L11353
 
 /*
  *  call-seq:
- *    str.scrub -> new_str
- *    str.scrub(repl) -> new_str
- *    str.scrub{|bytes|} -> new_str
+ *    scrub(replacement_string = default_replacement) -> string
+ *    scrub{|bytes| ... } -> string
+ *
+ *  Returns a copy of +self+ with each invalid byte sequence replaced
+ *  by a replacement string.
+ *
+ *  With no block given and no argument, replaces each invalid sequence
+ *  with the default replacement string
+ *  (<tt>"\uFFFD"</tt> for a Unicode encoding, <tt>'?'</tt> otherwise):
+ *
+ *    "\uFFFD".bytes # => [239, 191, 189]
+ *    s = "foo\x81\x81bar"
+ *    s.bytes
+ *    # => [102, 111, 111, 129, 129, 98, 97, 114]
+ *    s.scrub.bytes
+ *    # => [102, 111, 111, 239, 191, 189, 239, 191, 189, 98, 97, 114]
  *
- *  If the string is invalid byte sequence then replace invalid bytes with given replacement
- *  character, else returns self.
- *  If block is given, replace invalid bytes with returned value of the block.
+ *  With no block given and argument +replacement_string+ given,
+ *  replaces each invalid sequence with that string:
+ *
+ *    "foo\x81\x81bar".scrub('xyzzy') # => "fooxyzzyxyzzybar"
+ *
+ *  With a block given, replaces each invalid sequence with the value
+ *  of the block:
+ *
+ *    "foo\x81\x81bar".scrub {|bytes| p bytes; 'XYZZY' } # => "fooXYZZYXYZZYbar"
+ *
+ *  Output:
+ *
+ *    "\x81"
+ *    "\x81"
  *
- *     "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
- *     "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
- *     "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
  */
 static VALUE
 str_scrub(int argc, VALUE *argv, VALUE str)
@@ -11363,17 +11396,12 @@ str_scrub(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L11396
 
 /*
  *  call-seq:
- *    str.scrub! -> str
- *    str.scrub!(repl) -> str
- *    str.scrub!{|bytes|} -> str
+ *    scrub! -> self
+ *    scrub!(replacement_string = default_replacement) -> self
+ *    scrub!{|bytes| ... } -> self
  *
- *  If the string is invalid byte sequence then replace invalid bytes with given replacement
- *  character, else returns self.
- *  If block is given, replace invalid bytes with returned value of the block.
+ *  Like String#scrub, except that any replacements are made in +self+.
  *
- *     "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
- *     "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
- *     "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
  */
 static VALUE
 str_scrub_bang(int argc, VALUE *argv, VALUE str)
@@ -11405,25 +11433,36 @@ unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id) https://github.com/ruby/ruby/blob/trunk/string.c#L11433
 
 /*
  *  call-seq:
- *    str.unicode_normalize(form=:nfc)
+ *    unicode_normalize(form = :nfc) -> string
  *
- *  Unicode Normalization---Returns a normalized form of +str+,
- *  using Unicode normalizations NFC, NFD, NFKC, or NFKD.
- *  The normalization form used is determined by +form+, which can
- *  be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
- *  The default is +:nfc+.
+ *  Returns a copy of +self+ with
+ *  {Unicode normalization}[https://unicode.org/reports/tr15] applied.
  *
- *  If the string is not in a Unicode Encoding, then an Exception is raised.
- *  In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE,
- *  and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE.
- *  Anything other than UTF-8 is implemented by converting to UTF-8,
- *  which makes it slower than UTF-8.
+ *  Argument +form+ must be one of the following symbols
+ *  (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
+ *
+ *  - +:nfc+: Canonical decomposition, followed by canonical composition.
+ *  - +:nfd+: Canonical decomposition.
+ *  - +:nfkc+: Compatibility decomposition, followed by canonical composition.
+ *  - +:nfkd+: Compatibility decomposition.
+ *
+ *  +self+ must have encoding UTF-8 or one of the other supported encodings:
+ *
+ *    UnicodeNormalize::UNICODE_ENCODINGS
+ *    # =>
+ *    [#<Encoding:UTF-16BE (autoload)>,
+ *     #<Encoding:UTF-16LE>,
+ *     #<Encoding:UTF-32BE (autoload)>,
+ *     #<Encoding:UTF-32LE (autoload)>,
+ *     #<Encoding:GB18030 (autoload)>,
+ *     #<Encoding:UTF-16BE (autoload)>,
+ *     #<Encoding:UTF-32BE (autoload)>]
+ *
+ *  Examples:
+ *
+ *    "a\u0300".unicode_normalize      # => "\u00E0"
+ *    "\u00E0".unicode_normalize(:nfd) # => "a\u0300"
  *
- *    "a\u0300".unicode_normalize        #=> "\u00E0"
- *    "a\u0300".unicode_normalize(:nfc)  #=> "\u00E0"
- *    "\u00E0".unicode_normalize(:nfd)   #=> "a\u0300"
- *    "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd)
- *                                       #=> Encoding::CompatibilityError raised
  */
 static VALUE
 rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
@@ -11433,10 +11472,11 @@ rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L11472
 
 /*
  *  call-seq:
- *    str.unicode_normalize!(form=:nfc)
+ *    unicode_normalize!(form = :nfc) -> self
+ *
+ *  Like String#unicode_normalize, except that the normalization
+ *  is performed on +self+.
  *
- *  Destructive version of String#unicode_normalize, doing Unicode
- *  normalization in place.
  */
 static VALUE
 rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
diff --git a/transcode.c b/transcode.c
index 9cc4d00f28..400ad13775 100644
--- a/transcode.c
+++ b/transcode.c
@@ -2801,16 +2801,11 @@ str_encode_associate(VALUE str, int encidx) https://github.com/ruby/ruby/blob/trunk/transcode.c#L2801
 
 /*
  *  call-seq:
- *     str.encode!(encoding, **options)   -> str
- *     str.encode!(dst_encoding, src_encoding, **options)   -> str
+ *    encode!(dst_encoding = Encoding.default_internal, **enc_opts) -> self
+ *    encode!(dst_encoding, src_encoding, **enc_opts)   -> self
+ *
+ *  Like #encode, but applies encoding changes to +self+; returns +self+.
  *
- *  The first form transcodes the contents of <i>str</i> from
- *  str.encoding to +encoding+.
- *  The second form transcodes the contents of <i>str</i> from
- *  src_encoding to dst_encoding.
- *  The +options+ keyword arguments give details for conversion. See String#encode
- *  for details.
- *  Returns the string even if no changes were made.
  */
 
 static VALUE
@@ -2837,58 +2832,50 @@ static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx); https://github.com/ruby/ruby/blob/trunk/transcode.c#L2832
 
 /*
  *  call-seq:
- *     str.encode(encoding, **options)   -> str
- *     str.encode(dst_encoding, src_encoding, **options)   -> str
- *     str.encode(**options)   -> str
+ *    encode(dst_encoding = Encoding.default_internal, **enc_opts) -> string
+ *    encode(dst_encoding, src_encoding, **enc_opts)   -> string
+ *
+ *  Returns a copy of +self+ transcoded as determined by +dst_encoding+.
+ *  By default, raises an exception if +self+
+ *  contains an invalid byte or a character not defined in +dst_encoding+;
+ *  that behavior may be modified by encoding options; see below.
+ *
+ *  With no arguments:
+ *
+ *  - Uses the same encoding if <tt>Encoding.default_internal</tt> is +nil+
+ *    (the default):
+ *
+ *      Encoding.default_internal # => nil
+ *      s = "Ruby\x99".force_encoding('Windows-1252')
+ *      s.encoding                # => #<Encoding:Windows-1252>
+ *      s.bytes                   # => [82, 117, 98, 121, 153]
+ *      t = s.encode   (... truncated)

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/