ruby-changes:71283
From: Burdette <ko1@a...>
Date: Sat, 26 Feb 2022 04:13:18 +0900 (JST)
Subject: [ruby-changes:71283] 26ffda2fd2 (master): [DOC] Enhanced RDoc for some encoding methods (#5598)
https://git.ruby-lang.org/ruby.git/commit/?id=26ffda2fd2 From 26ffda2fd217651e73eb71e6da8f89eb17866f9d Mon Sep 17 00:00:00 2001 From: Burdette Lamar <BurdetteLamar@Y...> Date: Fri, 25 Feb 2022 13:12:59 -0600 Subject: [DOC] Enhanced RDoc for some encoding methods (#5598) In String, treats: #b #scrub #scrub! #unicode_normalize #unicode_normalize! #encode #encode! Also adds a note to IO.new (suggested by @jeremyevans). --- io.c | 7 ++++ string.c | 120 ++++++++++++++++++++++++++++++++++++++++-------------------- transcode.c | 101 ++++++++++++++++++++++---------------------------- 3 files changed, 131 insertions(+), 97 deletions(-) diff --git a/io.c b/io.c index efe37ca835..19becbd181 100644 --- a/io.c +++ b/io.c @@ -8943,6 +8943,13 @@ rb_io_make_open_file(VALUE obj) https://github.com/ruby/ruby/blob/trunk/io.c#L8943 * fd = IO.sysopen(path) # => 3 * IO.new(fd) # => #<IO:fd 3> * + * The new \IO object does not inherit encoding + * (because the integer file descriptor does not have an encoding): + * + * fd = IO.sysopen('t.rus', 'rb') + * io = IO.new(fd) + * io.external_encoding # => #<Encoding:UTF-8> # Not ASCII-8BIT. 
+ * Optional argument +mode+ (defaults to 'r') must specify a valid mode * see IO@Modes: * diff --git a/string.c b/string.c index 986eee945c..0fdde85b17 100644 --- a/string.c +++ b/string.c @@ -6670,7 +6670,6 @@ rb_str_escape(VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L6670 * and with special characters escaped: * * s = "foo\tbar\tbaz\n" - * # => "foo\tbar\tbaz\n" * s.inspect * # => "\"foo\\tbar\\tbaz\\n\"" * @@ -10963,9 +10962,22 @@ rb_str_force_encoding(VALUE str, VALUE enc) https://github.com/ruby/ruby/blob/trunk/string.c#L10962 /* * call-seq: - * str.b -> str + * b -> string + * + * Returns a copy of +self+ that has ASCII-8BIT encoding; + * the contents (bytes) of +self+ are not modified: + * + * s = "\x99" + * s.encoding # => #<Encoding:UTF-8> + * t = s.b # => "\x99" + * t.encoding # => #<Encoding:ASCII-8BIT> + * + * s = "\u4095" + * s.encoding # => #<Encoding:UTF-8> + * s.bytes # => [228, 130, 149] + * t = s.b # => "\xE4\x82\x95" + * t.encoding # => #<Encoding:ASCII-8BIT> * - * Returns a copied string whose encoding is ASCII-8BIT. */ static VALUE @@ -11341,17 +11353,38 @@ enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr) https://github.com/ruby/ruby/blob/trunk/string.c#L11353 /* * call-seq: - * str.scrub -> new_str - * str.scrub(repl) -> new_str - * str.scrub{|bytes|} -> new_str + * scrub(replacement_string = default_replacement) -> string + * scrub{|bytes| ... } -> string + * + * Returns a copy of self with each invalid byte sequence replaced + * by a replacement string. 
+ * + * With no block given and no argument, replaces each invalid sequence + * with the default replacement string + * (<tt>"\uFFFD"</tt> for a Unicode encoding, <tt>'?'</tt> otherwise): + * + * "\uFFFD".bytes # => [239, 191, 189] + * s = "foo\x81\x81bar" + * s.bytes + * # => [102, 111, 111, 129, 129, 98, 97, 114] + * s.scrub.bytes + * # => [102, 111, 111, 239, 191, 189, 239, 191, 189, 98, 97, 114] * - * If the string is invalid byte sequence then replace invalid bytes with given replacement - * character, else returns self. - * If block is given, replace invalid bytes with returned value of the block. + * With no block given and argument +replacement_string+ given, + * replaces each invalid sequence with that string: + * + * "foo\x81\x81bar".scrub('xyzzy') # => "fooxyzzyxyzzybar" + * + * With a block given, replaces each invalid sequence with the value + * of the block: + * + * "foo\x81\x81bar".scrub {|bytes| p bytes; 'XYZZY' } # => "fooXYZZYXYZZYbar" + * + * Output: + * + * "\x81" + * "\x81" * - * "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD" - * "abc\u3042\x81".scrub("*") #=> "abc\u3042*" - * "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>" */ static VALUE str_scrub(int argc, VALUE *argv, VALUE str) @@ -11363,17 +11396,12 @@ str_scrub(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L11396 /* * call-seq: - * str.scrub! -> str - * str.scrub!(repl) -> str - * str.scrub!{|bytes|} -> str + * scrub! -> self + * scrub!(replacement_string = default_replacement) -> self + * scrub!{|bytes|} -> self * - * If the string is invalid byte sequence then replace invalid bytes with given replacement - * character, else returns self. - * If block is given, replace invalid bytes with returned value of the block. + * Like String#scrub, except that any replacements are made in +self+. * - * "abc\u3042\x81".scrub! 
#=> "abc\u3042\uFFFD" - * "abc\u3042\x81".scrub!("*") #=> "abc\u3042*" - * "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>" */ static VALUE str_scrub_bang(int argc, VALUE *argv, VALUE str) @@ -11405,25 +11433,36 @@ unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id) https://github.com/ruby/ruby/blob/trunk/string.c#L11433 /* * call-seq: - * str.unicode_normalize(form=:nfc) + * unicode_normalize(form = :nfc) -> string * - * Unicode Normalization---Returns a normalized form of +str+, - * using Unicode normalizations NFC, NFD, NFKC, or NFKD. - * The normalization form used is determined by +form+, which can - * be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+. - * The default is +:nfc+. + * Returns a copy of +self+ with + * {Unicode normalization}[https://unicode.org/reports/tr15] applied. * - * If the string is not in a Unicode Encoding, then an Exception is raised. - * In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE, - * and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE. - * Anything other than UTF-8 is implemented by converting to UTF-8, - * which makes it slower than UTF-8. + * Argument +form+ must be one of the following symbols + * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]): + * + * - +:nfc+: Canonical decomposition, followed by canonical composition. + * - +:nfd+: Canonical decomposition. + * - +:nfkc+: Compatibility decomposition, followed by canonical composition. + * - +:nfkd+: Compatibility decomposition. 
+ * +self+ must have encoding UTF-8 or one of the other supported encodings: + * + * UnicodeNormalize::UNICODE_ENCODINGS + * # => + * [#<Encoding:UTF-16BE (autoload)>, + * #<Encoding:UTF-16LE>, + * #<Encoding:UTF-32BE (autoload)>, + * #<Encoding:UTF-32LE (autoload)>, + * #<Encoding:GB18030 (autoload)>, + * #<Encoding:UTF-16BE (autoload)>, + * #<Encoding:UTF-32BE (autoload)>] + * + * Examples: + * + * "a\u0300".unicode_normalize # => "\u00E0" + * "\u00E0".unicode_normalize(:nfd) # => "a\u0300" * - * "a\u0300".unicode_normalize #=> "\u00E0" - * "a\u0300".unicode_normalize(:nfc) #=> "\u00E0" - * "\u00E0".unicode_normalize(:nfd) #=> "a\u0300" - * "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd) - * #=> Encoding::CompatibilityError raised */ static VALUE rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str) @@ -11433,10 +11472,11 @@ rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L11472 /* * call-seq: - * str.unicode_normalize!(form=:nfc) + * unicode_normalize!(form = :nfc) -> self + * + * Like String#unicode_normalize, except that the normalization + * is performed on +self+. * - * Destructive version of String#unicode_normalize, doing Unicode - * normalization in place. */ static VALUE rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str) diff --git a/transcode.c b/transcode.c index 9cc4d00f28..400ad13775 100644 --- a/transcode.c +++ b/transcode.c @@ -2801,16 +2801,11 @@ str_encode_associate(VALUE str, int encidx) https://github.com/ruby/ruby/blob/trunk/transcode.c#L2801 /* * call-seq: - * str.encode!(encoding, **options) -> str - * str.encode!(dst_encoding, src_encoding, **options) -> str + * encode!(dst_encoding = Encoding.default_internal, **enc_opts) -> self + * encode!(dst_encoding, src_encoding, **enc_opts) -> self + * + * Like #encode, but applies encoding changes to +self+; returns +self+. 
* - * The first form transcodes the contents of <i>str</i> from - * str.encoding to +encoding+. - * The second form transcodes the contents of <i>str</i> from - * src_encoding to dst_encoding. - * The +options+ keyword arguments give details for conversion. See String#encode - * for details. - * Returns the string even if no changes were made. */ static VALUE @@ -2837,58 +2832,50 @@ static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx); https://github.com/ruby/ruby/blob/trunk/transcode.c#L2832 /* * call-seq: - * str.encode(encoding, **options) -> str - * str.encode(dst_encoding, src_encoding, **options) -> str - * str.encode(**options) -> str + * encode(dst_encoding = Encoding.default_internal, **enc_opts) -> string + * encode(dst_encoding, src_encoding, **enc_opts) -> string + * + * Returns a copy of +self+ transcoded as determined by +dst_encoding+. + * By default, raises an exception if +self+ + * contains an invalid byte or a character not defined in +dst_encoding+; + * that behavior may be modified by encoding options; see below. + * + * With no arguments: + * + * - Uses the same encoding if <tt>Encoding.default_internal</tt> is +nil+ + * (the default): + * + * Encoding.default_internal # => nil + * s = "Ruby\x99".force_encoding('Windows-1252') + * s.encoding # => #<Encoding:Windows-1252> + * s.bytes # => [82, 117, 98, 121, 153] + * t = s.encode (... truncated) -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml/