ruby-changes:71223

https://git.ruby-lang.org/ruby.git/commit/?id=c8817d6a3e

From c8817d6a3ebc9bbc151625bca198b8f327d1d68f Mon Sep 17 00:00:00 2001
From: Shugo Maeda <shugo@r...>
Date: Sat, 19 Feb 2022 19:10:00 +0900
Subject: Add String#byteindex, String#byterindex, and MatchData#byteoffset
 (#5518)

* Add String#byteindex, String#byterindex, and MatchData#byteoffset [Feature #13110]

Co-authored-by: NARUSE, Yui <naruse@a...>
---
 NEWS.md                  |   6 ++
 re.c                     |  33 ++++++
 string.c                 | 260 +++++++++++++++++++++++++++++++++++++++++++++--
 test/ruby/test_regexp.rb |  21 ++++
 test/ruby/test_string.rb | 140 +++++++++++++++++++++++++
 5 files changed, 451 insertions(+), 9 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index ef03a6c18a..60674183fd 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -59,6 +59,9 @@ Note: We're only listing outstanding class updates. https://github.com/ruby/ruby/blob/trunk/NEWS.md#L59
       empty, instead of returning the default value or
       calling the default proc. [[Bug #16908]]
 
+* MatchData
+    * MatchData#byteoffset has been added. [[Feature #13110]]
+
 * Module
     * Module.used_refinements has been added. [[Feature #14332]]
     * Module#refinements has been added. [[Feature #12737]]
@@ -74,6 +77,9 @@ Note: We're only listing outstanding class updates. https://github.com/ruby/ruby/blob/trunk/NEWS.md#L77
     * Set is now available as a builtin class without the need for `require "set"`. [[Feature #16989]]
       It is currently autoloaded via the `Set` constant or a call to `Enumerable#to_set`.
 
+* String
+    * String#byteindex and String#byterindex have been added. [[Feature #13110]]
+
 * Struct
     * A Struct class can also be initialized with keyword arguments
       without `keyword_init: true` on `Struct.new` [[Feature #16806]]
diff --git a/re.c b/re.c
index 66519effcd..9c1adbb0ff 100644
--- a/re.c
+++ b/re.c
@@ -1234,6 +1234,38 @@ match_offset(VALUE match, VALUE n) https://github.com/ruby/ruby/blob/trunk/re.c#L1234
                         LONG2NUM(RMATCH(match)->rmatch->char_offset[i].end));
 }
 
+/*
+ *  call-seq:
+ *     mtch.byteoffset(n)   -> array
+ *
+ *  Returns a two-element array containing the beginning and ending byte-based offsets of
+ *  the <em>n</em>th match.
+ *  <em>n</em> can be a string or symbol to reference a named capture.
+ *
+ *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
+ *     m.byteoffset(0)      #=> [1, 7]
+ *     m.byteoffset(4)      #=> [6, 7]
+ *
+ *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
+ *     p m.byteoffset(:foo) #=> [0, 1]
+ *     p m.byteoffset(:bar) #=> [2, 3]
+ *
+ */
+
+static VALUE
+match_byteoffset(VALUE match, VALUE n)
+{
+    int i = match_backref_number(match, n);
+    struct re_registers *regs = RMATCH_REGS(match);
+
+    match_check(match);
+    backref_number_check(regs, i);
+
+    if (BEG(i) < 0)
+        return rb_assoc_new(Qnil, Qnil);
+    return rb_assoc_new(LONG2NUM(BEG(i)), LONG2NUM(END(i)));
+}
+
 
 /*
  *  call-seq:
@@ -4162,6 +4194,7 @@ Init_Regexp(void) https://github.com/ruby/ruby/blob/trunk/re.c#L4194
     rb_define_method(rb_cMatch, "size", match_size, 0);
     rb_define_method(rb_cMatch, "length", match_size, 0);
     rb_define_method(rb_cMatch, "offset", match_offset, 1);
+    rb_define_method(rb_cMatch, "byteoffset", match_byteoffset, 1);
     rb_define_method(rb_cMatch, "begin", match_begin, 1);
     rb_define_method(rb_cMatch, "end", match_end, 1);
     rb_define_method(rb_cMatch, "match", match_nth, 1);
diff --git a/string.c b/string.c
index 99269f62ba..37dca19aa7 100644
--- a/string.c
+++ b/string.c
@@ -3979,18 +3979,123 @@ rb_str_index_m(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L3979
     return LONG2NUM(pos);
 }
 
+/* Whether the given pos is a valid character boundary or not.
+ * Note that in this function, "character" means a code point
+ * (Unicode scalar value), not a grapheme cluster.
+ */
+static bool
+str_check_byte_pos(VALUE str, long pos)
+{
+    const char *s = RSTRING_PTR(str);
+    const char *e = RSTRING_END(str);
+    const char *p = s + pos;
+    const char *pp = rb_enc_left_char_head(s, p, e, rb_enc_get(str));
+    return p == pp;
+}
+
+/*
+ *  call-seq:
+ *    byteindex(substring, offset = 0) -> integer or nil
+ *    byteindex(regexp, offset = 0) -> integer or nil
+ *
+ *  Returns the \Integer byte-based index of the first occurrence of the given +substring+,
+ *  or +nil+ if none found:
+ *
+ *    'foo'.byteindex('f') # => 0
+ *    'foo'.byteindex('o') # => 1
+ *    'foo'.byteindex('oo') # => 1
+ *    'foo'.byteindex('ooo') # => nil
+ *
+ *  Returns the \Integer byte-based index of the first match for the given \Regexp +regexp+,
+ *  or +nil+ if none found:
+ *
+ *    'foo'.byteindex(/f/) # => 0
+ *    'foo'.byteindex(/o/) # => 1
+ *    'foo'.byteindex(/oo/) # => 1
+ *    'foo'.byteindex(/ooo/) # => nil
+ *
+ *  \Integer argument +offset+, if given, specifies the byte-based position in the
+ *  string to begin the search:
+ *
+ *    'foo'.byteindex('o', 1) # => 1
+ *    'foo'.byteindex('o', 2) # => 2
+ *    'foo'.byteindex('o', 3) # => nil
+ *
+ *  If +offset+ is negative, counts backward from the end of +self+:
+ *
+ *    'foo'.byteindex('o', -1) # => 2
+ *    'foo'.byteindex('o', -2) # => 1
+ *    'foo'.byteindex('o', -3) # => 1
+ *    'foo'.byteindex('o', -4) # => nil
+ *
+ *  If +offset+ does not land on a character (codepoint) boundary, +IndexError+ is
+ *  raised.
+ *
+ *  Related: String#index, String#byterindex.
+ */
+
+static VALUE
+rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
+{
+    VALUE sub;
+    VALUE initpos;
+    long pos;
+
+    if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
+        pos = NUM2LONG(initpos);
+    }
+    else {
+        pos = 0;
+    }
+    if (pos < 0) {
+        pos += RSTRING_LEN(str);
+        if (pos < 0) {
+            if (RB_TYPE_P(sub, T_REGEXP)) {
+                rb_backref_set(Qnil);
+            }
+            return Qnil;
+        }
+    }
+
+    if (!str_check_byte_pos(str, pos)) {
+        rb_raise(rb_eIndexError,
+                 "offset %ld does not land on character boundary", pos);
+    }
+
+    if (RB_TYPE_P(sub, T_REGEXP)) {
+        if (pos > RSTRING_LEN(str))
+            return Qnil;
+        if (rb_reg_search(sub, str, pos, 0) < 0) {
+            return Qnil;
+        }
+        else {
+            VALUE match = rb_backref_get();
+            struct re_registers *regs = RMATCH_REGS(match);
+            pos = BEG(0);
+            return LONG2NUM(pos);
+        }
+    }
+    else {
+        StringValue(sub);
+        pos = rb_strseq_index(str, sub, pos, 1);
+    }
+
+    if (pos == -1) return Qnil;
+    return LONG2NUM(pos);
+}
+
 #ifdef HAVE_MEMRCHR
 static long
-str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
+str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
 {
     char *hit, *adjusted;
     int c;
     long slen, searchlen;
     char *sbeg, *e, *t;
 
-    slen = RSTRING_LEN(sub);
-    if (slen == 0) return pos;
     sbeg = RSTRING_PTR(str);
+    slen = RSTRING_LEN(sub);
+    if (slen == 0) return s - sbeg;
     e = RSTRING_END(str);
     t = RSTRING_PTR(sub);
     c = *t & 0xff;
@@ -4005,7 +4110,7 @@ str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc) https://github.com/ruby/ruby/blob/trunk/string.c#L4110
 	    continue;
 	}
 	if (memcmp(hit, t, slen) == 0)
-	    return rb_str_sublen(str, hit - sbeg);
+	    return hit - sbeg;
 	searchlen = adjusted - sbeg;
     } while (searchlen > 0);
 
@@ -4013,7 +4118,7 @@ str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc) https://github.com/ruby/ruby/blob/trunk/string.c#L4118
 }
 #else
 static long
-str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
+str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
 {
     long slen;
     char *sbeg, *e, *t;
@@ -4025,10 +4130,9 @@ str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc) https://github.com/ruby/ruby/blob/trunk/string.c#L4130
 
     while (s) {
 	if (memcmp(s, t, slen) == 0) {
-	    return pos;
+	    return s - sbeg;
 	}
-	if (pos == 0) break;
-	pos--;
+        if (s <= sbeg) break;
 	s = rb_enc_prev_char(sbeg, s, e, enc);
     }
 
@@ -4065,7 +4169,7 @@ rb_str_rindex(VALUE str, VALUE sub, long pos) https://github.com/ruby/ruby/blob/trunk/string.c#L4169
     }
 
     s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
-    return str_rindex(str, sub, s, pos, enc);
+    return rb_str_sublen(str, str_rindex(str, sub, s, enc));
 }
 
 /*
@@ -4170,6 +4274,142 @@ rb_str_rindex_m(int argc, VALUE *argv, VALUE str) https://github.com/ruby/ruby/blob/trunk/string.c#L4274
     return Qnil;
 }
 
+static long
+rb_str_byterindex(VALUE str, VALUE sub, long pos)
+{
+    long len, slen;
+    char *sbeg, *s;
+    rb_encoding *enc;
+
+    enc = rb_enc_check(str, sub);
+    if (is_broken_string(sub)) return -1;
+    len = RSTRING_LEN(str);
+    slen = RSTRING_LEN(sub);
+
+    /* substring longer than string */
+    if (len < slen) return -1;
+    if (len - pos < slen) pos = len - slen;
+    if (len == 0) return pos;
+
+    sbeg = RSTRING_PTR(str);
+
+    if (pos == 0) {
+        if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
+            return 0;
+        else
+            return -1;
+    }
+
+    s = sbeg + pos;
+    return str_rindex(str, sub, s, enc);
+}
+
+
+/*
+ *  call-seq:
+ *    byterindex(substring, offset = self.bytesize) -> integer or nil
+ *    byterindex(regexp, offset = self.bytesize) -> integer or nil
+ *
+ *  Returns the \Integer byte-based index of the _last_ occurrence of the given +substring+,
+ *  or +nil+ if none found:
+ *
+ *    'foo'.byterindex('f') # => 0
+ *    'foo'.byterindex('o') # => 2
+ *    'foo'.byterindex('oo') # => 1
+ *    'foo'.byterindex('ooo') # => nil
+ *
+ *  Returns the \Integer byte-based index of the _last_ match for the given \Regexp +regexp+,
+ *  or +nil+ if none found:
+ *
+ *    'foo'.byterindex(/f/) # = (... truncated)

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/