[前][次][番号順一覧][スレッド一覧]

ruby-changes:4023

From: ko1@a...
Date: Sun, 17 Feb 2008 05:08:54 +0900 (JST)
Subject: [ruby-changes:4023] akr - Ruby:r15513 (trunk): * include/ruby/re.h (struct rmatch_offset): new struct for character

akr	2008-02-17 05:08:35 +0900 (Sun, 17 Feb 2008)

  New Revision: 15513

  Modified files:
    trunk/ChangeLog
    trunk/gc.c
    trunk/include/ruby/re.h
    trunk/re.c
    trunk/string.c

  Log:
    * include/ruby/re.h (struct rmatch_offset): new struct for character
      offsets.
      (struct rmatch): new struct.
      (struct RMatch): reference struct rmatch.
      (RMATCH_REGS): new macro.
    
    * re.c (match_alloc): initialize struct rmatch.
      (pair_byte_cmp): new function.
      (update_char_offset): update character offsets.
      (match_init_copy): copy regexp and character offsets.
      (match_sublen): removed.
      (match_offset): use update_char_offset.
      (match_begin): ditto.
      (match_end): ditto.
      (rb_reg_search): make character offset updated flag false.
      (match_size): use RMATCH_REGS.
      (match_backref_number): ditto.
      (rb_reg_nth_defined): ditto.
      (rb_reg_nth_match): ditto.
      (rb_reg_match_pre): ditto.
      (rb_reg_match_post): ditto.
      (rb_reg_match_last): ditto.
      (match_array): ditto.
      (match_aref): ditto.
      (match_values_at): ditto.
      (match_inspect): ditto.
    
    * string.c (rb_str_subpat_set): use RMATCH_REGS.
      (rb_str_sub_bang): ditto.
      (str_gsub): ditto.
      (rb_str_split_m): ditto.
      (scan_once): ditto.
    
    * gc.c (obj_free): free character offsets.


  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/string.c?r1=15513&r2=15512&diff_format=u
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/ChangeLog?r1=15513&r2=15512&diff_format=u
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/include/ruby/re.h?r1=15513&r2=15512&diff_format=u
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/gc.c?r1=15513&r2=15512&diff_format=u
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/re.c?r1=15513&r2=15512&diff_format=u

Index: include/ruby/re.h
===================================================================
--- include/ruby/re.h	(revision 15512)
+++ include/ruby/re.h	(revision 15513)
@@ -26,14 +26,28 @@
 
 typedef struct re_pattern_buffer Regexp;
 
+struct rmatch_offset {
+    int beg;
+    int end;
+};
+
+struct rmatch {
+    struct re_registers regs;
+
+    int char_offset_updated;
+    int char_offset_num_allocated;
+    struct rmatch_offset *char_offset;
+};
+
 struct RMatch {
     struct RBasic basic;
     VALUE str;
-    struct re_registers *regs;
+    struct rmatch *rmatch;
     VALUE regexp;  /* RRegexp */
 };
 
 #define RMATCH(obj)  (R_CAST(RMatch)(obj))
+#define RMATCH_REGS(obj)  (&(R_CAST(RMatch)(obj))->rmatch->regs)
 
 VALUE rb_reg_regcomp(VALUE);
 int rb_reg_search(VALUE, VALUE, int, int);
Index: re.c
===================================================================
--- re.c	(revision 15512)
+++ re.c	(revision 15513)
@@ -670,28 +670,130 @@
     OBJSETUP(match, klass, T_MATCH);
 
     match->str = 0;
-    match->regs = 0;
+    match->rmatch = 0;
     match->regexp = 0;
-    match->regs = ALLOC(struct re_registers);
-    MEMZERO(match->regs, struct re_registers, 1);
+    match->rmatch = ALLOC(struct rmatch);
+    MEMZERO(match->rmatch, struct rmatch, 1);
 
     return (VALUE)match;
 }
 
+typedef struct {
+    int byte_pos;
+    int char_pos;
+} pair_t;
+
+static int
+pair_byte_cmp(const void *pair1, const void *pair2)
+{
+    return ((pair_t*)pair1)->byte_pos - ((pair_t*)pair2)->byte_pos;
+}
+
+static void
+update_char_offset(VALUE match)
+{
+    struct rmatch *rm = RMATCH(match)->rmatch;
+    struct re_registers *regs;
+    int num_regs;
+    int i, num_pos, c;
+    char *s, *p, *q, *e;
+    rb_encoding *enc;
+    pair_t *pairs;
+
+    if (rm->char_offset_updated)
+        return;
+
+    regs = &rm->regs;
+    num_regs = rm->regs.num_regs;
+
+    if (rm->char_offset_num_allocated < num_regs) {
+        REALLOC_N(rm->char_offset, struct rmatch_offset, num_regs);
+        rm->char_offset_num_allocated = num_regs;
+    }
+
+    enc = rb_enc_get(RMATCH(match)->str);
+    if (rb_enc_mbmaxlen(enc) == 1) {
+        for (i = 0; i < num_regs; i++) {
+            rm->char_offset[i].beg = BEG(i);
+            rm->char_offset[i].end = END(i);
+        }
+        rm->char_offset_updated = 1;
+        return;
+    }
+
+    pairs = ALLOCA_N(pair_t, num_regs*2);
+    num_pos = 0;
+    for (i = 0; i < num_regs; i++) {
+        if (BEG(i) < 0)
+            continue;
+        pairs[num_pos++].byte_pos = BEG(i);
+        pairs[num_pos++].byte_pos = END(i);
+    }
+    qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
+
+    s = p = RSTRING_PTR(RMATCH(match)->str);
+    e = s + RSTRING_LEN(RMATCH(match)->str);
+    c = 0;
+    for (i = 0; i < num_pos; i++) {
+        q = s + pairs[i].byte_pos;
+        c += rb_enc_strlen(p, q, enc);
+        pairs[i].char_pos = c;
+        p = q;
+    }
+
+    for (i = 0; i < num_regs; i++) {
+        pair_t key, *found;
+        if (BEG(i) < 0) {
+            rm->char_offset[i].beg = -1;
+            rm->char_offset[i].end = -1;
+            continue;
+        }
+
+        key.byte_pos = BEG(i);
+        found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
+        rm->char_offset[i].beg = found->char_pos;
+
+        key.byte_pos = END(i);
+        found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
+        rm->char_offset[i].end = found->char_pos;
+    }
+
+    rm->char_offset_updated = 1;
+}
+
 /* :nodoc: */
 static VALUE
 match_init_copy(VALUE obj, VALUE orig)
 {
+    struct rmatch *rm;
+
     if (obj == orig) return obj;
 
     if (!rb_obj_is_instance_of(orig, rb_obj_class(obj))) {
 	rb_raise(rb_eTypeError, "wrong argument class");
     }
     RMATCH(obj)->str = RMATCH(orig)->str;
-    onig_region_free(RMATCH(obj)->regs, 0);
-    RMATCH(obj)->regs->allocated = 0;
-    onig_region_copy(RMATCH(obj)->regs, RMATCH(orig)->regs);
+    RMATCH(obj)->regexp = RMATCH(orig)->regexp;
 
+    rm = RMATCH(obj)->rmatch;
+    onig_region_free(&rm->regs, 0);
+    rm->regs.allocated = 0;
+
+    onig_region_copy(&rm->regs, RMATCH_REGS(orig));
+
+    if (!RMATCH(orig)->rmatch->char_offset_updated) {
+        rm->char_offset_updated = 0;
+    }
+    else {
+        if (rm->char_offset_num_allocated < rm->regs.num_regs) {
+            REALLOC_N(rm->char_offset, struct rmatch_offset, rm->regs.num_regs);
+            rm->char_offset_num_allocated = rm->regs.num_regs;
+        }
+        MEMCPY(rm->char_offset, RMATCH(orig)->rmatch->char_offset,
+               struct rmatch_offset, rm->regs.num_regs);
+        rm->char_offset_updated = 1;
+    }
+
     return obj;
 }
 
@@ -747,25 +849,16 @@
 static VALUE
 match_size(VALUE match)
 {
-    return INT2FIX(RMATCH(match)->regs->num_regs);
+    return INT2FIX(RMATCH_REGS(match)->num_regs);
 }
 
-static VALUE
-match_sublen(VALUE str, int offset)
-{
-    int i;
-
-    i = rb_str_sublen(str, offset);
-    return INT2FIX(i);
-}
-
 static int
 match_backref_number(VALUE match, VALUE backref)
 {
     const char *name;
     int num;
 
-    struct re_registers *regs = RMATCH(match)->regs;
+    struct re_registers *regs = RMATCH_REGS(match);
     VALUE regexp = RMATCH(match)->regexp;
 
     switch(TYPE(backref)) {
@@ -816,15 +909,17 @@
 match_offset(VALUE match, VALUE n)
 {
     int i = match_backref_number(match, n);
+    struct re_registers *regs = RMATCH_REGS(match);
 
-    if (i < 0 || RMATCH(match)->regs->num_regs <= i)
+    if (i < 0 || regs->num_regs <= i)
 	rb_raise(rb_eIndexError, "index %d out of matches", i);
 
-    if (RMATCH(match)->regs->beg[i] < 0)
+    if (BEG(i) < 0)
 	return rb_assoc_new(Qnil, Qnil);
 
-    return rb_assoc_new(match_sublen(RMATCH(match)->str, RMATCH(match)->regs->beg[i]),
-			match_sublen(RMATCH(match)->str, RMATCH(match)->regs->end[i]));
+    update_char_offset(match);
+    return rb_assoc_new(INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg),
+			INT2FIX(RMATCH(match)->rmatch->char_offset[i].end));
 }
 
 
@@ -849,14 +944,16 @@
 match_begin(VALUE match, VALUE n)
 {
     int i = match_backref_number(match, n);
+    struct re_registers *regs = RMATCH_REGS(match);
 
-    if (i < 0 || RMATCH(match)->regs->num_regs <= i)
+    if (i < 0 || regs->num_regs <= i)
 	rb_raise(rb_eIndexError, "index %d out of matches", i);
 
-    if (RMATCH(match)->regs->beg[i] < 0)
+    if (BEG(i) < 0)
 	return Qnil;
 
-    return match_sublen(RMATCH(match)->str, RMATCH(match)->regs->beg[i]);
+    update_char_offset(match);
+    return INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg);
 }
 
 
@@ -881,14 +978,16 @@
 match_end(VALUE match, VALUE n)
 {
     int i = match_backref_number(match, n);
+    struct re_registers *regs = RMATCH_REGS(match);
 
-    if (i < 0 || RMATCH(match)->regs->num_regs <= i)
+    if (i < 0 || regs->num_regs <= i)
 	rb_raise(rb_eIndexError, "index %d out of matches", i);
 
-    if (RMATCH(match)->regs->beg[i] < 0)
+    if (BEG(i) < 0)
 	return Qnil;
 
-    return match_sublen(RMATCH(match)->str, RMATCH(match)->regs->end[i]);
+    update_char_offset(match);
+    return INT2FIX(RMATCH(match)->rmatch->char_offset[i].end);
 }
 
 #define MATCH_BUSY FL_USER2
@@ -1094,9 +1193,10 @@
 	    FL_UNSET(match, FL_TAINT);
     }
 
-    onig_region_copy(RMATCH(match)->regs, &regs);
+    onig_region_copy(RMATCH_REGS(match), &regs);
     RMATCH(match)->str = rb_str_new4(str);
     RMATCH(match)->regexp = re;
+    RMATCH(match)->rmatch->char_offset_updated = 0;
     rb_backref_set(match);
 
     OBJ_INFECT(match, re);
@@ -1108,15 +1208,17 @@
 VALUE
 rb_reg_nth_defined(int nth, VALUE match)
 {
+    struct re_registers *regs;
     if (NIL_P(match)) return Qnil;
-    if (nth >= RMATCH(match)->regs->num_regs) {
+    regs = RMATCH_REGS(match);
+    if (nth >= regs->num_regs) {
 	return Qnil;
     }
     if (nth < 0) {
-	nth += RMATCH(match)->regs->num_regs;
+	nth += regs->num_regs;
 	if (nth <= 0) return Qnil;
     }
-    if (RMATCH(match)->BEG(nth) == -1) return Qfalse;
+    if (BEG(nth) == -1) return Qfalse;
     return Qtrue;
 }
 
@@ -1125,18 +1227,20 @@
 {
     VALUE str;
     long start, end, len;
+    struct re_registers *regs;
 
     if (NIL_P(match)) return Qnil;
-    if (nth >= RMATCH(match)->regs->num_regs) {
+    regs = RMATCH_REGS(match);
+    if (nth >= regs->num_regs) {
 	return Qnil;
     }
     if (nth < 0) {
-	nth += RMATCH(match)->regs->num_regs;
+	nth += regs->num_regs;
 	if (nth <= 0) return Qnil;
     }
-    start = RMATCH(match)->BEG(nth);
+    start = BEG(nth);
     if (start == -1) return Qnil;
-    end = RMATCH(match)->END(nth);
+    end = END(nth);
     len = end - start;
     str = rb_str_subseq(RMATCH(match)->str, start, len);
     OBJ_INFECT(str, match);
@@ -1165,10 +1269,12 @@
 rb_reg_match_pre(VALUE match)
 {
     VALUE str;
+    struct re_registers *regs;
 
     if (NIL_P(match)) return Qnil;
-    if (RMATCH(match)->BEG(0) == -1) return Qnil;
-    str = rb_str_subseq(RMATCH(match)->str, 0, RMATCH(match)->BEG(0));
+    regs = RMATCH_REGS(match);
+    if (BEG(0) == -1) return Qnil;
+    str = rb_str_subseq(RMATCH(match)->str, 0, BEG(0));
     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
     return str;
 }
@@ -1190,11 +1296,13 @@
 {
     VALUE str;
     long pos;
+    struct re_registers *regs;
 
     if (NIL_P(match)) return Qnil;
-    if (RMATCH(match)->BEG(0) == -1) return Qnil;
+    regs = RMATCH_REGS(match);
+    if (BEG(0) == -1) return Qnil;
     str = RMATCH(match)->str;
-    pos = RMATCH(match)->END(0);
+    pos = END(0);
     str = rb_str_subseq(str, pos, RSTRING_LEN(str) - pos);
     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
     return str;
@@ -1204,11 +1312,13 @@
 rb_reg_match_last(VALUE match)
 {
     int i;
+    struct re_registers *regs;
 
     if (NIL_P(match)) return Qnil;
-    if (RMATCH(match)->BEG(0) == -1) return Qnil;
+    regs = RMATCH_REGS(match);
+    if (BEG(0) == -1) return Qnil;
 
-    for (i=RMATCH(match)->regs->num_regs-1; RMATCH(match)->BEG(i) == -1 && i > 0; i--)
+    for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--)
 	;
     if (i == 0) return Qnil;
     return rb_reg_nth_match(i, match);
@@ -1241,7 +1351,7 @@
 static VALUE
 match_array(VALUE match, int start)
 {
-    struct re_registers *regs = RMATCH(match)->regs;
+    struct re_registers *regs = RMATCH_REGS(match);
     VALUE ary = rb_ary_new2(regs->num_regs);
     VALUE target = RMATCH(match)->str;
     int i;
@@ -1381,7 +1491,7 @@
             p = StringValuePtr(idx);
 
           name_to_backref:
-            num = name_to_backref_number(RMATCH(match)->regs,
+            num = name_to_backref_number(RMATCH_REGS(match),
                        RMATCH(match)->regexp, p, p + strlen(p));
             return rb_reg_nth_match(num, match);
             break;
@@ -1419,7 +1529,8 @@
 static VALUE
 match_values_at(int argc, VALUE *argv, VALUE match)
 {
-    return rb_get_values_at(match, RMATCH(match)->regs->num_regs, argc, argv, match_entry);
+    struct re_registers *regs = RMATCH_REGS(match);
+    return rb_get_values_at(match, regs->num_regs, argc, argv, match_entry);
 }
 
 
@@ -1506,7 +1617,8 @@
     char *cname = rb_obj_classname(match);
     VALUE str;
     int i;
-    int num_regs = RMATCH(match)->regs->num_regs;
+    struct re_registers *regs = RMATCH_REGS(match);
+    int num_regs = regs->num_regs;
     struct backref_name_tag *names;
     VALUE regexp = RMATCH(match)->regexp;
 
Index: ChangeLog
===================================================================
--- ChangeLog	(revision 15512)
+++ ChangeLog	(revision 15513)
@@ -1,3 +1,40 @@
+Sun Feb 17 03:37:01 2008  Tanaka Akira  <akr@f...>
+
+	* include/ruby/re.h (struct rmatch_offset): new struct for character
+	  offsets.
+	  (struct rmatch): new struct.
+	  (struct RMatch): reference struct rmatch.
+	  (RMATCH_REGS): new macro.
+
+	* re.c (match_alloc): initialize struct rmatch.
+	  (pair_byte_cmp): new function.
+	  (update_char_offset): update character offsets.
+	  (match_init_copy): copy regexp and character offsets.
+	  (match_sublen): removed.
+	  (match_offset): use update_char_offset.
+	  (match_begin): ditto.
+	  (match_end): ditto.
+	  (rb_reg_search): make character offset updated flag false.
+	  (match_size): use RMATCH_REGS.
+	  (match_backref_number): ditto.
+	  (rb_reg_nth_defined): ditto.
+	  (rb_reg_nth_match): ditto.
+	  (rb_reg_match_pre): ditto.
+	  (rb_reg_match_post): ditto.
+	  (rb_reg_match_last): ditto.
+	  (match_array): ditto.
+	  (match_aref): ditto.
+	  (match_values_at): ditto.
+	  (match_inspect): ditto.
+
+	* string.c (rb_str_subpat_set): use RMATCH_REGS.
+	  (rb_str_sub_bang): ditto.
+	  (str_gsub): ditto.
+	  (rb_str_split_m): ditto.
+	  (scan_once): ditto.
+
+	* gc.c (obj_free): free character offsets.
+
 Sun Feb 17 03:13:40 2008  NAKAMURA Usaku  <usa@r...>
 
 	* win32/resource.rb: made version infos confirm to OS spec.
Index: string.c
===================================================================
--- string.c	(revision 15512)
+++ string.c	(revision 15513)
@@ -2716,27 +2716,29 @@
     VALUE match;
     long start, end, len;
     rb_encoding *enc;
+    struct re_registers *regs;
 
     if (rb_reg_search(re, str, 0, 0) < 0) {
 	rb_raise(rb_eIndexError, "regexp not matched");
     }
     match = rb_backref_get();
-    if (nth >= RMATCH(match)->regs->num_regs) {
+    regs = RMATCH_REGS(match);
+    if (nth >= regs->num_regs) {
       out_of_range:
 	rb_raise(rb_eIndexError, "index %d out of regexp", nth);
     }
     if (nth < 0) {
-	if (-nth >= RMATCH(match)->regs->num_regs) {
+	if (-nth >= regs->num_regs) {
 	    goto out_of_range;
 	}
-	nth += RMATCH(match)->regs->num_regs;
+	nth += regs->num_regs;
     }
 
-    start = RMATCH(match)->BEG(nth);
+    start = BEG(nth);
     if (start == -1) {
 	rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
     }
-    end = RMATCH(match)->END(nth);
+    end = END(nth);
     len = end - start;
     StringValue(val);
     enc = rb_enc_check(str, val);
@@ -2967,7 +2969,7 @@
 	int cr = ENC_CODERANGE(str);
 
 	match = rb_backref_get();
-	regs = RMATCH(match)->regs;
+	regs = RMATCH_REGS(match);
 
 	if (iter || !NIL_P(hash)) {
 	    char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
@@ -3114,7 +3116,7 @@
     do {
 	n++;
 	match = rb_backref_get();
-	regs = RMATCH(match)->regs;
+	regs = RMATCH_REGS(match);
 	if (iter || !NIL_P(hash)) {
             if (iter) {
                 rb_match_busy(match);
@@ -4751,7 +4753,7 @@
 	struct re_registers *regs;
 
 	while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
-	    regs = RMATCH(rb_backref_get())->regs;
+	    regs = RMATCH_REGS(rb_backref_get());
 	    if (start == end && BEG(0) == END(0)) {
 		if (!RSTRING_PTR(str)) {
 		    rb_ary_push(result, rb_str_new("", 0));
@@ -5397,7 +5399,7 @@
     enc = STR_ENC_GET(str);
     if (rb_reg_search(pat, str, *start, 0) >= 0) {
 	match = rb_backref_get();
-	regs = RMATCH(match)->regs;
+	regs = RMATCH_REGS(match);
 	if (BEG(0) == END(0)) {
 	    /*
 	     * Always consume at least one character of the input string
Index: gc.c
===================================================================
--- gc.c	(revision 15512)
+++ gc.c	(revision 15513)
@@ -1322,9 +1322,12 @@
 	}
 	break;
       case T_MATCH:
-	if (RANY(obj)->as.match.regs) {
-	    onig_region_free(RANY(obj)->as.match.regs, 0);
-	    RUBY_CRITICAL(free(RANY(obj)->as.match.regs));
+	if (RANY(obj)->as.match.rmatch) {
+            struct rmatch *rm = RANY(obj)->as.match.rmatch;
+	    onig_region_free(&rm->regs, 0);
+            if (rm->char_offset)
+                RUBY_CRITICAL(free(rm->char_offset));
+	    RUBY_CRITICAL(free(rm));
 	}
 	break;
       case T_FILE:

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/

[前][次][番号順一覧][スレッド一覧]