ruby-changes:4984

matz	2008-05-19 17:25:03 +0900 (Mon, 19 May 2008)

  New Revision: 16477

  Modified files:
    trunk/ChangeLog
    trunk/common.mk
    trunk/encoding.c
    trunk/include/ruby/encoding.h
    trunk/re.c
    trunk/regenc.h
    trunk/regexec.c
    trunk/regparse.c
    trunk/string.c

  Log:
    * regparse.c (PINC): use optimized enclen() instead of
      ONIGENC_MBC_ENC_LEN().
    
    * regparse.c (PFETCH): ditto.
    
    * regparse.c (PFETCH): small optimization.
    
    * regexec.c (slow_search): single byte encoding optimization.
    
    * regenc.h (enclen): avoid calling function when encoding's
      min_len == max_len.
    
    * re.c (rb_reg_regsub): rb_enc_ascget() optimization for single
      byte encoding.
    
    * re.c (rb_reg_search): avoid allocating new re_registers if we
      already have MatchData.
    
    * re.c (match_init_copy): avoid unnecessary onig_region_free()
      before onig_region_copy.
    
    * encoding.c (rb_enc_get_index): remove implicit enc_capable check
      each time.
    
    * encoding.c (rb_enc_set_index): ditto.
    
    * encoding.c (enc_compatible_p): small refactoring.
    
    * include/ruby/encoding.h (rb_enc_dummy_p): inline
      rb_enc_dummy_p() and export related code.

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/string.c?r1=16477&r2=16476&diff_format=u
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/regexec.c?r1=16477&r2=16476&diff_format=u
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/ChangeLog?r1=16477&r2=16476&diff_format=u
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/include/ruby/encoding.h?r1=16477&r2=16476&diff_format=u
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/re.c?r1=16477&r2=16476&diff_format=u
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/encoding.c?r1=16477&r2=16476&diff_format=u
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/regparse.c?r1=16477&r2=16476&diff_format=u
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/regenc.h?r1=16477&r2=16476&diff_format=u
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/common.mk?r1=16477&r2=16476&diff_format=u

Index: regparse.c
===================================================================
--- regparse.c	(revision 16476)
+++ regparse.c	(revision 16477)
@@ -253,12 +253,12 @@
 #define PUNFETCH     p = pfetch_prev
 #define PINC       do { \
   pfetch_prev = p; \
-  p += ONIGENC_MBC_ENC_LEN(enc, p, end); \
+  p += enclen(enc, p, end); \
 } while (0)
 #define PFETCH(c)  do { \
-  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
+  c = ((enc->max_enc_len == 1) ? *p : ONIGENC_MBC_TO_CODE(enc, p, end)); \
   pfetch_prev = p; \
-  p += ONIGENC_MBC_ENC_LEN(enc, p, end); \
+  p += enclen(enc, p, end); \
 } while (0)
 
 #define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
Index: encoding.c
===================================================================
--- encoding.c	(revision 16476)
+++ encoding.c	(revision 16477)
@@ -18,7 +18,7 @@
 #endif
 
 static ID id_encoding, id_base_encoding;
-static VALUE rb_cEncoding;
+VALUE rb_cEncoding;
 
 struct rb_encoding_entry {
     const char *name;
@@ -38,14 +38,6 @@
 
 #define enc_autoload_p(enc) (!rb_enc_mbmaxlen(enc))
 
-#define ENC_UNINITIALIZED (&rb_cEncoding)
-#define enc_initialized_p(enc) ((enc)->auxiliary_data != &rb_cEncoding)
-#define ENC_FROM_ENCODING(enc) ((VALUE)(enc)->auxiliary_data)
-
-#define ENC_DUMMY_FLAG FL_USER2
-#define ENC_DUMMY_P(enc) (RBASIC(enc)->flags & ENC_DUMMY_FLAG)
-#define ENC_SET_DUMMY(enc) (RBASIC(enc)->flags |= ENC_DUMMY_FLAG)
-
 static int load_encoding(const char *name);
 static VALUE enc_base_encoding(VALUE self);
 
@@ -318,15 +310,6 @@
     return index;
 }
 
-int
-rb_enc_dummy_p(rb_encoding *enc)
-{
-    VALUE encoding;
-    if (!enc_initialized_p(enc)) return Qfalse;
-    encoding = rb_enc_from_encoding(enc);
-    return ENC_DUMMY_P(encoding);
-}
-
 /*
  * call-seq:
  *   enc.dummy? => true or false
@@ -343,7 +326,7 @@
 static VALUE
 enc_dummy_p(VALUE enc)
 {
-    return rb_enc_dummy_p(rb_to_encoding(enc)) ? Qtrue : Qfalse;
+    return ENC_DUMMY_P(enc) ? Qtrue : Qfalse;
 }
 
 static int
@@ -555,7 +538,7 @@
 }
 
 int
-rb_enc_internal_get_index(VALUE obj)
+rb_enc_get_index(VALUE obj)
 {
     int i;
 
@@ -570,7 +553,7 @@
 }
 
 void
-rb_enc_internal_set_index(VALUE obj, int idx)
+rb_enc_set_index(VALUE obj, int idx)
 {
     if (idx < ENCODING_INLINE_MAX) {
 	ENCODING_SET_INLINED(obj, idx);
@@ -584,14 +567,14 @@
 void
 rb_enc_associate_index(VALUE obj, int idx)
 {
-    enc_check_capable(obj);
-    if (rb_enc_internal_get_index(obj) == idx)
+//    enc_check_capable(obj);
+    if (rb_enc_get_index(obj) == idx)
     	return;
     if (!ENC_CODERANGE_ASCIIONLY(obj) ||
 	!rb_enc_asciicompat(rb_enc_from_index(idx))) {
 	ENC_CODERANGE_CLEAR(obj);
     }
-    rb_enc_internal_set_index(obj, idx);
+    rb_enc_set_index(obj, idx);
 }
 
 void
@@ -600,13 +583,6 @@
     rb_enc_associate_index(obj, rb_enc_to_index(enc));
 }
 
-int
-rb_enc_get_index(VALUE obj)
-{
-    if (!enc_capable(obj)) return -1;
-    return rb_enc_internal_get_index(obj);
-}
-
 rb_encoding*
 rb_enc_get(VALUE obj)
 {
@@ -906,11 +882,13 @@
 static VALUE
 enc_compatible_p(VALUE klass, VALUE str1, VALUE str2)
 {
-    rb_encoding *enc = rb_enc_compatible(str1, str2);
-    VALUE encoding = Qnil;
-    if (!enc || !(encoding = rb_enc_from_encoding(enc)))
-	encoding = Qnil;
-    return encoding;
+    rb_encoding *enc;
+
+    if (!enc_capable(str1)) return Qnil;
+    if (!enc_capable(str2)) return Qnil;
+    enc = rb_enc_compatible(str1, str2);
+    if (!enc) return Qnil;
+    return rb_enc_from_encoding(enc);
 }
 
 /* :nodoc: */
Index: include/ruby/encoding.h
===================================================================
--- include/ruby/encoding.h	(revision 16476)
+++ include/ruby/encoding.h	(revision 16477)
@@ -33,14 +33,14 @@
     if (encoding_set_enc_index < ENCODING_INLINE_MAX) \
         ENCODING_SET_INLINED(rb_encoding_set_obj, encoding_set_enc_index); \
     else \
-        rb_enc_internal_set_index(rb_encoding_set_obj, encoding_set_enc_index); \
+        rb_enc_set_index(rb_encoding_set_obj, encoding_set_enc_index); \
 } while (0)
 
 #define ENCODING_GET_INLINED(obj) ((RBASIC(obj)->flags & ENCODING_MASK)>>ENCODING_SHIFT)
 #define ENCODING_GET(obj) \
     (ENCODING_GET_INLINED(obj) != ENCODING_INLINE_MAX ? \
      ENCODING_GET_INLINED(obj) : \
-     rb_enc_internal_get_index(obj))
+     rb_enc_get_index(obj))
 
 #define ENCODING_IS_ASCII8BIT(obj) (ENCODING_GET_INLINED(obj) == 0)
 
@@ -74,9 +74,9 @@
 
 int rb_enc_replicate(const char *, rb_encoding *);
 int rb_define_dummy_encoding(const char *);
-int rb_enc_dummy_p(rb_encoding *);
 #define rb_enc_to_index(enc) ((enc) ? ((enc)->ruby_encoding_index) : 0)
 int rb_enc_get_index(VALUE obj);
+void rb_enc_set_index(VALUE obj, int encindex);
 int rb_enc_find_index(const char *name);
 int rb_to_encoding_index(VALUE);
 rb_encoding* rb_to_encoding(VALUE);
@@ -86,8 +86,6 @@
 void rb_enc_associate_index(VALUE, int);
 void rb_enc_associate(VALUE, rb_encoding*);
 void rb_enc_copy(VALUE dst, VALUE src);
-int rb_enc_internal_get_index(VALUE obj);
-void rb_enc_internal_set_index(VALUE obj, int encindex);
 
 VALUE rb_enc_str_new(const char*, long, rb_encoding*);
 VALUE rb_enc_reg_new(const char*, long, rb_encoding*, int);
@@ -154,7 +152,7 @@
 #define rb_enc_isspace(c,enc) ONIGENC_IS_CODE_SPACE(enc,c)
 #define rb_enc_isdigit(c,enc) ONIGENC_IS_CODE_DIGIT(enc,c)
 
-#define rb_enc_asciicompat(enc) (!rb_enc_dummy_p(enc) && rb_enc_mbminlen(enc)==1)
+#define rb_enc_asciicompat(enc) (rb_enc_mbminlen(enc)==1 && !rb_enc_dummy_p(enc))
 
 int rb_enc_casefold(char *to, const char *p, const char *e, rb_encoding *enc);
 int rb_enc_toupper(int c, rb_encoding *enc);
@@ -178,4 +176,21 @@
 VALUE rb_locale_charmap(VALUE klass);
 long rb_memsearch(const void*,long,const void*,long,rb_encoding*);
 
+RUBY_EXTERN VALUE rb_cEncoding;
+
+#define ENC_UNINITIALIZED (&rb_cEncoding)
+#define enc_initialized_p(enc) ((enc)->auxiliary_data != &rb_cEncoding)
+#define ENC_FROM_ENCODING(enc) ((VALUE)(enc)->auxiliary_data)
+
+#define ENC_DUMMY_FLAG FL_USER2
+#define ENC_DUMMY_P(enc) (RBASIC(enc)->flags & ENC_DUMMY_FLAG)
+#define ENC_SET_DUMMY(enc) (RBASIC(enc)->flags |= ENC_DUMMY_FLAG)
+
+static inline int
+rb_enc_dummy_p(rb_encoding *enc)
+{
+    if (!enc_initialized_p(enc)) return Qfalse;
+    return ENC_DUMMY_P(ENC_FROM_ENCODING(enc));
+}
+
 #endif /* RUBY_ENCODING_H */
Index: re.c
===================================================================
--- re.c	(revision 16476)
+++ re.c	(revision 16477)
@@ -881,9 +881,6 @@
     RMATCH(obj)->regexp = RMATCH(orig)->regexp;
 
     rm = RMATCH(obj)->rmatch;
-    onig_region_free(&rm->regs, 0);
-    rm->regs.allocated = 0;
-
     onig_region_copy(&rm->regs, RMATCH_REGS(orig));
 
     if (!RMATCH(orig)->rmatch->char_offset_updated) {
@@ -1265,7 +1262,7 @@
 {
     int result;
     VALUE match;
-    struct re_registers regs;
+    struct re_registers *regs, regi;
     char *range = RSTRING_PTR(str);
     regex_t *reg0 = RREGEXP(re)->ptr, *reg;
     int busy = FL_TEST(re, REG_BUSY);
@@ -1277,17 +1274,29 @@
 
     reg = rb_reg_prepare_re(re, str);
 
+    match = rb_backref_get();
+    if (!NIL_P(match)) {
+	if (FL_TEST(match, MATCH_BUSY)) {
+	    match = Qnil;
+	}
+	else {
+	    regs = RMATCH_REGS(match);
+	}
+    }
+    if (NIL_P(match)) {
+	regs = &regi;
+	MEMZERO(regs, struct re_registers, 1);
+    }
     FL_SET(re, REG_BUSY);
     if (!reverse) {
 	range += RSTRING_LEN(str);
     }
-    MEMZERO(&regs, struct re_registers, 1);
     result = onig_search(reg,
 			 (UChar*)(RSTRING_PTR(str)),
 			 ((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)),
 			 ((UChar*)(RSTRING_PTR(str)) + pos),
 			 ((UChar*)range),
-			 &regs, ONIG_OPTION_NONE);
+			 regs, ONIG_OPTION_NONE);
 
     if (RREGEXP(re)->ptr != reg) {
 	if (busy) {
@@ -1300,7 +1309,8 @@
     }
     if (!busy) FL_UNSET(re, REG_BUSY);
     if (result < 0) {
-	onig_region_free(&regs, 0);
+	if (regs == &regi)
+	    onig_region_free(regs, 0);
 	if (result == ONIG_MISMATCH) {
 	    rb_backref_set(Qnil);
 	    return result;
@@ -1312,9 +1322,10 @@
 	}
     }
 
-    match = rb_backref_get();
-    if (NIL_P(match) || FL_TEST(match, MATCH_BUSY)) {
+    if (NIL_P(match)) {
 	match = match_alloc(rb_cMatch);
+	onig_region_copy(RMATCH_REGS(match), regs);
+	onig_region_free(regs, 0);
     }
     else {
 	if (rb_safe_level() >= 3)
@@ -1323,8 +1334,6 @@
 	    FL_UNSET(match, FL_TAINT);
     }
 
-    onig_region_copy(RMATCH_REGS(match), &regs);
-    onig_region_free(&regs, 0);
     RMATCH(match)->str = rb_str_new4(str);
     RMATCH(match)->regexp = re;
     RMATCH(match)->rmatch->char_offset_updated = 0;
@@ -3088,12 +3097,14 @@
     int no, clen;
     rb_encoding *str_enc = rb_enc_get(str);
     rb_encoding *src_enc = rb_enc_get(src);
+    int acompat = rb_enc_asciicompat(str_enc);
+#define ASCGET(s,e,cl) (acompat ? (*cl=1,s[0]) : rb_enc_ascget(s, e, cl, str_enc))
 
     p = s = RSTRING_PTR(str);
     e = s + RSTRING_LEN(str);
 
     while (s < e) {
-        int c = rb_enc_ascget(s, e, &clen, str_enc);
+        int c = ASCGET(s, e, &clen);
 	char *ss;
 
 	if (c == -1) {
@@ -3110,7 +3121,7 @@
 	}
         rb_enc_str_buf_cat(val, p, ss-p, str_enc);
 
-        c = rb_enc_ascget(s, e, &clen, str_enc);
+        c = ASCGET(s, e, &clen);
         if (c == -1) {
             s += mbclen(s, e, str_enc);
 	    rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
@@ -3132,12 +3143,12 @@
 	    break;
 
           case 'k':
-            if (s < e && rb_enc_ascget(s, e, &clen, str_enc) == '<') {
+            if (s < e && ASCGET(s, e, &clen) == '<') {
                 char *name, *name_end;
                
                 name_end = name = s + clen;
                 while (name_end < e) {
-                    c = rb_enc_ascget(name_end, e, &clen, str_enc);
+                    c = ASCGET(name_end, e, &clen);
                     if (c == '>') break;
                     name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen;
                 }
Index: ChangeLog
===================================================================
--- ChangeLog	(revision 16476)
+++ ChangeLog	(revision 16477)
@@ -1,3 +1,36 @@
+Mon May 19 17:23:55 2008  Yukihiro Matsumoto  <matz@r...>
+
+	* regparse.c (PINC): use optimized enclen() instead of
+	  ONIGENC_MBC_ENC_LEN().
+
+	* regparse.c (PFETCH): ditto.
+
+	* regparse.c (PFETCH): small optimization.
+
+	* regexec.c (slow_search): single byte encoding optimization.
+
+	* regenc.h (enclen): avoid calling function when encoding's
+	  min_len == max_len.
+
+	* re.c (rb_reg_regsub): rb_enc_ascget() optimization for single
+	  byte encoding.
+
+	* re.c (rb_reg_search): avoid allocating new re_registers if we
+	  already have MatchData.
+
+	* re.c (match_init_copy): avoid unnecessary onig_region_free()
+	  before onig_region_copy. 
+
+	* encoding.c (rb_enc_get_index): remove implicit enc_capable check
+	  each time.
+
+	* encoding.c (rb_enc_set_index): ditto.
+
+	* encoding.c (enc_compatible_p): small refactoring.
+
+	* include/ruby/encoding.h (rb_enc_dummy_p): inline
+	  rb_enc_dummy_p() and export related code.
+
 Mon May 19 14:32:03 2008  Koichi Sasada  <ko1@a...>
 
 	* version.h: fix strange change by version.h update tool.
Index: regenc.h
===================================================================
--- regenc.h	(revision 16476)
+++ regenc.h	(revision 16477)
@@ -70,7 +70,7 @@
 #define ONIG_CHECK_NULL_RETURN(p)          if (ONIG_IS_NULL(p)) return NULL
 #define ONIG_CHECK_NULL_RETURN_VAL(p,val)  if (ONIG_IS_NULL(p)) return (val)
 
-#define enclen(enc,p,e)      ONIGENC_MBC_ENC_LEN(enc,p,e)
+#define enclen(enc,p,e) ((enc->max_enc_len == enc->min_enc_len) ? enc->min_enc_len : ONIGENC_MBC_ENC_LEN(enc,p,e))
 
 /* character types bit flag */
 #define BIT_CTYPE_NEWLINE  (1<< ONIGENC_CTYPE_NEWLINE)
Index: string.c
===================================================================
--- string.c	(revision 16476)
+++ string.c	(revision 16477)
@@ -256,7 +256,7 @@
 static inline void
 str_enc_copy(VALUE str1, VALUE str2)
 {
-    rb_enc_internal_set_index(str1, ENCODING_GET(str2));
+    rb_enc_set_index(str1, ENCODING_GET(str2));
 }
 
 static void
Index: common.mk
===================================================================
--- common.mk	(revision 16476)
+++ common.mk	(revision 16477)
@@ -615,7 +615,7 @@
   {$(VPATH)}eval_intern.h {$(VPATH)}util.h {$(VPATH)}dln.h
 time.$(OBJEXT): {$(VPATH)}time.c {$(VPATH)}ruby.h {$(VPATH)}config.h \
   {$(VPATH)}defines.h {$(VPATH)}missing.h {$(VPATH)}intern.h \
-  {$(VPATH)}st.h
+  {$(VPATH)}st.h {$(VPATH)}encoding.h 
 util.$(OBJEXT): {$(VPATH)}util.c {$(VPATH)}ruby.h {$(VPATH)}config.h \
   {$(VPATH)}defines.h {$(VPATH)}missing.h {$(VPATH)}intern.h \
   {$(VPATH)}st.h {$(VPATH)}util.h
Index: regexec.c
===================================================================
--- regexec.c	(revision 16476)
+++ regexec.c	(revision 16477)
@@ -2758,16 +2758,25 @@
 
   s = (UChar* )text;
 
+  if (enc->max_enc_len == enc->min_enc_len) {
+    int n = enc->max_enc_len;
+
+    while (s < end) {
+      if (*s == *target) {
+	p = s + 1;
+	t = target + 1;
+	if (memcmp(t, p, target_end - t) == 0)
+	  return s;
+      }
+      s += n;
+    }
+    return (UChar*)NULL;
+  }
   while (s < end) {
     if (*s == *target) {
       p = s + 1;
       t = target + 1;
-      while (t < target_end) {
-	if (*t != *p++)
-	  break;
-	t++;
-      }
-      if (t == target_end)
+      if (memcmp(t, p, target_end - t) == 0)
 	return s;
     }
     s += enclen(enc, s, end);

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/