ruby-changes:40269

ko1	2015-10-29 18:10:32 +0900 (Thu, 29 Oct 2015)

  New Revision: 52350

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=revision&revision=52350

  Log:
    * encoding.c (rb_enc_check_str): add for performance.
      This function only accepts T_STRING (and T_REGEXP).
    
      This patch improves the performance of a tiny_segmenter benchmark
      (num=2) from 2.54sec to 2.42sec on my machine.
      https://github.com/chezou/TinySegmenter.jl/blob/master/benchmark/benchmark.rb
    
    * encoding.c: add ENC_DEBUG and ENC_ASSERT() macros.
    
    * internal.h: add a decl. of rb_enc_check_str().
    
    * string.c (rb_str_plus): use rb_enc_check_str().
    
    * string.c (rb_str_subpat_set): ditto.

  Modified files:
    trunk/ChangeLog
    trunk/encoding.c
    trunk/internal.h
    trunk/string.c
Index: encoding.c
===================================================================
--- encoding.c	(revision 52349)
+++ encoding.c	(revision 52350)
@@ -15,6 +15,12 @@ https://github.com/ruby/ruby/blob/trunk/encoding.c#L15
 #include <ctype.h>
 #include "ruby/util.h"
 
+#include <assert.h>
+#ifndef ENC_DEBUG
+#define ENC_DEBUG 0
+#endif
+#define ENC_ASSERT(expr) do { if (ENC_DEBUG) {assert(expr);} } while (0)
+
 #undef rb_ascii8bit_encindex
 #undef rb_utf8_encindex
 #undef rb_usascii_encindex
@@ -743,6 +749,19 @@ rb_id_encoding(void) https://github.com/ruby/ruby/blob/trunk/encoding.c#L749
     return id_encoding;
 }
 
+static int
+enc_get_index_str(VALUE str)
+{
+    int i = ENCODING_GET_INLINED(str);
+    if (i == ENCODING_INLINE_MAX) {
+	VALUE iv;
+
+	iv = rb_ivar_get(str, rb_id_encoding());
+	i = NUM2INT(iv);
+    }
+    return i;
+}
+
 int
 rb_enc_get_index(VALUE obj)
 {
@@ -758,13 +777,7 @@ rb_enc_get_index(VALUE obj) https://github.com/ruby/ruby/blob/trunk/encoding.c#L777
       default:
       case T_STRING:
       case T_REGEXP:
-	i = ENCODING_GET_INLINED(obj);
-	if (i == ENCODING_INLINE_MAX) {
-	    VALUE iv;
-
-	    iv = rb_ivar_get(obj, rb_id_encoding());
-	    i = NUM2INT(iv);
-	}
+	i = enc_get_index_str(obj);
 	break;
       case T_FILE:
 	tmp = rb_funcallv(obj, rb_intern("internal_encoding"), 0, 0);
@@ -842,6 +855,21 @@ rb_enc_get(VALUE obj) https://github.com/ruby/ruby/blob/trunk/encoding.c#L855
     return rb_enc_from_index(rb_enc_get_index(obj));
 }
 
+static rb_encoding* enc_compatible_str(VALUE str1, VALUE str2);
+
+rb_encoding*
+rb_enc_check_str(VALUE str1, VALUE str2)
+{
+    rb_encoding *enc = enc_compatible_str(str1, str2);
+    ENC_ASSERT(TYPE(str1) == T_STRING);
+    ENC_ASSERT(TYPE(str2) == T_STRING);
+    if (!enc)
+	rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
+		 rb_enc_name(rb_enc_get(str1)),
+		 rb_enc_name(rb_enc_get(str2)));
+    return enc;
+}
+
 rb_encoding*
 rb_enc_check(VALUE str1, VALUE str2)
 {
@@ -853,40 +881,28 @@ rb_enc_check(VALUE str1, VALUE str2) https://github.com/ruby/ruby/blob/trunk/encoding.c#L881
     return enc;
 }
 
-rb_encoding*
-rb_enc_compatible(VALUE str1, VALUE str2)
+static rb_encoding*
+enc_compatible_latter(VALUE str1, VALUE str2, int idx1, int idx2)
 {
-    int idx1, idx2;
-    rb_encoding *enc1, *enc2;
     int isstr1, isstr2;
-
-    idx1 = rb_enc_get_index(str1);
-    idx2 = rb_enc_get_index(str2);
-
-    if (idx1 < 0 || idx2 < 0)
-        return 0;
-
-    if (idx1 == idx2) {
-	return rb_enc_from_index(idx1);
-    }
-    enc1 = rb_enc_from_index(idx1);
-    enc2 = rb_enc_from_index(idx2);
+    rb_encoding *enc1 = rb_enc_from_index(idx1);
+    rb_encoding *enc2 = rb_enc_from_index(idx2);
 
     isstr2 = RB_TYPE_P(str2, T_STRING);
     if (isstr2 && RSTRING_LEN(str2) == 0)
-	return enc1;
+      return enc1;
     isstr1 = RB_TYPE_P(str1, T_STRING);
     if (isstr1 && RSTRING_LEN(str1) == 0)
-	return (rb_enc_asciicompat(enc1) && rb_enc_str_asciionly_p(str2)) ? enc1 : enc2;
+      return (rb_enc_asciicompat(enc1) && rb_enc_str_asciionly_p(str2)) ? enc1 : enc2;
     if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) {
 	return 0;
     }
 
     /* objects whose encoding is the same of contents */
     if (!isstr2 && idx2 == ENCINDEX_US_ASCII)
-	return enc1;
+      return enc1;
     if (!isstr1 && idx1 == ENCINDEX_US_ASCII)
-	return enc2;
+      return enc2;
 
     if (!isstr1) {
 	VALUE tmp = str1;
@@ -915,11 +931,44 @@ rb_enc_compatible(VALUE str1, VALUE str2 https://github.com/ruby/ruby/blob/trunk/encoding.c#L931
 	    }
 	}
 	if (cr1 == ENC_CODERANGE_7BIT)
-	    return enc2;
+	  return enc2;
     }
     return 0;
 }
 
+static rb_encoding*
+enc_compatible_str(VALUE str1, VALUE str2)
+{
+    int idx1 = enc_get_index_str(str1);
+    int idx2 = enc_get_index_str(str2);
+
+    if (idx1 < 0 || idx2 < 0)
+        return 0;
+
+    if (idx1 == idx2) {
+	return rb_enc_from_index(idx1);
+    }
+    else {
+	return enc_compatible_latter(str1, str2, idx1, idx2);
+    }
+}
+
+rb_encoding*
+rb_enc_compatible(VALUE str1, VALUE str2)
+{
+    int idx1 = rb_enc_get_index(str1);
+    int idx2 = rb_enc_get_index(str2);
+
+    if (idx1 < 0 || idx2 < 0)
+        return 0;
+
+    if (idx1 == idx2) {
+	return rb_enc_from_index(idx1);
+    }
+
+    return enc_compatible_latter(str1, str2, idx1, idx2);
+}
+
 void
 rb_enc_copy(VALUE obj1, VALUE obj2)
 {
Index: ChangeLog
===================================================================
--- ChangeLog	(revision 52349)
+++ ChangeLog	(revision 52350)
@@ -1,3 +1,20 @@ https://github.com/ruby/ruby/blob/trunk/ChangeLog#L1
+Thu Oct 29 18:05:22 2015  Koichi Sasada  <ko1@a...>
+
+	* encoding.c (rb_enc_check_str): add for performance.
+	  This function only accept T_STRING (and T_REGEXP).
+
+	  This patch improves performance of a tiny_segmenter benchmark
+	  (num=2) 2.54sec -> 2.42sec on my machine.
+	  https://github.com/chezou/TinySegmenter.jl/blob/master/benchmark/benchmark.rb
+
+	* encoding.c: add ENC_DEBUG and ENC_ASSERT() macros.
+
+	* internal.h: add a decl. of rb_enc_check_str().
+
+	* string.c (rb_str_plus): use rb_enc_check_str().
+
+	* string.c (rb_str_subpat_set): ditto.
+
 Thu Oct 29 17:16:40 2015  Koichi Sasada  <ko1@a...>
 
 	* internal.h: export rb_wb_(un)protected_newobj_of()
Index: string.c
===================================================================
--- string.c	(revision 52349)
+++ string.c	(revision 52350)
@@ -1598,7 +1598,7 @@ rb_str_plus(VALUE str1, VALUE str2) https://github.com/ruby/ruby/blob/trunk/string.c#L1598
     long len1, len2;
 
     StringValue(str2);
-    enc = rb_enc_check(str1, str2);
+    enc = rb_enc_check_str(str1, str2);
     RSTRING_GETMEM(str1, ptr1, len1);
     RSTRING_GETMEM(str2, ptr2, len2);
     str3 = rb_str_new(0, len1+len2);
@@ -4072,7 +4072,7 @@ rb_str_subpat_set(VALUE str, VALUE re, V https://github.com/ruby/ruby/blob/trunk/string.c#L4072
     end = END(nth);
     len = end - start;
     StringValue(val);
-    enc = rb_enc_check(str, val);
+    enc = rb_enc_check_str(str, val);
     rb_str_splice_0(str, start, len, val);
     rb_enc_associate(str, enc);
 }
Index: internal.h
===================================================================
--- internal.h	(revision 52349)
+++ internal.h	(revision 52350)
@@ -716,6 +716,7 @@ void Init_ext(void); https://github.com/ruby/ruby/blob/trunk/internal.h#L716
 ID rb_id_encoding(void);
 void rb_gc_mark_encodings(void);
 rb_encoding *rb_enc_get_from_index(int index);
+rb_encoding *rb_enc_check_str(VALUE str1, VALUE str2);
 int rb_encdb_replicate(const char *alias, const char *orig);
 int rb_encdb_alias(const char *alias, const char *orig);
 int rb_encdb_dummy(const char *name);

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/