ruby-changes:40269
From: ko1 <ko1@a...>
Date: Thu, 29 Oct 2015 18:10:45 +0900 (JST)
Subject: [ruby-changes:40269] ko1:r52350 (trunk): * encoding.c (rb_enc_check_str): add for performance.
ko1 2015-10-29 18:10:32 +0900 (Thu, 29 Oct 2015) New Revision: 52350 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=revision&revision=52350 Log: * encoding.c (rb_enc_check_str): add for performance. This function only accepts T_STRING (and T_REGEXP). This patch improves performance of a tiny_segmenter benchmark (num=2) 2.54sec -> 2.42sec on my machine. https://github.com/chezou/TinySegmenter.jl/blob/master/benchmark/benchmark.rb * encoding.c: add ENC_DEBUG and ENC_ASSERT() macros. * internal.h: add a decl. of rb_enc_check_str(). * string.c (rb_str_plus): use rb_enc_check_str(). * string.c (rb_str_subpat_set): ditto. Modified files: trunk/ChangeLog trunk/encoding.c trunk/internal.h trunk/string.c Index: encoding.c =================================================================== --- encoding.c (revision 52349) +++ encoding.c (revision 52350) @@ -15,6 +15,12 @@ https://github.com/ruby/ruby/blob/trunk/encoding.c#L15 #include <ctype.h> #include "ruby/util.h" +#include <assert.h> +#ifndef ENC_DEBUG +#define ENC_DEBUG 0 +#endif +#define ENC_ASSERT(expr) do { if (ENC_DEBUG) {assert(expr);} } while (0) + #undef rb_ascii8bit_encindex #undef rb_utf8_encindex #undef rb_usascii_encindex @@ -743,6 +749,19 @@ rb_id_encoding(void) https://github.com/ruby/ruby/blob/trunk/encoding.c#L749 return id_encoding; } +static int +enc_get_index_str(VALUE str) +{ + int i = ENCODING_GET_INLINED(str); + if (i == ENCODING_INLINE_MAX) { + VALUE iv; + + iv = rb_ivar_get(str, rb_id_encoding()); + i = NUM2INT(iv); + } + return i; +} + int rb_enc_get_index(VALUE obj) { @@ -758,13 +777,7 @@ rb_enc_get_index(VALUE obj) https://github.com/ruby/ruby/blob/trunk/encoding.c#L777 default: case T_STRING: case T_REGEXP: - i = ENCODING_GET_INLINED(obj); - if (i == ENCODING_INLINE_MAX) { - VALUE iv; - - iv = rb_ivar_get(obj, rb_id_encoding()); - i = NUM2INT(iv); - } + i = enc_get_index_str(obj); break; case T_FILE: tmp = rb_funcallv(obj, rb_intern("internal_encoding"), 0, 0); @@ -842,6 +855,21 @@ 
rb_enc_get(VALUE obj) https://github.com/ruby/ruby/blob/trunk/encoding.c#L855 return rb_enc_from_index(rb_enc_get_index(obj)); } +static rb_encoding* enc_compatible_str(VALUE str1, VALUE str2); + +rb_encoding* +rb_enc_check_str(VALUE str1, VALUE str2) +{ + rb_encoding *enc = enc_compatible_str(str1, str2); + ENC_ASSERT(TYPE(str1) == T_STRING); + ENC_ASSERT(TYPE(str2) == T_STRING); + if (!enc) + rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", + rb_enc_name(rb_enc_get(str1)), + rb_enc_name(rb_enc_get(str2))); + return enc; +} + rb_encoding* rb_enc_check(VALUE str1, VALUE str2) { @@ -853,40 +881,28 @@ rb_enc_check(VALUE str1, VALUE str2) https://github.com/ruby/ruby/blob/trunk/encoding.c#L881 return enc; } -rb_encoding* -rb_enc_compatible(VALUE str1, VALUE str2) +static rb_encoding* +enc_compatible_latter(VALUE str1, VALUE str2, int idx1, int idx2) { - int idx1, idx2; - rb_encoding *enc1, *enc2; int isstr1, isstr2; - - idx1 = rb_enc_get_index(str1); - idx2 = rb_enc_get_index(str2); - - if (idx1 < 0 || idx2 < 0) - return 0; - - if (idx1 == idx2) { - return rb_enc_from_index(idx1); - } - enc1 = rb_enc_from_index(idx1); - enc2 = rb_enc_from_index(idx2); + rb_encoding *enc1 = rb_enc_from_index(idx1); + rb_encoding *enc2 = rb_enc_from_index(idx2); isstr2 = RB_TYPE_P(str2, T_STRING); if (isstr2 && RSTRING_LEN(str2) == 0) - return enc1; + return enc1; isstr1 = RB_TYPE_P(str1, T_STRING); if (isstr1 && RSTRING_LEN(str1) == 0) - return (rb_enc_asciicompat(enc1) && rb_enc_str_asciionly_p(str2)) ? enc1 : enc2; + return (rb_enc_asciicompat(enc1) && rb_enc_str_asciionly_p(str2)) ? 
enc1 : enc2; if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) { return 0; } /* objects whose encoding is the same of contents */ if (!isstr2 && idx2 == ENCINDEX_US_ASCII) - return enc1; + return enc1; if (!isstr1 && idx1 == ENCINDEX_US_ASCII) - return enc2; + return enc2; if (!isstr1) { VALUE tmp = str1; @@ -915,11 +931,44 @@ rb_enc_compatible(VALUE str1, VALUE str2 https://github.com/ruby/ruby/blob/trunk/encoding.c#L931 } } if (cr1 == ENC_CODERANGE_7BIT) - return enc2; + return enc2; } return 0; } +static rb_encoding* +enc_compatible_str(VALUE str1, VALUE str2) +{ + int idx1 = enc_get_index_str(str1); + int idx2 = enc_get_index_str(str2); + + if (idx1 < 0 || idx2 < 0) + return 0; + + if (idx1 == idx2) { + return rb_enc_from_index(idx1); + } + else { + return enc_compatible_latter(str1, str2, idx1, idx2); + } +} + +rb_encoding* +rb_enc_compatible(VALUE str1, VALUE str2) +{ + int idx1 = rb_enc_get_index(str1); + int idx2 = rb_enc_get_index(str2); + + if (idx1 < 0 || idx2 < 0) + return 0; + + if (idx1 == idx2) { + return rb_enc_from_index(idx1); + } + + return enc_compatible_latter(str1, str2, idx1, idx2); +} + void rb_enc_copy(VALUE obj1, VALUE obj2) { Index: ChangeLog =================================================================== --- ChangeLog (revision 52349) +++ ChangeLog (revision 52350) @@ -1,3 +1,20 @@ https://github.com/ruby/ruby/blob/trunk/ChangeLog#L1 +Thu Oct 29 18:05:22 2015 Koichi Sasada <ko1@a...> + + * encoding.c (rb_enc_check_str): add for performance. + This function only accept T_STRING (and T_REGEXP). + + This patch improves performance of a tiny_segmenter benchmark + (num=2) 2.54sec -> 2.42sec on my machine. + https://github.com/chezou/TinySegmenter.jl/blob/master/benchmark/benchmark.rb + + * encoding.c: add ENC_DEBUG and ENC_ASSERT() macros. + + * internal.h: add a decl. of rb_enc_check_str(). + + * string.c (rb_str_plus): use rb_enc_check_str(). + + * string.c (rb_str_subpat_set): ditto. 
+ Thu Oct 29 17:16:40 2015 Koichi Sasada <ko1@a...> * internal.h: export rb_wb_(un)protected_newobj_of() Index: string.c =================================================================== --- string.c (revision 52349) +++ string.c (revision 52350) @@ -1598,7 +1598,7 @@ rb_str_plus(VALUE str1, VALUE str2) https://github.com/ruby/ruby/blob/trunk/string.c#L1598 long len1, len2; StringValue(str2); - enc = rb_enc_check(str1, str2); + enc = rb_enc_check_str(str1, str2); RSTRING_GETMEM(str1, ptr1, len1); RSTRING_GETMEM(str2, ptr2, len2); str3 = rb_str_new(0, len1+len2); @@ -4072,7 +4072,7 @@ rb_str_subpat_set(VALUE str, VALUE re, V https://github.com/ruby/ruby/blob/trunk/string.c#L4072 end = END(nth); len = end - start; StringValue(val); - enc = rb_enc_check(str, val); + enc = rb_enc_check_str(str, val); rb_str_splice_0(str, start, len, val); rb_enc_associate(str, enc); } Index: internal.h =================================================================== --- internal.h (revision 52349) +++ internal.h (revision 52350) @@ -716,6 +716,7 @@ void Init_ext(void); https://github.com/ruby/ruby/blob/trunk/internal.h#L716 ID rb_id_encoding(void); void rb_gc_mark_encodings(void); rb_encoding *rb_enc_get_from_index(int index); +rb_encoding *rb_enc_check_str(VALUE str1, VALUE str2); int rb_encdb_replicate(const char *alias, const char *orig); int rb_encdb_alias(const char *alias, const char *orig); int rb_encdb_dummy(const char *name); -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml/