ruby-changes:2522
From: ko1@a...
Date: 25 Nov 2007 22:26:56 +0900
Subject: [ruby-changes:2522] akr - Ruby:r14013 (trunk): * include/ruby/encoding.h (rb_enc_str_asciionly_p): declared.
akr 2007-11-25 22:25:34 +0900 (Sun, 25 Nov 2007) New Revision: 14013 Modified files: trunk/ChangeLog trunk/include/ruby/encoding.h trunk/re.c trunk/string.c trunk/test/ruby/test_m17n.rb Log: * include/ruby/encoding.h (rb_enc_str_asciionly_p): declared. (rb_enc_str_asciicompat_p): defined. * re.c (rb_reg_initialize_str): use rb_enc_str_asciionly_p. (rb_reg_quote): return ascii-8bit string if the argument is ascii-only to generate encoding generic regexp if possible. (rb_reg_s_union): fix encoding handling. [ruby-dev:32094] * string.c (rb_enc_str_asciionly_p): defined. http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/string.c?r1=14013&r2=14012 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/ChangeLog?r1=14013&r2=14012 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/include/ruby/encoding.h?r1=14013&r2=14012 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/re.c?r1=14013&r2=14012 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/test/ruby/test_m17n.rb?r1=14013&r2=14012 Index: include/ruby/encoding.h =================================================================== --- include/ruby/encoding.h (revision 14012) +++ include/ruby/encoding.h (revision 14013) @@ -100,6 +100,8 @@ ID rb_intern3(const char*, long, rb_encoding*); int rb_enc_symname_p(const char*, rb_encoding*); int rb_enc_str_coderange(VALUE); +int rb_enc_str_asciionly_p(VALUE); +#define rb_enc_str_asciicompat_p(str) rb_enc_asciicompat(rb_enc_get(str)) VALUE rb_enc_from_encoding(rb_encoding *enc); rb_encoding *rb_enc_primary(void); rb_encoding *rb_enc_default(void); Index: re.c =================================================================== --- re.c (revision 14012) +++ re.c (revision 14013) @@ -1268,7 +1268,7 @@ static int rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err) { - if (rb_enc_str_coderange(str) != ENC_CODERANGE_SINGLE) { + if (!rb_enc_str_asciionly_p(str)) { options |= ARG_ENCODING_FIXED; } return rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), rb_enc_get(str), @@ -1654,6 +1654,7 @@ char *s, *send, *t; VALUE tmp; int c; + int ascii_only = rb_enc_str_asciionly_p(str); s = RSTRING_PTR(str); send = s + RSTRING_LEN(str); @@ -1677,11 +1678,17 @@ goto meta_found; } } + if (ascii_only && rb_enc_get_index(str) != 0) { + str = rb_str_new3(str); + rb_enc_associate(str, rb_enc_from_index(0)); + } return str; meta_found: tmp = rb_str_new(0, RSTRING_LEN(str)*2); - rb_enc_copy(tmp, str); + if (!ascii_only) { + rb_enc_copy(tmp, str); + } t = RSTRING_PTR(tmp); /* copy upto metacharacter */ memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str)); @@ -1802,43 +1809,115 @@ return rb_class_new_instance(1, args, rb_cRegexp); } else if (argc == 1) { - VALUE v; - v = rb_check_regexp_type(rb_ary_entry(args0, 0)); - if (!NIL_P(v)) - return v; + VALUE arg = rb_ary_entry(args0, 0); + VALUE re = rb_check_regexp_type(arg); + if (!NIL_P(re)) + return re; else { - VALUE args[1]; - args[0] = rb_reg_s_quote(Qnil, RARRAY_PTR(args0)[0]); - return rb_class_new_instance(1, args, rb_cRegexp); + VALUE quoted; + quoted = rb_reg_s_quote(Qnil, arg); + return rb_reg_new(quoted, 0); } } else { int i; VALUE source = rb_str_buf_new(0); - int mbs = Qfalse; - rb_encoding *enc = 0; + rb_encoding *enc; + int has_asciionly_string = 0; + rb_encoding *has_ascii_compat_string = 0; + rb_encoding *has_ascii_incompat_string = 0; + + int has_generic_regexp = 0; + rb_encoding *has_ascii_compat_fixed_regexp = 0; + rb_encoding *has_ascii_incompat_regexp = 0; + for (i = 0; i < argc; i++) { volatile VALUE v; VALUE e = rb_ary_entry(args0, i); + if (0 < i) - rb_str_buf_cat2(source, "|"); + rb_str_buf_cat2(source, "|"); /* xxx: UTF-16 */ + v = rb_check_regexp_type(e); if (!NIL_P(v)) { + rb_encoding *enc0 = rb_enc_get(v); + if (!rb_enc_asciicompat(enc0)) { + if (!has_ascii_incompat_regexp) { + has_ascii_incompat_regexp = enc0; + } + else { + if (has_ascii_incompat_regexp != enc0) + rb_raise(rb_eArgError, "regexp encodings differ"); + } + } + else if (ENCODING_GET(v) != 0 || FL_TEST(v, KCODE_FIXED)) { + if (!has_ascii_compat_fixed_regexp) { + has_ascii_compat_fixed_regexp = enc0; + } + else { + if (has_ascii_compat_fixed_regexp != enc0) + rb_raise(rb_eArgError, "regexp encodings differ"); + } + } + else { + has_generic_regexp = 1; + } v = rb_reg_to_s(v); } else { + StringValue(e); + if (!rb_enc_str_asciicompat_p(e)) { + rb_encoding *enc0 = rb_enc_get(e); + if (!has_ascii_incompat_string) { + has_ascii_incompat_string = enc0; + } + else { + if (has_ascii_incompat_string != enc0) + rb_raise(rb_eArgError, "regexp encodings differ"); + } + } + else if (rb_enc_str_asciionly_p(e)) { + has_asciionly_string = 1; + } + else { + rb_encoding *enc0 = rb_enc_get(e); + if (!has_ascii_compat_string) { + has_ascii_compat_string = enc0; + } + else { + if (has_ascii_compat_string != enc0) + rb_raise(rb_eArgError, "regexp encodings differ"); + } + } v = rb_reg_s_quote(Qnil, e); } - if (mbs || rb_enc_str_coderange(v) != ENC_CODERANGE_SINGLE) { - if (!enc) enc = rb_enc_get(v); - else if (mbs && enc != rb_enc_get(v)) { - rb_raise(rb_eArgError, "regexp encodings differ"); - } - mbs = Qtrue; - } rb_str_append(source, v); } + if (has_ascii_incompat_string || has_ascii_incompat_regexp) { + if (has_asciionly_string || has_ascii_compat_string || + has_generic_regexp || has_ascii_compat_fixed_regexp) + rb_raise(rb_eArgError, "regexp encodings differ"); + if (has_ascii_incompat_string && has_ascii_incompat_regexp && + has_ascii_incompat_string != has_ascii_incompat_regexp) + rb_raise(rb_eArgError, "regexp encodings differ"); + enc = has_ascii_incompat_string; + if (enc == 0) + enc = has_ascii_incompat_regexp; + } + else if (has_ascii_compat_string || has_ascii_compat_fixed_regexp) { + if (has_ascii_compat_string && has_ascii_compat_fixed_regexp && + has_ascii_compat_string != has_ascii_compat_fixed_regexp) + rb_raise(rb_eArgError, "regexp encodings differ"); + enc = has_ascii_compat_string; + if (enc == 0) + enc = has_ascii_compat_fixed_regexp; + } + else { + enc = rb_enc_from_index(0); + } + + rb_enc_associate(source, enc); return rb_class_new_instance(1, &source, rb_cRegexp); } } Index: ChangeLog =================================================================== --- ChangeLog (revision 14012) +++ ChangeLog (revision 14013) @@ -1,3 +1,15 @@ +Sun Nov 25 22:21:35 2007 Tanaka Akira <akr@f...> + + * include/ruby/encoding.h (rb_enc_str_asciionly_p): declared. + (rb_enc_str_asciicompat_p): defined. + + * re.c (rb_reg_initialize_str): use rb_enc_str_asciionly_p. + (rb_reg_quote): return ascii-8bit string if the argument is + ascii-only to generate encoding generic regexp if possible. + (rb_reg_s_union): fix encoding handling. [ruby-dev:32094] + + * string.c (rb_enc_str_asciionly_p): defined. + Sun Nov 25 12:12:03 2007 Eric Hodel <drbrain@s...> * gem_prelude.rb: Import fast-loading gem_prelude.rb from RubyGems. Index: string.c =================================================================== --- string.c (revision 14012) +++ string.c (revision 14013) @@ -129,6 +129,23 @@ return cr; } +int rb_enc_str_asciionly_p(VALUE str) +{ + rb_encoding *enc = rb_enc_get(str); + + if (rb_enc_asciicompat(enc) && + rb_enc_str_coderange(str) == ENC_CODERANGE_SINGLE) { + char *ptr = RSTRING_PTR(str); + long len = RSTRING_LEN(str); + long i; + for (i = 0; i < len; i++) + if (ptr[i] & 0x80) + return Qfalse; + return Qtrue; + } + return Qfalse; +} + static inline void str_mod_check(VALUE s, char *p, long len) { Index: test/ruby/test_m17n.rb =================================================================== --- test/ruby/test_m17n.rb (revision 14012) +++ test/ruby/test_m17n.rb (revision 14013) @@ -46,31 +46,72 @@ #assert_raise(SyntaxError) { eval('/\xc0\x20/u') } end + def assert_regexp_generic_encoding(r) + %w[ASCII-8BIT EUC-JP Shift_JIS UTF-8].each {|ename| + # "\xc0\xa1" is a valid sequence for ASCII-8BIT, EUC-JP, Shift_JIS and UTF-8. + assert_nothing_raised { r =~ "\xc0\xa1".force_encoding(ename) } + } + end + + def assert_regexp_fixed_encoding(r) + %w[ASCII-8BIT EUC-JP Shift_JIS UTF-8].each {|ename| + enc = Encoding.find(ename) + if enc == r.encoding + assert_nothing_raised { r =~ "\xc0\xa1".force_encoding(enc) } + else + assert_raise(ArgumentError) { r =~ "\xc0\xa1".force_encoding(enc) } + end + } + end + + def assert_regexp_generic_ascii(r) + assert_encoding("ASCII-8BIT", r.encoding) + assert_regexp_generic_encoding(r) + end + + def assert_regexp_fixed_ascii8bit(r) + assert_encoding("ASCII-8BIT", r.encoding) + assert_regexp_fixed_encoding(r) + end + + def assert_regexp_fixed_eucjp(r) + assert_encoding("EUC-JP", r.encoding) + assert_regexp_fixed_encoding(r) + end + + def assert_regexp_fixed_sjis(r) + assert_encoding("Shift_JIS", r.encoding) + assert_regexp_fixed_encoding(r) + end + + def assert_regexp_fixed_utf8(r) + assert_encoding("UTF-8", r.encoding) + assert_regexp_fixed_encoding(r) + end + def test_regexp_generic r = /a/ - assert_encoding("ASCII-8BIT", r.encoding) + assert_regexp_generic_ascii(r) assert_equal(0, r =~ a("a")) assert_equal(0, r =~ e("a")) assert_equal(0, r =~ s("a")) assert_equal(0, r =~ u("a")) + assert_equal(nil, r =~ a("\xc0\xa1")) + assert_equal(nil, r =~ e("\xc0\xa1")) + assert_equal(nil, r =~ s("\xc0\xa1")) + assert_equal(nil, r =~ u("\xc0\xa1")) - # "\xc0\xa1" is a valid sequence for ASCII-8BIT, EUC-JP, Shift_JIS and UTF-8. + r = Regexp.new("a".force_encoding("ASCII-8BIT")) + assert_regexp_generic_ascii(r) + assert_equal(0, r =~ a("a")) + assert_equal(0, r =~ e("a")) + assert_equal(0, r =~ s("a")) + assert_equal(0, r =~ u("a")) assert_equal(nil, r =~ a("\xc0\xa1")) assert_equal(nil, r =~ e("\xc0\xa1")) assert_equal(nil, r =~ s("\xc0\xa1")) assert_equal(nil, r =~ u("\xc0\xa1")) - r = eval(a(%{/\xc0\xa1/})) - assert_encoding("ASCII-8BIT", r.encoding) - assert_equal(nil, r =~ a("a")) - assert_equal(nil, r =~ e("a")) - assert_equal(nil, r =~ s("a")) - assert_equal(nil, r =~ u("a")) - assert_equal(0, r =~ a("\xc0\xa1")) - assert_raise(ArgumentError) { r =~ e("\xc0\xa1") } - assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } - assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } - # xxx: /\xc0\xa1/ should be restricted only for ASCII-8BIT? # r = /\xc0\xa1/ # assert_encoding("ASCII-8BIT", r.encoding) @@ -86,7 +127,7 @@ def test_regexp_ascii r = /a/n - assert_encoding("ASCII-8BIT", r.encoding) + assert_regexp_fixed_ascii8bit(r) assert_equal(0, r =~ a("a")) assert_equal(0, r =~ e("a")) assert_equal(0, r =~ s("a")) @@ -97,7 +138,7 @@ assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } r = /\xc0\xa1/n - assert_encoding("ASCII-8BIT", r.encoding) + assert_regexp_fixed_ascii8bit(r) assert_equal(nil, r =~ a("a")) assert_equal(nil, r =~ e("a")) assert_equal(nil, r =~ s("a")) @@ -107,8 +148,19 @@ assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + r = eval(a(%{/\xc0\xa1/})) + assert_regexp_fixed_ascii8bit(r) + assert_equal(nil, r =~ a("a")) + assert_equal(nil, r =~ e("a")) + assert_equal(nil, r =~ s("a")) + assert_equal(nil, r =~ u("a")) + assert_equal(0, r =~ a("\xc0\xa1")) + assert_raise(ArgumentError) { r =~ e("\xc0\xa1") } + assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } + assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + r = eval(%{/\xc0\xa1/n}.force_encoding("ASCII-8BIT")) - assert_encoding("ASCII-8BIT", r.encoding) + assert_regexp_fixed_ascii8bit(r) assert_equal(nil, r =~ a("a")) assert_equal(nil, r =~ e("a")) assert_equal(nil, r =~ s("a")) @@ -119,7 +171,9 @@ assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } r = eval(%q{/\xc0\xa1/}.force_encoding("ASCII-8BIT")) + # assert_regexp_fixed_ascii8bit(r) assert_encoding("ASCII-8BIT", r.encoding) + # assert_regexp_fixed_encoding(r) assert_equal(nil, r =~ a("a")) assert_equal(nil, r =~ e("a")) assert_equal(nil, r =~ s("a")) @@ -128,12 +182,11 @@ # assert_raise(ArgumentError) { r =~ e("\xc0\xa1") } # assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } # assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } - end def test_regexp_euc r = /a/e - assert_encoding("EUC-JP", r.encoding) + assert_regexp_fixed_eucjp(r) assert_equal(0, r =~ a("a")) assert_equal(0, r =~ e("a")) assert_equal(0, r =~ s("a")) @@ -143,8 +196,19 @@ assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + r = Regexp.new("a".force_encoding("EUC-JP")) + assert_regexp_fixed_eucjp(r) + assert_equal(0, r =~ a("a")) + assert_equal(0, r =~ e("a")) + assert_equal(0, r =~ s("a")) + assert_equal(0, r =~ u("a")) + assert_raise(ArgumentError) { r =~ a("\xc0\xa1") } + assert_equal(nil, r =~ e("\xc0\xa1")) + assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } + assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + r = /\xc0\xa1/e - assert_encoding("EUC-JP", r.encoding) + assert_regexp_fixed_eucjp(r) assert_equal(nil, r =~ a("a")) assert_equal(nil, r =~ e("a")) assert_equal(nil, r =~ s("a")) @@ -155,7 +219,7 @@ assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } r = eval(%{/\xc0\xa1/}.force_encoding("EUC-JP")) - assert_encoding("EUC-JP", r.encoding) + assert_regexp_fixed_eucjp(r) assert_equal(nil, r =~ a("a")) assert_equal(nil, r =~ e("a")) assert_equal(nil, r =~ s("a")) @@ -166,7 +230,7 @@ assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } r = eval(%q{/\xc0\xa1/}.force_encoding("EUC-JP")) - assert_encoding("EUC-JP", r.encoding) + assert_regexp_fixed_eucjp(r) assert_equal(nil, r =~ a("a")) assert_equal(nil, r =~ e("a")) assert_equal(nil, r =~ s("a")) @@ -175,6 +239,7 @@ assert_equal(0, r =~ e("\xc0\xa1")) assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + end def test_begin_end_offset @@ -198,4 +263,107 @@ assert_equal([1,2], $~.offset(0)) end + def test_quote + assert_regexp_generic_ascii(/#{Regexp.quote(a("a"))}#{Regexp.quote(e("e"))}/) + + # Regexp.quote returns ASCII-8BIT string for ASCII only string + # to make generic regexp if possible. + assert_encoding("ASCII-8BIT", Regexp.quote(a("")).encoding) + assert_encoding("ASCII-8BIT", Regexp.quote(e("")).encoding) + assert_encoding("ASCII-8BIT", Regexp.quote(s("")).encoding) + assert_encoding("ASCII-8BIT", Regexp.quote(u("")).encoding) + assert_encoding("ASCII-8BIT", Regexp.quote(a("a")).encoding) + assert_encoding("ASCII-8BIT", Regexp.quote(e("a")).encoding) + assert_encoding("ASCII-8BIT", Regexp.quote(s("a")).encoding) + assert_encoding("ASCII-8BIT", Regexp.quote(u("a")).encoding) + + assert_encoding("ASCII-8BIT", Regexp.quote(a("\xc0\xa1")).encoding) + assert_encoding("EUC-JP", Regexp.quote(e("\xc0\xa1")).encoding) + assert_encoding("Shift_JIS", Regexp.quote(s("\xc0\xa1")).encoding) + assert_encoding("UTF-8", Regexp.quote(u("\xc0\xa1")).encoding) + end + + def test_union_0 + r = Regexp.union + assert_regexp_generic_ascii(r) + assert(r !~ a("")) + assert(r !~ e("")) + assert(r !~ s("")) + assert(r !~ u("")) + end + + def test_union_1_asciionly_string + assert_regexp_generic_ascii(Regexp.union(a(""))) + assert_regexp_generic_ascii(Regexp.union(e(""))) + assert_regexp_generic_ascii(Regexp.union(s(""))) + assert_regexp_generic_ascii(Regexp.union(u(""))) + assert_regexp_generic_ascii(Regexp.union(a("a"))) + assert_regexp_generic_ascii(Regexp.union(e("a"))) + assert_regexp_generic_ascii(Regexp.union(s("a"))) + assert_regexp_generic_ascii(Regexp.union(u("a"))) + assert_regexp_generic_ascii(Regexp.union(a("\t"))) + assert_regexp_generic_ascii(Regexp.union(e("\t"))) + assert_regexp_generic_ascii(Regexp.union(s("\t"))) + assert_regexp_generic_ascii(Regexp.union(u("\t"))) + end + + def test_union_1_nonascii_string + assert_regexp_fixed_ascii8bit(Regexp.union(a("\xc0\xa1"))) + assert_regexp_fixed_eucjp(Regexp.union(e("\xc0\xa1"))) + assert_regexp_fixed_sjis(Regexp.union(s("\xc0\xa1"))) + assert_regexp_fixed_utf8(Regexp.union(u("\xc0\xa1"))) + end + + def test_union_1_regexp + assert_regexp_generic_ascii(Regexp.union(//)) + assert_regexp_fixed_ascii8bit(Regexp.union(//n)) + assert_regexp_fixed_eucjp(Regexp.union(//e)) + assert_regexp_fixed_sjis(Regexp.union(//s)) + assert_regexp_fixed_utf8(Regexp.union(//u)) + end + + def test_union_2_asciionly_strings + ary = [a(""), e(""), s(""), u("")] + ary.each {|s1| + ary.each {|s2| + assert_regexp_generic_ascii(Regexp.union(s1, s2)) + } + } + end + + def test_union_2_strings + ary = [ + a(""), e(""), s(""), u(""), + a("\xc0\xa1"), e("\xc0\xa1"), s("\xc0\xa1"), u("\xc0\xa1") + ] + ary.each {|s1| + ary.each {|s2| + if s1.empty? + if s2.empty? + assert_regexp_generic_ascii(Regexp.union(s1, s2)) + else + r = Regexp.union(s1, s2) + assert_regexp_fixed_encoding(r) + assert_equal(s2.encoding, r.encoding) + end + else + if s2.empty? + r = Regexp.union(s1, s2) + assert_regexp_fixed_encoding(r) + assert_equal(s1.encoding, r.encoding) + else + if s1.encoding == s2.encoding + r = Regexp.union(s1, s2) + assert_regexp_fixed_encoding(r) + assert_equal(s1.encoding, r.encoding) + else + assert_raise(ArgumentError) { Regexp.union(s1, s2) } + end + end + end + } + } + end + + end -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml