ruby-changes:3849
From: ko1@a...
Date: Wed, 30 Jan 2008 12:50:10 +0900 (JST)
Subject: [ruby-changes:3849] akr - Ruby:r15338 (trunk): * enc/utf_16be.c (UTF16_IS_SURROGATE_FIRST): avoid branch.
akr 2008-01-30 12:49:54 +0900 (Wed, 30 Jan 2008) New Revision: 15338 Modified files: trunk/ChangeLog trunk/enc/utf_16be.c trunk/enc/utf_16le.c trunk/test/ruby/test_utf16.rb Log: * enc/utf_16be.c (UTF16_IS_SURROGATE_FIRST): avoid branch. (UTF16_IS_SURROGATE_SECOND): ditto. (UTF16_IS_SURROGATE): defined. (utf16be_mbc_enc_len): validation implemented. * enc/utf_16le.c (UTF16_IS_SURROGATE_FIRST): avoid branch. (UTF16_IS_SURROGATE_SECOND): ditto. (UTF16_IS_SURROGATE): defined. (utf16le_mbc_enc_len): validation implemented. http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/ChangeLog?r1=15338&r2=15337&diff_format=u http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/test/ruby/test_utf16.rb?r1=15338&r2=15337&diff_format=u http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/enc/utf_16le.c?r1=15338&r2=15337&diff_format=u http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/enc/utf_16be.c?r1=15338&r2=15337&diff_format=u Index: ChangeLog =================================================================== --- ChangeLog (revision 15337) +++ ChangeLog (revision 15338) @@ -1,3 +1,15 @@ +Wed Jan 30 12:26:59 2008 Tanaka Akira <akr@f...> + + * enc/utf_16be.c (UTF16_IS_SURROGATE_FIRST): avoid branch. + (UTF16_IS_SURROGATE_SECOND): ditto. + (UTF16_IS_SURROGATE): defined. + (utf16be_mbc_enc_len): validation implemented. + + * enc/utf_16le.c (UTF16_IS_SURROGATE_FIRST): avoid branch. + (UTF16_IS_SURROGATE_SECOND): ditto. + (UTF16_IS_SURROGATE): defined. + (utf16le_mbc_enc_len): validation implemented. + Wed Jan 30 12:06:43 2008 Tadayoshi Funaba <tadf@d...> * bignum.c (rb_cstr_to_inum): '0_2' is a valid representatin. Index: enc/utf_16be.c =================================================================== --- enc/utf_16be.c (revision 15337) +++ enc/utf_16be.c (revision 15338) @@ -29,8 +29,9 @@ #include "regenc.h" -#define UTF16_IS_SURROGATE_FIRST(c) (c >= 0xd8 && c <= 0xdb) -#define UTF16_IS_SURROGATE_SECOND(c) (c >= 0xdc && c <= 0xdf) +#define UTF16_IS_SURROGATE_FIRST(c) (((c) & 0xfc) == 0xd8) +#define UTF16_IS_SURROGATE_SECOND(c) (((c) & 0xfc) == 0xdc) +#define UTF16_IS_SURROGATE(c) (((c) & 0xf8) == 0xd8) static const int EncLen_UTF16[] = { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, @@ -55,7 +56,28 @@ utf16be_mbc_enc_len(const UChar* p, const OnigUChar* e ARG_UNUSED, OnigEncoding enc ARG_UNUSED) { - return EncLen_UTF16[*p]; + int byte = p[0]; + if (!UTF16_IS_SURROGATE(byte)) { + if (2 <= e-p) + return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2); + else + return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1); + } + if (UTF16_IS_SURROGATE_FIRST(byte)) { + switch (e-p) { + case 1: return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(3); + case 2: return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(2); + case 3: + if (UTF16_IS_SURROGATE_SECOND(p[2])) + return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1); + break; + default: + if (UTF16_IS_SURROGATE_SECOND(p[2])) + return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4); + break; + } + } + return ONIGENC_CONSTRUCT_MBCLEN_INVALID(); } static int Index: enc/utf_16le.c =================================================================== --- enc/utf_16le.c (revision 15337) +++ enc/utf_16le.c (revision 15338) @@ -29,8 +29,9 @@ #include "regenc.h" -#define UTF16_IS_SURROGATE_FIRST(c) (c >= 0xd8 && c <= 0xdb) -#define UTF16_IS_SURROGATE_SECOND(c) (c >= 0xdc && c <= 0xdf) +#define UTF16_IS_SURROGATE_FIRST(c) (((c) & 0xfc) == 0xd8) +#define UTF16_IS_SURROGATE_SECOND(c) (((c) & 0xfc) == 0xdc) +#define UTF16_IS_SURROGATE(c) (((c) & 0xf8) == 0xd8) static const int EncLen_UTF16[] = { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, @@ -52,10 +53,23 @@ }; static int -utf16le_mbc_enc_len(const UChar* p, const OnigUChar* e ARG_UNUSED, +utf16le_mbc_enc_len(const UChar* p, const OnigUChar* e, OnigEncoding enc ARG_UNUSED) { - return EncLen_UTF16[*(p+1)]; + int len = e-p, byte; + if (len < 2) + return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1); + byte = p[1]; + if (!UTF16_IS_SURROGATE(byte)) { + return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2); + } + if (UTF16_IS_SURROGATE_FIRST(byte)) { + if (len < 4) + return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-len); + if (UTF16_IS_SURROGATE_SECOND(p[3])) + return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4); + } + return ONIGENC_CONSTRUCT_MBCLEN_INVALID(); } static int Index: test/ruby/test_utf16.rb =================================================================== --- test/ruby/test_utf16.rb (revision 15337) +++ test/ruby/test_utf16.rb (revision 15338) @@ -48,10 +48,61 @@ # tests start def test_utf16be_valid_encoding - s = "\xd8\x00\xd8\x00".force_encoding("utf-16be") - assert_equal(false, s.valid_encoding?, "#{encdump s}.valid_encoding?") + [ + "\x00\x00", + "\xd7\xff", + "\xd8\x00\xdc\x00", + "\xdb\xff\xdf\xff", + "\xe0\x00", + "\xff\xff", + ].each {|s| + s.force_encoding("utf-16be") + assert_equal(true, s.valid_encoding?, "#{encdump s}.valid_encoding?") + } + [ + "\x00", + "\xd7", + "\xd8\x00", + "\xd8\x00\xd8\x00", + "\xdc\x00", + "\xdc\x00\xd8\x00", + "\xdc\x00\xdc\x00", + "\xe0", + "\xff", + ].each {|s| + s.force_encoding("utf-16be") + assert_equal(false, s.valid_encoding?, "#{encdump s}.valid_encoding?") + } end + def test_utf16le_valid_encoding + [ + "\x00\x00", + "\xff\xd7", + "\x00\xd8\x00\xdc", + "\xff\xdb\xff\xdf", + "\x00\xe0", + "\xff\xff", + ].each {|s| + s.force_encoding("utf-16le") + assert_equal(true, s.valid_encoding?, "#{encdump s}.valid_encoding?") + } + [ + "\x00", + "\xd7", + "\x00\xd8", + "\x00\xd8\x00\xd8", + "\x00\xdc", + "\x00\xdc\x00\xd8", + "\x00\xdc\x00\xdc", + "\xe0", + "\xff", + ].each {|s| + s.force_encoding("utf-16le") + assert_equal(false, s.valid_encoding?, "#{encdump s}.valid_encoding?") + } + end + def test_strftime s = "aa".force_encoding("utf-16be") assert_raise(ArgumentError, "Time.now.strftime(#{encdump s})") { Time.now.strftime(s) } -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml/