ruby-changes:3849
From: ko1@a...
Date: Wed, 30 Jan 2008 12:50:10 +0900 (JST)
Subject: [ruby-changes:3849] akr - Ruby:r15338 (trunk): * enc/utf_16be.c (UTF16_IS_SURROGATE_FIRST): avoid branch.
akr 2008-01-30 12:49:54 +0900 (Wed, 30 Jan 2008)
New Revision: 15338
Modified files:
trunk/ChangeLog
trunk/enc/utf_16be.c
trunk/enc/utf_16le.c
trunk/test/ruby/test_utf16.rb
Log:
* enc/utf_16be.c (UTF16_IS_SURROGATE_FIRST): avoid branch.
(UTF16_IS_SURROGATE_SECOND): ditto.
(UTF16_IS_SURROGATE): defined.
(utf16be_mbc_enc_len): validation implemented.
* enc/utf_16le.c (UTF16_IS_SURROGATE_FIRST): avoid branch.
(UTF16_IS_SURROGATE_SECOND): ditto.
(UTF16_IS_SURROGATE): defined.
(utf16le_mbc_enc_len): validation implemented.
http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/ChangeLog?r1=15338&r2=15337&diff_format=u
http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/test/ruby/test_utf16.rb?r1=15338&r2=15337&diff_format=u
http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/enc/utf_16le.c?r1=15338&r2=15337&diff_format=u
http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/enc/utf_16be.c?r1=15338&r2=15337&diff_format=u
Index: ChangeLog
===================================================================
--- ChangeLog (revision 15337)
+++ ChangeLog (revision 15338)
@@ -1,3 +1,15 @@
+Wed Jan 30 12:26:59 2008 Tanaka Akira <akr@f...>
+
+ * enc/utf_16be.c (UTF16_IS_SURROGATE_FIRST): avoid branch.
+ (UTF16_IS_SURROGATE_SECOND): ditto.
+ (UTF16_IS_SURROGATE): defined.
+ (utf16be_mbc_enc_len): validation implemented.
+
+ * enc/utf_16le.c (UTF16_IS_SURROGATE_FIRST): avoid branch.
+ (UTF16_IS_SURROGATE_SECOND): ditto.
+ (UTF16_IS_SURROGATE): defined.
+ (utf16le_mbc_enc_len): validation implemented.
+
Wed Jan 30 12:06:43 2008 Tadayoshi Funaba <tadf@d...>
* bignum.c (rb_cstr_to_inum): '0_2' is a valid representation.
Index: enc/utf_16be.c
===================================================================
--- enc/utf_16be.c (revision 15337)
+++ enc/utf_16be.c (revision 15338)
@@ -29,8 +29,9 @@
#include "regenc.h"
-#define UTF16_IS_SURROGATE_FIRST(c) (c >= 0xd8 && c <= 0xdb)
-#define UTF16_IS_SURROGATE_SECOND(c) (c >= 0xdc && c <= 0xdf)
+#define UTF16_IS_SURROGATE_FIRST(c) (((c) & 0xfc) == 0xd8)
+#define UTF16_IS_SURROGATE_SECOND(c) (((c) & 0xfc) == 0xdc)
+#define UTF16_IS_SURROGATE(c) (((c) & 0xf8) == 0xd8)
static const int EncLen_UTF16[] = {
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@@ -55,7 +56,28 @@
utf16be_mbc_enc_len(const UChar* p, const OnigUChar* e ARG_UNUSED,
OnigEncoding enc ARG_UNUSED)
{
- return EncLen_UTF16[*p];
+ int byte = p[0];
+ if (!UTF16_IS_SURROGATE(byte)) {
+ if (2 <= e-p)
+ return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2);
+ else
+ return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
+ }
+ if (UTF16_IS_SURROGATE_FIRST(byte)) {
+ switch (e-p) {
+ case 1: return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(3);
+ case 2: return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(2);
+ case 3:
+ if (UTF16_IS_SURROGATE_SECOND(p[2]))
+ return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
+ break;
+ default:
+ if (UTF16_IS_SURROGATE_SECOND(p[2]))
+ return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4);
+ break;
+ }
+ }
+ return ONIGENC_CONSTRUCT_MBCLEN_INVALID();
}
static int
Index: enc/utf_16le.c
===================================================================
--- enc/utf_16le.c (revision 15337)
+++ enc/utf_16le.c (revision 15338)
@@ -29,8 +29,9 @@
#include "regenc.h"
-#define UTF16_IS_SURROGATE_FIRST(c) (c >= 0xd8 && c <= 0xdb)
-#define UTF16_IS_SURROGATE_SECOND(c) (c >= 0xdc && c <= 0xdf)
+#define UTF16_IS_SURROGATE_FIRST(c) (((c) & 0xfc) == 0xd8)
+#define UTF16_IS_SURROGATE_SECOND(c) (((c) & 0xfc) == 0xdc)
+#define UTF16_IS_SURROGATE(c) (((c) & 0xf8) == 0xd8)
static const int EncLen_UTF16[] = {
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@@ -52,10 +53,23 @@
};
static int
-utf16le_mbc_enc_len(const UChar* p, const OnigUChar* e ARG_UNUSED,
+utf16le_mbc_enc_len(const UChar* p, const OnigUChar* e,
OnigEncoding enc ARG_UNUSED)
{
- return EncLen_UTF16[*(p+1)];
+ int len = e-p, byte;
+ if (len < 2)
+ return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
+ byte = p[1];
+ if (!UTF16_IS_SURROGATE(byte)) {
+ return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2);
+ }
+ if (UTF16_IS_SURROGATE_FIRST(byte)) {
+ if (len < 4)
+ return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-len);
+ if (UTF16_IS_SURROGATE_SECOND(p[3]))
+ return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4);
+ }
+ return ONIGENC_CONSTRUCT_MBCLEN_INVALID();
}
static int
Index: test/ruby/test_utf16.rb
===================================================================
--- test/ruby/test_utf16.rb (revision 15337)
+++ test/ruby/test_utf16.rb (revision 15338)
@@ -48,10 +48,61 @@
# tests start
def test_utf16be_valid_encoding
- s = "\xd8\x00\xd8\x00".force_encoding("utf-16be")
- assert_equal(false, s.valid_encoding?, "#{encdump s}.valid_encoding?")
+ [
+ "\x00\x00",
+ "\xd7\xff",
+ "\xd8\x00\xdc\x00",
+ "\xdb\xff\xdf\xff",
+ "\xe0\x00",
+ "\xff\xff",
+ ].each {|s|
+ s.force_encoding("utf-16be")
+ assert_equal(true, s.valid_encoding?, "#{encdump s}.valid_encoding?")
+ }
+ [
+ "\x00",
+ "\xd7",
+ "\xd8\x00",
+ "\xd8\x00\xd8\x00",
+ "\xdc\x00",
+ "\xdc\x00\xd8\x00",
+ "\xdc\x00\xdc\x00",
+ "\xe0",
+ "\xff",
+ ].each {|s|
+ s.force_encoding("utf-16be")
+ assert_equal(false, s.valid_encoding?, "#{encdump s}.valid_encoding?")
+ }
end
+ def test_utf16le_valid_encoding
+ [
+ "\x00\x00",
+ "\xff\xd7",
+ "\x00\xd8\x00\xdc",
+ "\xff\xdb\xff\xdf",
+ "\x00\xe0",
+ "\xff\xff",
+ ].each {|s|
+ s.force_encoding("utf-16le")
+ assert_equal(true, s.valid_encoding?, "#{encdump s}.valid_encoding?")
+ }
+ [
+ "\x00",
+ "\xd7",
+ "\x00\xd8",
+ "\x00\xd8\x00\xd8",
+ "\x00\xdc",
+ "\x00\xdc\x00\xd8",
+ "\x00\xdc\x00\xdc",
+ "\xe0",
+ "\xff",
+ ].each {|s|
+ s.force_encoding("utf-16le")
+ assert_equal(false, s.valid_encoding?, "#{encdump s}.valid_encoding?")
+ }
+ end
+
def test_strftime
s = "aa".force_encoding("utf-16be")
assert_raise(ArgumentError, "Time.now.strftime(#{encdump s})") { Time.now.strftime(s) }
--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/