ruby-changes:45743
From: nobu <ko1@a...>
Date: Thu, 9 Mar 2017 11:04:16 +0900 (JST)
Subject: [ruby-changes:45743] nobu:r57816 (trunk): fix UTF-32 valid_encoding?
nobu 2017-03-09 11:04:10 +0900 (Thu, 09 Mar 2017) New Revision: 57816 https://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=revision&revision=57816 Log: fix UTF-32 valid_encoding? * enc/utf_32be.c (utf32be_mbc_enc_len): check arguments precisely. [ruby-core:79966] [Bug #13292] * enc/utf_32le.c (utf32le_mbc_enc_len): ditto. * regenc.h (UNICODE_VALID_CODEPOINT_P): predicate for valid Unicode codepoints. Modified files: trunk/enc/utf_32be.c trunk/enc/utf_32le.c trunk/regenc.h trunk/test/ruby/enc/test_utf32.rb Index: test/ruby/enc/test_utf32.rb =================================================================== --- test/ruby/enc/test_utf32.rb (revision 57815) +++ test/ruby/enc/test_utf32.rb (revision 57816) @@ -90,5 +90,73 @@ EOT https://github.com/ruby/ruby/blob/trunk/test/ruby/enc/test_utf32.rb#L90 assert_equal(sl, "a".ord.chr("utf-32le")) assert_equal(sb, "a".ord.chr("utf-32be")) end + + def test_utf32be_valid_encoding + all_assertions do |a| + [ + "\x00\x00\x00\x00", + "\x00\x00\x00a", + "\x00\x00\x30\x40", + "\x00\x00\xd7\xff", + "\x00\x00\xe0\x00", + "\x00\x00\xff\xff", + "\x00\x10\xff\xff", + ].each {|s| + s.force_encoding("utf-32be") + a.for(s) { + assert_predicate(s, :valid_encoding?, "#{encdump s}.valid_encoding?") + } + } + [ + "a", + "\x00a", + "\x00\x00a", + "\x00\x00\xd8\x00", + "\x00\x00\xdb\xff", + "\x00\x00\xdc\x00", + "\x00\x00\xdf\xff", + "\x00\x11\x00\x00", + ].each {|s| + s.force_encoding("utf-32be") + a.for(s) { + assert_not_predicate(s, :valid_encoding?, "#{encdump s}.valid_encoding?") + } + } + end + end + + def test_utf32le_valid_encoding + all_assertions do |a| + [ + "\x00\x00\x00\x00", + "a\x00\x00\x00", + "\x40\x30\x00\x00", + "\xff\xd7\x00\x00", + "\x00\xe0\x00\x00", + "\xff\xff\x00\x00", + "\xff\xff\x10\x00", + ].each {|s| + s.force_encoding("utf-32le") + a.for(s) { + assert_predicate(s, :valid_encoding?, "#{encdump s}.valid_encoding?") + } + } + [ + "a", + "a\x00", + "a\x00\x00", + "\x00\xd8\x00\x00", + "\xff\xdb\x00\x00", + "\x00\xdc\x00\x00", + "\xff\xdf\x00\x00", + "\x00\x00\x11\x00", + ].each {|s| + s.force_encoding("utf-32le") + a.for(s) { + assert_not_predicate(s, :valid_encoding?, "#{encdump s}.valid_encoding?") + } + } + end + end end Index: enc/utf_32be.c =================================================================== --- enc/utf_32be.c (revision 57815) +++ enc/utf_32be.c (revision 57816) @@ -30,11 +30,23 @@ https://github.com/ruby/ruby/blob/trunk/enc/utf_32be.c#L30 #include "regenc.h" #include "iso_8859.h" +static OnigCodePoint utf32be_mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc); static int -utf32be_mbc_enc_len(const UChar* p ARG_UNUSED, const OnigUChar* e ARG_UNUSED, - OnigEncoding enc ARG_UNUSED) +utf32be_mbc_enc_len(const UChar* p ARG_UNUSED, const OnigUChar* e, + OnigEncoding enc) { - return 4; + if (e < p) { + return ONIGENC_CONSTRUCT_MBCLEN_INVALID(); + } + else if (e-p < 4) { + return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-(int)(e-p)); + } + else { + OnigCodePoint c = utf32be_mbc_to_code(p, e, enc); + if (!UNICODE_VALID_CODEPOINT_P(c)) + return ONIGENC_CONSTRUCT_MBCLEN_INVALID(); + return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4); + } } static int Index: enc/utf_32le.c =================================================================== --- enc/utf_32le.c (revision 57815) +++ enc/utf_32le.c (revision 57816) @@ -30,11 +30,23 @@ https://github.com/ruby/ruby/blob/trunk/enc/utf_32le.c#L30 #include "regenc.h" #include "iso_8859.h" +static OnigCodePoint utf32le_mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc); static int -utf32le_mbc_enc_len(const UChar* p ARG_UNUSED, const OnigUChar* e ARG_UNUSED, - OnigEncoding enc ARG_UNUSED) +utf32le_mbc_enc_len(const UChar* p ARG_UNUSED, const OnigUChar* e, + OnigEncoding enc) { - return 4; + if (e < p) { + return ONIGENC_CONSTRUCT_MBCLEN_INVALID(); + } + else if (e-p < 4) { + return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-(int)(e-p)); + } + else { + OnigCodePoint c = utf32le_mbc_to_code(p, e, enc); + if (!UNICODE_VALID_CODEPOINT_P(c)) + return ONIGENC_CONSTRUCT_MBCLEN_INVALID(); + return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4); + } } static int Index: regenc.h =================================================================== --- regenc.h (revision 57815) +++ regenc.h (revision 57816) @@ -186,6 +186,9 @@ ONIG_EXTERN int onigenc_unicode_apply_al https://github.com/ruby/ruby/blob/trunk/regenc.h#L186 #define UTF16_IS_SURROGATE_FIRST(c) (((c) & 0xfc) == 0xd8) #define UTF16_IS_SURROGATE_SECOND(c) (((c) & 0xfc) == 0xdc) #define UTF16_IS_SURROGATE(c) (((c) & 0xf8) == 0xd8) +#define UNICODE_VALID_CODEPOINT_P(c) ( \ + ((c) <= 0x10ffff) && \ + !((c) < 0x10000 && UTF16_IS_SURROGATE((c) >> 8))) #define ONIGENC_ISO_8859_1_TO_LOWER_CASE(c) \ OnigEncISO_8859_1_ToLowerCaseTable[c] -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml/