ruby-changes:57024
From: Nobuyoshi <ko1@a...>
Date: Wed, 14 Aug 2019 11:37:02 +0900 (JST)
Subject: [ruby-changes:57024] Nobuyoshi Nakada: cad41bb6d3 (master): [ruby/stringio] Supported BOM
https://git.ruby-lang.org/ruby.git/commit/?id=cad41bb6d3

From cad41bb6d35c09b0e5d9e50280e3bfcd1bd9bcc0 Mon Sep 17 00:00:00 2001
From: Nobuyoshi Nakada <nobu@r...>
Date: Wed, 14 Aug 2019 10:43:46 +0900
Subject: [ruby/stringio] Supported BOM

https://github.com/ruby/stringio/commit/b249631c43

diff --git a/ext/stringio/stringio.c b/ext/stringio/stringio.c
index 6ca5db3..f7b520a 100644
--- a/ext/stringio/stringio.c
+++ b/ext/stringio/stringio.c
@@ -262,6 +262,73 @@ strio_initialize(int argc, VALUE *argv, VALUE self) https://github.com/ruby/ruby/blob/trunk/ext/stringio/stringio.c#L262
     return strio_init(argc, argv, ptr, self);
 }
 
+static int
+detect_bom(VALUE str, int *bomlen)
+{
+    const char *p;
+    long len;
+    /* Match the head of str against known BOMs; on a hit set *bomlen and return the encoding index, else return 0. */
+    RSTRING_GETMEM(str, p, len);
+    if (len < 1) return 0;
+    switch ((unsigned char)p[0]) {
+      case 0xEF: /* UTF-8: EF BB BF */
+        if (len < 2) break;
+        if ((unsigned char)p[1] == 0xBB && len > 2) {
+            if ((unsigned char)p[2] == 0xBF) {
+                *bomlen = 3;
+                return rb_utf8_encindex();
+            }
+        }
+        break;
+
+      case 0xFE: /* UTF-16BE: FE FF */
+        if (len < 2) break;
+        if ((unsigned char)p[1] == 0xFF) {
+            *bomlen = 2;
+            return rb_enc_find_index("UTF-16BE");
+        }
+        break;
+
+      case 0xFF: /* UTF-32LE: FF FE 00 00, or UTF-16LE: FF FE */
+        if (len < 2) break;
+        if ((unsigned char)p[1] == 0xFE) {
+            if (len >= 4 && (unsigned char)p[2] == 0 && (unsigned char)p[3] == 0) { /* longer UTF-32LE mark is tried first */
+                *bomlen = 4;
+                return rb_enc_find_index("UTF-32LE");
+            }
+            *bomlen = 2;
+            return rb_enc_find_index("UTF-16LE");
+        }
+        break;
+
+      case 0: /* UTF-32BE: 00 00 FE FF */
+        if (len < 4) break;
+        if ((unsigned char)p[1] == 0 && (unsigned char)p[2] == 0xFE && (unsigned char)p[3] == 0xFF) {
+            *bomlen = 4;
+            return rb_enc_find_index("UTF-32BE");
+        }
+        break;
+    }
+    return 0; /* no BOM recognized; *bomlen is left untouched */
+}
+
+static rb_encoding *
+set_encoding_by_bom(struct StringIO *ptr)
+{
+    int bomlen, idx = detect_bom(ptr->string, &bomlen);
+    rb_encoding *extenc = NULL;
+    /* On a BOM hit: move pos past it and, only if FMODE_WRITABLE is set, retag the string; ptr->enc becomes NULL on a miss. */
+    if (idx) {
+        extenc = rb_enc_from_index(idx);
+        ptr->pos = bomlen;
+        if (ptr->flags & FMODE_WRITABLE) {
+            rb_enc_associate_index(ptr->string, idx);
+        }
+    }
+    ptr->enc = extenc;
+    return extenc;
+}
+
 static VALUE
 strio_init(int argc, VALUE *argv, struct StringIO *ptr, VALUE self)
 {
@@ -294,6 +361,7 @@ strio_init(int argc, VALUE *argv, struct StringIO *ptr, VALUE self) https://github.com/ruby/ruby/blob/trunk/ext/stringio/stringio.c#L361
     ptr->enc = convconfig.enc;
     ptr->pos = 0;
     ptr->lineno = 0;
+    if (ptr->flags & FMODE_SETENC_BY_BOM) set_encoding_by_bom(ptr);
     RBASIC(self)->flags |= (ptr->flags & FMODE_READWRITE) * (STRIO_READABLE / FMODE_READABLE);
     return self;
 }
@@ -1677,6 +1745,18 @@ strio_set_encoding(int argc, VALUE *argv, VALUE self) https://github.com/ruby/ruby/blob/trunk/ext/stringio/stringio.c#L1745
     return self;
 }
 
+static VALUE
+strio_set_encoding_by_bom(VALUE self)
+{
+    struct StringIO *ptr = StringIO(self);
+    /* Raise ArgumentError if ptr->enc is already set; return nil when no BOM is found, else rb_enc_from_encoding(ptr->enc). */
+    if (ptr->enc) {
+        rb_raise(rb_eArgError, "encoding conversion is set");
+    }
+    if (!set_encoding_by_bom(ptr)) return Qnil;
+    return rb_enc_from_encoding(ptr->enc);
+}
+
 /*
  * Pseudo I/O on String object.
  *
@@ -1778,6 +1858,7 @@ Init_stringio(void) https://github.com/ruby/ruby/blob/trunk/ext/stringio/stringio.c#L1858
     rb_define_method(StringIO, "external_encoding", strio_external_encoding, 0);
     rb_define_method(StringIO, "internal_encoding", strio_internal_encoding, 0);
     rb_define_method(StringIO, "set_encoding", strio_set_encoding, -1);
+    rb_define_method(StringIO, "set_encoding_by_bom", strio_set_encoding_by_bom, 0);
 
     {
         VALUE mReadable = rb_define_module_under(rb_cIO, "generic_readable");
diff --git a/test/stringio/test_stringio.rb b/test/stringio/test_stringio.rb
index 1e8e548..b4369ed 100644
--- a/test/stringio/test_stringio.rb
+++ b/test/stringio/test_stringio.rb
@@ -795,6 +795,20 @@ class TestStringIO < Test::Unit::TestCase https://github.com/ruby/ruby/blob/trunk/test/stringio/test_stringio.rb#L795
     assert_equal("\0\0\0a\0\0\0b\0\0\0c", s.read)
   end
 
+  %w/UTF-8 UTF-16BE UTF-16LE UTF-32BE UTF-32LE/.each do |name|
+    define_method("test_strip_bom:#{name}") do
+      text = "\uFEFF\u0100a"
+      content = text.encode(name)
+      result = StringIO.new(content, mode: 'rb:BOM|UTF-8').read
+      assert_equal(Encoding.find(name), result.encoding, name)
+      assert_equal(content[1..-1].b, result.b, name)
+
+      StringIO.open(content) {|f|
+        assert_equal(Encoding.find(name), f.set_encoding_by_bom)
+      }
+    end
+  end
+
   def assert_string(content, encoding, str, mesg = nil)
     assert_equal([content, encoding], [str, str.encoding], mesg)
   end
--
cgit v0.10.2

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/