[前][次][番号順一覧][スレッド一覧]

ruby-changes:57024

From: Nobuyoshi <ko1@a...>
Date: Wed, 14 Aug 2019 11:37:02 +0900 (JST)
Subject: [ruby-changes:57024] Nobuyoshi Nakada: cad41bb6d3 (master): [ruby/stringio] Supported BOM

https://git.ruby-lang.org/ruby.git/commit/?id=cad41bb6d3

From cad41bb6d35c09b0e5d9e50280e3bfcd1bd9bcc0 Mon Sep 17 00:00:00 2001
From: Nobuyoshi Nakada <nobu@r...>
Date: Wed, 14 Aug 2019 10:43:46 +0900
Subject: [ruby/stringio] Supported BOM

https://github.com/ruby/stringio/commit/b249631c43

diff --git a/ext/stringio/stringio.c b/ext/stringio/stringio.c
index 6ca5db3..f7b520a 100644
--- a/ext/stringio/stringio.c
+++ b/ext/stringio/stringio.c
@@ -262,6 +262,73 @@ strio_initialize(int argc, VALUE *argv, VALUE self) https://github.com/ruby/ruby/blob/trunk/ext/stringio/stringio.c#L262
     return strio_init(argc, argv, ptr, self);
 }
 
+/*
+ * Detects a BOM at the head of +str+.  On a match, stores its byte length
+ * in *bomlen and returns the encoding index; otherwise returns 0.
+ */
+static int
+detect_bom(VALUE str, int *bomlen)
+{
+    const char *p;
+    long len;
+
+    RSTRING_GETMEM(str, p, len);
+    if (len < 2) return 0; /* every BOM checked below is at least two bytes */
+    switch ((unsigned char)p[0]) {
+      case 0xEF: /* UTF-8: EF BB BF */
+	if (len >= 3 && (unsigned char)p[1] == 0xBB && (unsigned char)p[2] == 0xBF) {
+	    *bomlen = 3;
+	    return rb_utf8_encindex();
+	}
+	break;
+
+      case 0xFE: /* UTF-16BE: FE FF */
+	if ((unsigned char)p[1] == 0xFF) {
+	    *bomlen = 2;
+	    return rb_enc_find_index("UTF-16BE");
+	}
+	break;
+
+      case 0xFF: /* UTF-32LE: FF FE 00 00, or UTF-16LE: FF FE */
+	if ((unsigned char)p[1] == 0xFE) {
+	    /* test the 4-byte mark first; the 2-byte one is its prefix */
+	    if (len >= 4 && (unsigned char)p[2] == 0 && (unsigned char)p[3] == 0) {
+		*bomlen = 4;
+		return rb_enc_find_index("UTF-32LE");
+	    }
+	    *bomlen = 2;
+	    return rb_enc_find_index("UTF-16LE");
+	}
+	break;
+
+      case 0: /* UTF-32BE: 00 00 FE FF */
+	if (len < 4) break;
+	if ((unsigned char)p[1] == 0 && (unsigned char)p[2] == 0xFE && (unsigned char)p[3] == 0xFF) {
+	    *bomlen = 4;
+	    return rb_enc_find_index("UTF-32BE");
+	}
+	break;
+    }
+    return 0;
+}
+
+/* Looks for a BOM at the head of ptr->string.  When found, moves ptr->pos past
+ * it and, if FMODE_WRITABLE is set, retags the string.  Always sets ptr->enc. */
+static rb_encoding *
+set_encoding_by_bom(struct StringIO *ptr)
+{
+    rb_encoding *found = NULL;
+    int skip;
+    const int encidx = detect_bom(ptr->string, &skip);
+
+    if (encidx != 0) {
+	found = rb_enc_from_index(encidx);
+	ptr->pos = skip;
+	if (ptr->flags & FMODE_WRITABLE) rb_enc_associate_index(ptr->string, encidx);
+    }
+    return ptr->enc = found;
+}
+
 static VALUE
 strio_init(int argc, VALUE *argv, struct StringIO *ptr, VALUE self)
 {
@@ -294,6 +361,7 @@ strio_init(int argc, VALUE *argv, struct StringIO *ptr, VALUE self) https://github.com/ruby/ruby/blob/trunk/ext/stringio/stringio.c#L361
     ptr->enc = convconfig.enc;
     ptr->pos = 0;
     ptr->lineno = 0;
+    if (ptr->flags & FMODE_SETENC_BY_BOM) set_encoding_by_bom(ptr);
     RBASIC(self)->flags |= (ptr->flags & FMODE_READWRITE) * (STRIO_READABLE / FMODE_READABLE);
     return self;
 }
@@ -1677,6 +1745,18 @@ strio_set_encoding(int argc, VALUE *argv, VALUE self) https://github.com/ruby/ruby/blob/trunk/ext/stringio/stringio.c#L1745
     return self;
 }
 
+/* StringIO#set_encoding_by_bom: raises ArgumentError if an encoding is already
+ * set; otherwise returns the encoding detected from the BOM, or nil. */
+static VALUE
+strio_set_encoding_by_bom(VALUE self)
+{
+    struct StringIO *const sio = StringIO(self);
+
+    if (sio->enc != NULL)
+	rb_raise(rb_eArgError, "encoding conversion is set");
+    return set_encoding_by_bom(sio) ? rb_enc_from_encoding(sio->enc) : Qnil;
+}
+
 /*
  * Pseudo I/O on String object.
  *
@@ -1778,6 +1858,7 @@ Init_stringio(void) https://github.com/ruby/ruby/blob/trunk/ext/stringio/stringio.c#L1858
     rb_define_method(StringIO, "external_encoding", strio_external_encoding, 0);
     rb_define_method(StringIO, "internal_encoding", strio_internal_encoding, 0);
     rb_define_method(StringIO, "set_encoding", strio_set_encoding, -1);
+    rb_define_method(StringIO, "set_encoding_by_bom", strio_set_encoding_by_bom, 0);
 
     {
 	VALUE mReadable = rb_define_module_under(rb_cIO, "generic_readable");
diff --git a/test/stringio/test_stringio.rb b/test/stringio/test_stringio.rb
index 1e8e548..b4369ed 100644
--- a/test/stringio/test_stringio.rb
+++ b/test/stringio/test_stringio.rb
@@ -795,6 +795,20 @@ class TestStringIO < Test::Unit::TestCase https://github.com/ruby/ruby/blob/trunk/test/stringio/test_stringio.rb#L795
     assert_equal("\0\0\0a\0\0\0b\0\0\0c", s.read)
   end
 
+  %w/UTF-8 UTF-16BE UTF-16LE UTF-32BE UTF-32LE/.each do |name|
+    define_method("test_strip_bom:#{name}") do
+      text = "\uFEFF\u0100a"
+      content = text.encode(name)
+      result = StringIO.new(content, mode: 'rb:BOM|UTF-8').read
+      assert_equal(Encoding.find(name), result.encoding, name)
+      assert_equal(content[1..-1].b, result.b, name)
+
+      StringIO.open(content) {|f|
+        assert_equal(Encoding.find(name), f.set_encoding_by_bom)
+      }
+    end
+  end
+
   def assert_string(content, encoding, str, mesg = nil)
     assert_equal([content, encoding], [str, str.encoding], mesg)
   end
-- 
cgit v0.10.2


--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/

[前][次][番号順一覧][スレッド一覧]