ruby-changes:41005

nobu	2015-12-13 18:48:27 +0900 (Sun, 13 Dec 2015)

  New Revision: 53084

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=revision&revision=53084

  Log:
    io.c: BOM with non-UTF
    
    * io.c (io_encname_bom_p): check only for the "bom|" prefix; the
      "utf-" prefix that follows is no longer part of this check.
    * io.c (parse_mode_enc): warn when BOM is requested with a non-UTF
      encoding.

  Modified files:
    trunk/ChangeLog
    trunk/io.c
    trunk/test/ruby/test_io_m17n.rb
Index: ChangeLog
===================================================================
--- ChangeLog	(revision 53083)
+++ ChangeLog	(revision 53084)
@@ -1,4 +1,9 @@ https://github.com/ruby/ruby/blob/trunk/ChangeLog#L1
-Sun Dec 13 18:45:12 2015  Nobuyoshi Nakada  <nobu@r...>
+Sun Dec 13 18:46:31 2015  Nobuyoshi Nakada  <nobu@r...>
+
+	* io.c (io_encname_bom_p): check BOM prefix only, not including
+	  UTF prefix.
+
+	* io.c (parse_mode_enc): warn BOM with non-UTF encoding.
 
 	* io.c (parse_mode_enc): fix buffer overflow.
 
Index: io.c
===================================================================
--- io.c	(revision 53083)
+++ io.c	(revision 53084)
@@ -4852,11 +4852,14 @@ rb_io_fmode_modestr(int fmode) https://github.com/ruby/ruby/blob/trunk/io.c#L4852
     }
 }
 
+static const char bom_prefix[] = "bom|";
+static const char utf_prefix[] = "utf-";
+enum {bom_prefix_len = (int)sizeof(bom_prefix) - 1};
+enum {utf_prefix_len = (int)sizeof(utf_prefix) - 1};
+
 static int
 io_encname_bom_p(const char *name, long len)
 {
-    static const char bom_prefix[] = "bom|utf-";
-    enum {bom_prefix_len = (int)sizeof(bom_prefix) - 1};
     return len > bom_prefix_len && STRNCASECMP(name, bom_prefix, bom_prefix_len) == 0;
 }
 
@@ -5064,37 +5067,31 @@ parse_mode_enc(const char *estr, rb_enco https://github.com/ruby/ruby/blob/trunk/io.c#L5067
     int idx, idx2;
     int fmode = fmode_p ? *fmode_p : 0;
     rb_encoding *ext_enc, *int_enc;
+    long len;
 
     /* parse estr as "enc" or "enc2:enc" or "enc:-" */
 
     p = strrchr(estr, ':');
-    if (p) {
-	long len = (p++) - estr;
-	if (len == 0 || len > ENCODING_MAXNAMELEN)
-	    idx = -1;
+    len = p ? (p++ - estr) : (long)strlen(estr);
+    if ((fmode & FMODE_SETENC_BY_BOM) || io_encname_bom_p(estr, len)) {
+	estr += bom_prefix_len;
+	len -= bom_prefix_len;
+	if (!STRNCASECMP(estr, utf_prefix, utf_prefix_len)) {
+	    fmode |= FMODE_SETENC_BY_BOM;
+	}
 	else {
-	    if (io_encname_bom_p(estr, len)) {
-		fmode |= FMODE_SETENC_BY_BOM;
-		estr += 4;
-                len -= 4;
-            }
-	    memcpy(encname, estr, len);
-	    encname[len] = '\0';
-	    estr = encname;
-	    idx = rb_enc_find_index(encname);
+	    rb_warn("BOM with non-UTF encoding %s is nonsense", estr);
+	    fmode &= ~FMODE_SETENC_BY_BOM;
 	}
     }
+    if (len == 0 || len > ENCODING_MAXNAMELEN) {
+	idx = -1;
+    }
     else {
-	long len = strlen(estr);
-	if (io_encname_bom_p(estr, len)) {
-	    fmode |= FMODE_SETENC_BY_BOM;
-	    estr += 4;
-            len -= 4;
-	    if (len > 0 && len <= ENCODING_MAXNAMELEN) {
-		memcpy(encname, estr, len);
-		encname[len] = '\0';
-		estr = encname;
-	    }
+	if (p) {
+	    memcpy(encname, estr, len);
+	    encname[len] = '\0';
+	    estr = encname;
 	}
 	idx = rb_enc_find_index(estr);
     }
Index: test/ruby/test_io_m17n.rb
===================================================================
--- test/ruby/test_io_m17n.rb	(revision 53083)
+++ test/ruby/test_io_m17n.rb	(revision 53084)
@@ -2095,6 +2095,20 @@ EOT https://github.com/ruby/ruby/blob/trunk/test/ruby/test_io_m17n.rb#L2095
     end;
   end
 
+  def test_bom_non_utf
+    enc = nil
+
+    assert_warn(/BOM/) {
+      open(__FILE__, "r:bom|us-ascii") {|f| enc = f.external_encoding}
+    }
+    assert_equal(Encoding::US_ASCII, enc)
+
+    assert_warn(/BOM/) {
+      open(IO::NULL, "w:bom|us-ascii") {|f| enc = f.external_encoding}
+    }
+    assert_equal(Encoding::US_ASCII, enc)
+  end
+
   def test_cbuf
     with_tmpdir {
       fn = "tst"

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/