[前][次][番号順一覧][スレッド一覧]

ruby-changes:28410

From: nobu <ko1@a...>
Date: Thu, 25 Apr 2013 16:11:33 +0900 (JST)
Subject: [ruby-changes:28410] nobu:r40462 (trunk): io.c: conversion from bom encoding

nobu	2013-04-25 16:11:20 +0900 (Thu, 25 Apr 2013)

  New Revision: 40462

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=rev&revision=40462

  Log:
    io.c: conversion from bom encoding
    
    * io.c (rb_io_ext_int_to_encs, parse_mode_enc): bom-prefixed name is
      not a real encoding name, just a fallback.  so the proper conversion
      should take place even if the internal encoding is equal to the
      bom-prefixed name, unless actual encoding is equal to the internal
      encoding.  [ruby-core:54563] [Bug #8323]
    * io.c (io_set_encoding_by_bom): reset external encoding if no BOM
      found.  [ruby-core:54569]

  Modified files:
    trunk/ChangeLog
    trunk/io.c
    trunk/test/ruby/test_io_m17n.rb

Index: ChangeLog
===================================================================
--- ChangeLog	(revision 40461)
+++ ChangeLog	(revision 40462)
@@ -1,3 +1,14 @@ https://github.com/ruby/ruby/blob/trunk/ChangeLog#L1
+Thu Apr 25 16:11:06 2013  Nobuyoshi Nakada  <nobu@r...>
+
+	* io.c (rb_io_ext_int_to_encs, parse_mode_enc): bom-prefixed name is
+	  not a real encoding name, just a fallback.  so the proper conversion
+	  should take place even if the internal encoding is equal to the
+	  bom-prefixed name, unless actual encoding is equal to the internal
+	  encoding.  [ruby-core:54563] [Bug #8323]
+
+	* io.c (io_set_encoding_by_bom): reset external encoding if no BOM
+	  found.  [ruby-core:54569]
+
 Thu Apr 25 14:35:01 2013  NARUSE, Yui  <naruse@r...>
 
 	* ext/openssl/ossl_bn.c (ossl_bn_initialize): allow Fixnum and Bignum.
Index: io.c
===================================================================
--- io.c	(revision 40461)
+++ io.c	(revision 40462)
@@ -4860,7 +4860,7 @@ rb_io_oflags_modestr(int oflags) https://github.com/ruby/ruby/blob/trunk/io.c#L4860
  * Qnil => no encoding specified (internal only)
  */
 static void
-rb_io_ext_int_to_encs(rb_encoding *ext, rb_encoding *intern, rb_encoding **enc, rb_encoding **enc2)
+rb_io_ext_int_to_encs(rb_encoding *ext, rb_encoding *intern, rb_encoding **enc, rb_encoding **enc2, int fmode)
 {
     int default_ext = 0;
 
@@ -4871,7 +4871,8 @@ rb_io_ext_int_to_encs(rb_encoding *ext, https://github.com/ruby/ruby/blob/trunk/io.c#L4871
     if (intern == NULL && ext != rb_ascii8bit_encoding())
 	/* If external is ASCII-8BIT, no default transcoding */
 	intern = rb_default_internal_encoding();
-    if (intern == NULL || intern == (rb_encoding *)Qnil || intern == ext) {
+    if (intern == NULL || intern == (rb_encoding *)Qnil ||
+	(!(fmode & FMODE_SETENC_BY_BOM) && (intern == ext))) {
 	/* No internal encoding => use external + no transcoding */
 	*enc = (default_ext && intern != ext) ? NULL : ext;
 	*enc2 = NULL;
@@ -4894,6 +4895,7 @@ parse_mode_enc(const char *estr, rb_enco https://github.com/ruby/ruby/blob/trunk/io.c#L4895
     const char *p;
     char encname[ENCODING_MAXNAMELEN+1];
     int idx, idx2;
+    int fmode = fmode_p ? *fmode_p : 0;
     rb_encoding *ext_enc, *int_enc;
 
     /* parse estr as "enc" or "enc2:enc" or "enc:-" */
@@ -4905,7 +4907,7 @@ parse_mode_enc(const char *estr, rb_enco https://github.com/ruby/ruby/blob/trunk/io.c#L4907
 	    idx = -1;
 	else {
 	    if (io_encname_bom_p(estr, len)) {
-		if (fmode_p) *fmode_p |= FMODE_SETENC_BY_BOM;
+		fmode |= FMODE_SETENC_BY_BOM;
 		estr += 4;
                 len -= 4;
             }
@@ -4918,7 +4920,7 @@ parse_mode_enc(const char *estr, rb_enco https://github.com/ruby/ruby/blob/trunk/io.c#L4920
     else {
 	long len = strlen(estr);
 	if (io_encname_bom_p(estr, len)) {
-	    if (fmode_p) *fmode_p |= FMODE_SETENC_BY_BOM;
+	    fmode |= FMODE_SETENC_BY_BOM;
 	    estr += 4;
             len -= 4;
 	    memcpy(encname, estr, len);
@@ -4927,6 +4929,7 @@ parse_mode_enc(const char *estr, rb_enco https://github.com/ruby/ruby/blob/trunk/io.c#L4929
 	}
 	idx = rb_enc_find_index(estr);
     }
+    if (fmode_p) *fmode_p = fmode;
 
     if (idx >= 0)
 	ext_enc = rb_enc_from_index(idx);
@@ -4946,7 +4949,7 @@ parse_mode_enc(const char *estr, rb_enco https://github.com/ruby/ruby/blob/trunk/io.c#L4949
 	    idx2 = rb_enc_find_index(p);
 	    if (idx2 < 0)
 		unsupported_encoding(p);
-	    else if (idx2 == idx) {
+	    else if (!(fmode & FMODE_SETENC_BY_BOM) && (idx2 == idx)) {
 		int_enc = (rb_encoding *)Qnil;
 	    }
 	    else
@@ -4954,7 +4957,7 @@ parse_mode_enc(const char *estr, rb_enco https://github.com/ruby/ruby/blob/trunk/io.c#L4957
 	}
     }
 
-    rb_io_ext_int_to_encs(ext_enc, int_enc, enc_p, enc2_p);
+    rb_io_ext_int_to_encs(ext_enc, int_enc, enc_p, enc2_p, fmode);
 }
 
 int
@@ -5015,12 +5018,12 @@ rb_io_extract_encoding_option(VALUE opt, https://github.com/ruby/ruby/blob/trunk/io.c#L5018
 	    parse_mode_enc(StringValueCStr(tmp), enc_p, enc2_p, fmode_p);
 	}
 	else {
-	    rb_io_ext_int_to_encs(rb_to_encoding(encoding), NULL, enc_p, enc2_p);
+	    rb_io_ext_int_to_encs(rb_to_encoding(encoding), NULL, enc_p, enc2_p, 0);
 	}
     }
     else if (extenc != Qundef || intenc != Qundef) {
         extracted = 1;
-	rb_io_ext_int_to_encs(extencoding, intencoding, enc_p, enc2_p);
+	rb_io_ext_int_to_encs(extencoding, intencoding, enc_p, enc2_p, 0);
     }
     return extracted;
 }
@@ -5095,7 +5098,7 @@ rb_io_extract_modeenc(VALUE *vmode_p, VA https://github.com/ruby/ruby/blob/trunk/io.c#L5098
     vmode = *vmode_p;
 
     /* Set to defaults */
-    rb_io_ext_int_to_encs(NULL, NULL, &enc, &enc2);
+    rb_io_ext_int_to_encs(NULL, NULL, &enc, &enc2, 0);
 
   vmode_handle:
     if (NIL_P(vmode)) {
@@ -5123,7 +5126,7 @@ rb_io_extract_modeenc(VALUE *vmode_p, VA https://github.com/ruby/ruby/blob/trunk/io.c#L5126
 	    rb_encoding *e;
 
 	    e = (fmode & FMODE_BINMODE) ? rb_ascii8bit_encoding() : NULL;
-	    rb_io_ext_int_to_encs(e, NULL, &enc, &enc2);
+	    rb_io_ext_int_to_encs(e, NULL, &enc, &enc2, fmode);
 	}
     }
 
@@ -5147,7 +5150,7 @@ rb_io_extract_modeenc(VALUE *vmode_p, VA https://github.com/ruby/ruby/blob/trunk/io.c#L5150
             oflags |= O_BINARY;
 #endif
 	    if (!has_enc)
-		rb_io_ext_int_to_encs(rb_ascii8bit_encoding(), NULL, &enc, &enc2);
+		rb_io_ext_int_to_encs(rb_ascii8bit_encoding(), NULL, &enc, &enc2, fmode);
 	}
 #if DEFAULT_TEXTMODE
 	else if (NIL_P(vmode)) {
@@ -5370,13 +5373,16 @@ static void https://github.com/ruby/ruby/blob/trunk/io.c#L5373
 io_set_encoding_by_bom(VALUE io)
 {
     int idx = io_strip_bom(io);
+    rb_io_t *fptr;
 
+    GetOpenFile(io, fptr);
     if (idx) {
-	rb_io_t *fptr;
-	GetOpenFile(io, fptr);
 	io_encoding_set(fptr, rb_enc_from_encoding(rb_enc_from_index(idx)),
 		rb_io_internal_encoding(io), Qnil);
     }
+    else {
+	fptr->encs.enc2 = NULL;
+    }
 }
 
 static VALUE
@@ -5386,7 +5392,7 @@ rb_file_open_generic(VALUE io, VALUE fil https://github.com/ruby/ruby/blob/trunk/io.c#L5392
     convconfig_t cc;
     if (!convconfig) {
 	/* Set to default encodings */
-	rb_io_ext_int_to_encs(NULL, NULL, &cc.enc, &cc.enc2);
+	rb_io_ext_int_to_encs(NULL, NULL, &cc.enc, &cc.enc2, fmode);
         cc.ecflags = 0;
         cc.ecopts = Qnil;
         convconfig = &cc;
@@ -5420,7 +5426,7 @@ rb_file_open_internal(VALUE io, VALUE fi https://github.com/ruby/ruby/blob/trunk/io.c#L5426
 	/* Set to default encodings */
 
 	e = (fmode & FMODE_BINMODE) ? rb_ascii8bit_encoding() : NULL;
-	rb_io_ext_int_to_encs(e, NULL, &convconfig.enc, &convconfig.enc2);
+	rb_io_ext_int_to_encs(e, NULL, &convconfig.enc, &convconfig.enc2, fmode);
         convconfig.ecflags = 0;
         convconfig.ecopts = Qnil;
     }
@@ -9078,7 +9084,7 @@ io_encoding_set(rb_io_t *fptr, VALUE v1, https://github.com/ruby/ruby/blob/trunk/io.c#L9084
     else {
 	if (NIL_P(v1)) {
 	    /* Set to default encodings */
-	    rb_io_ext_int_to_encs(NULL, NULL, &enc, &enc2);
+	    rb_io_ext_int_to_encs(NULL, NULL, &enc, &enc2, 0);
 	    SET_UNIVERSAL_NEWLINE_DECORATOR_IF_ENC2(enc2, ecflags);
             ecopts = Qnil;
 	}
@@ -9090,7 +9096,7 @@ io_encoding_set(rb_io_t *fptr, VALUE v1, https://github.com/ruby/ruby/blob/trunk/io.c#L9096
                 ecflags = rb_econv_prepare_options(opt, &ecopts, ecflags);
 	    }
 	    else {
-		rb_io_ext_int_to_encs(find_encoding(v1), NULL, &enc, &enc2);
+		rb_io_ext_int_to_encs(find_encoding(v1), NULL, &enc, &enc2, 0);
 		SET_UNIVERSAL_NEWLINE_DECORATOR_IF_ENC2(enc2, ecflags);
                 ecopts = Qnil;
 	    }
Index: test/ruby/test_io_m17n.rb
===================================================================
--- test/ruby/test_io_m17n.rb	(revision 40461)
+++ test/ruby/test_io_m17n.rb	(revision 40462)
@@ -1997,6 +1997,7 @@ EOT https://github.com/ruby/ruby/blob/trunk/test/ruby/test_io_m17n.rb#L1997
   def test_strip_bom
     with_tmpdir {
       text = "\uFEFFa"
+      stripped = "a"
       %w/UTF-8 UTF-16BE UTF-16LE UTF-32BE UTF-32LE/.each do |name|
         path = '%s-bom.txt' % name
         content = text.encode(name)
@@ -2004,11 +2005,32 @@ EOT https://github.com/ruby/ruby/blob/trunk/test/ruby/test_io_m17n.rb#L2005
         result = File.read(path, mode: 'rb:BOM|UTF-8')
         assert_equal(content[1].force_encoding("ascii-8bit"),
                      result.force_encoding("ascii-8bit"))
+        result = File.read(path, mode: 'rb:BOM|UTF-8:UTF-8')
+        assert_equal(Encoding::UTF_8, result.encoding)
+        assert_equal(stripped, result)
       end
 
       bug3407 = '[ruby-core:30641]'
-      result = File.read('UTF-8-bom.txt', encoding: 'BOM|UTF-8')
+      path = 'UTF-8-bom.txt'
+      result = File.read(path, encoding: 'BOM|UTF-8')
       assert_equal("a", result.force_encoding("ascii-8bit"), bug3407)
+
+      bug8323 = '[ruby-core:54563] [Bug #8323]'
+      expected = "a\xff".force_encoding("utf-8")
+      open(path, 'ab') {|f| f.write("\xff")}
+      result = File.read(path, encoding: 'BOM|UTF-8')
+      assert_not_predicate(result, :valid_encoding?, bug8323)
+      assert_equal(expected, result, bug8323)
+      result = File.read(path, encoding: 'BOM|UTF-8:UTF-8')
+      assert_not_predicate(result, :valid_encoding?, bug8323)
+      assert_equal(expected, result, bug8323)
+
+      path = 'ascii.txt'
+      generate_file(path, stripped)
+      result = File.read(path, encoding: 'BOM|UTF-8')
+      assert_equal(stripped, result, bug8323)
+      result = File.read(path, encoding: 'BOM|UTF-8:UTF-8')
+      assert_equal(stripped, result, bug8323)
     }
   end
 

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/

[前][次][番号順一覧][スレッド一覧]