[前][次][番号順一覧][スレッド一覧]

ruby-changes:12306

From: naruse <ko1@a...>
Date: Thu, 9 Jul 2009 23:48:05 +0900 (JST)
Subject: [ruby-changes:12306] Ruby:r24001 (trunk): Set encoding and strip bom when modeenc string is "r:foo-bom"

naruse	2009-07-09 23:47:48 +0900 (Thu, 09 Jul 2009)

  New Revision: 24001

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=rev&revision=24001

  Log:
    Set encoding and strip bom when modeenc string is "r:foo-bom" [ruby-dev:37236]
    * include/ruby/io.h (FMODE_STRIP_BOM): new constant.
    
    * io.c (io_encname_bom_p): judge whether the encoding name
      is *-bom or not.
      (parse_mode_enc): drop "-bom".
      (rb_io_modestr_fmode): set FMODE_STRIP_BOM if needed.
      (rb_io_extract_modeenc): ditto.
      (io_strip_bom): strip the BOM if one exists.
      (io_set_encoding_by_bom): set the encoding if there is a BOM.

  Modified files:
    trunk/ChangeLog
    trunk/include/ruby/io.h
    trunk/io.c

Index: include/ruby/io.h
===================================================================
--- include/ruby/io.h	(revision 24000)
+++ include/ruby/io.h	(revision 24001)
@@ -94,6 +94,7 @@
 #define FMODE_TEXTMODE              0x00001000
 #define FMODE_EOF                   0x00002000
 /* #define FMODE_PREP               0x00010000 */
+#define FMODE_STRIP_BOM             0x00100000
 
 #define GetOpenFile(obj,fp) rb_io_check_closed((fp) = RFILE(rb_io_taint_check(obj))->fptr)
 
Index: ChangeLog
===================================================================
--- ChangeLog	(revision 24000)
+++ ChangeLog	(revision 24001)
@@ -1,3 +1,17 @@
+Thu Jul  9 23:28:48 2009  NARUSE, Yui  <naruse@r...>
+
+	* include/ruby/io.h (FMODE_STRIP_BOM): new constant.
+
+	* io.c (io_encname_bom_p): judge whether the encoding name
+	  is *-bom or not.
+	  (parse_mode_enc): drop "-bom".
+	  (rb_io_modestr_fmode): set FMODE_STRIP_BOM if needed.
+	  (rb_io_extract_modeenc): ditto.
+	  (io_strip_bom): strip bom if exists.
+	  (io_set_encoding_by_bom): set encoding if there is bom.
+	  Set encoding and strip bom when modeenc string is "r:foo-bom"
+	  [ruby-dev:37236]
+
 Thu Jul  9 21:56:59 2009  NARUSE, Yui  <naruse@r...>
 
 	* marshal.c (r_object0): replace \u by u when the regexp is
Index: io.c
===================================================================
--- io.c	(revision 24000)
+++ io.c	(revision 24001)
@@ -3934,11 +3934,26 @@
     return NULL;		/* not reached */
 }
 
+/*
+ * Return 1 if the encoding name ends with "-bom" (case-insensitive) and
+ * has at least one character before that suffix, else 0.  When len is 0,
+ * the name is taken to end at the first ':' or, failing that, at its NUL.
+ */
+static int
+io_encname_bom_p(const char *name, long len) {
+    if (len == 0) {
+	const char *end = strchr(name, ':');
+	len = end ? end - name : (long)strlen(name);
+    }
+    if (len <= 4) return 0;
+    return strncasecmp(name + len - 4, "-bom", 4) == 0;
+}
+
 int
 rb_io_modestr_fmode(const char *modestr)
 {
     int fmode = 0;
-    const char *m = modestr;
+    const char *m = modestr, *p = NULL;
 
     switch (*m++) {
       case 'r':
@@ -3969,6 +3984,7 @@
 	  default:
             goto error;
 	  case ':':
+	    p = m;
             goto finished;
         }
     }
@@ -3976,6 +3992,8 @@
   finished:
     if ((fmode & FMODE_BINMODE) && (fmode & FMODE_TEXTMODE))
         goto error;
+    if (p && io_encname_bom_p(p, 0))
+	fmode |= FMODE_STRIP_BOM;
 
     return fmode;
 }
@@ -4126,14 +4144,24 @@
 	if (len == 0 || len > ENCODING_MAXNAMELEN)
 	    idx = -1;
 	else {
+	    if (io_encname_bom_p(estr, len))
+		len -= 4;
 	    memcpy(encname, estr, len);
 	    encname[len] = '\0';
 	    estr = encname;
 	    idx = rb_enc_find_index(encname);
 	}
     }
-    else
+    else {
+	long len = strlen(estr);
+	if (io_encname_bom_p(estr, len)) {
+	    len -= 4;
+	    memcpy(encname, estr, len);
+	    encname[len] = '\0';
+	    estr = encname;
+	}
 	idx = rb_enc_find_index(estr);
+    }
 
     if (idx >= 0)
 	ext_enc = rb_enc_from_index(idx);
@@ -4309,6 +4337,8 @@
         if (p) {
             has_enc = 1;
             parse_mode_enc(p+1, &enc, &enc2);
+	    if (io_encname_bom_p(p+1, 0))
+		fmode |= FMODE_STRIP_BOM;
         }
 	else {
 	    rb_encoding *e;
@@ -4493,6 +4523,84 @@
         fptr->mode |= FMODE_TTY|FMODE_DUPLEX;
 }
 
+static VALUE rb_io_internal_encoding(VALUE);
+static void io_encoding_set(rb_io_t *, VALUE, VALUE, VALUE);
+
+/*
+ * Consume a leading Unicode BOM from io and return the index of the
+ * encoding it denotes.  When no BOM is found, every byte read is pushed
+ * back and 0 is returned.  rb_io_getbyte() returns nil at EOF, so each
+ * byte is nil-checked before FIX2INT() instead of being decoded blindly.
+ */
+static int
+io_strip_bom(VALUE io) {
+    VALUE b1, b2, b3, b4;
+    if (NIL_P(b1 = rb_io_getbyte(io))) return 0;	/* empty stream */
+    switch (FIX2INT(b1)) {
+      case 0xEF:			/* EF BB BF: UTF-8 */
+	if (NIL_P(b2 = rb_io_getbyte(io))) break;
+	if (FIX2INT(b2) == 0xBB && !NIL_P(b3 = rb_io_getbyte(io))) {
+	    if (FIX2INT(b3) == 0xBF) {
+		return rb_utf8_encindex();
+	    }
+	    rb_io_ungetbyte(io, b3);
+	}
+	rb_io_ungetbyte(io, b2);
+	break;
+      case 0xFE:			/* FE FF: UTF-16BE */
+	if (NIL_P(b2 = rb_io_getbyte(io))) break;
+	if (FIX2INT(b2) == 0xFF) {
+	    return rb_enc_find_index("UTF-16BE");
+	}
+	rb_io_ungetbyte(io, b2);
+	break;
+      case 0xFF:			/* FF FE [00 00]: UTF-16LE or UTF-32LE */
+	if (NIL_P(b2 = rb_io_getbyte(io))) break;
+	/* FF FE is a UTF-16LE BOM unless 00 00 follows (UTF-32LE). */
+	if (FIX2INT(b2) == 0xFE) {
+	    b3 = rb_io_getbyte(io);
+	    if (!NIL_P(b3) && FIX2INT(b3) == 0 && !NIL_P(b4 = rb_io_getbyte(io))) {
+		if (FIX2INT(b4) == 0) {
+		    return rb_enc_find_index("UTF-32LE");
+		}
+		rb_io_ungetbyte(io, b4);
+	    }
+	    /* Not UTF-32LE: bytes after FF FE are content, so restore them. */
+	    if (!NIL_P(b3)) rb_io_ungetbyte(io, b3);
+	    return rb_enc_find_index("UTF-16LE");
+	}
+	rb_io_ungetbyte(io, b2);
+	break;
+      case 0:				/* 00 00 FE FF: UTF-32BE */
+	if (NIL_P(b2 = rb_io_getbyte(io))) break;
+	if (FIX2INT(b2) == 0 && !NIL_P(b3 = rb_io_getbyte(io))) {
+	    if (FIX2INT(b3) == 0xFE && !NIL_P(b4 = rb_io_getbyte(io))) {
+		if (FIX2INT(b4) == 0xFF) {
+		    return rb_enc_find_index("UTF-32BE");
+		}
+		rb_io_ungetbyte(io, b4);
+	    }
+	    rb_io_ungetbyte(io, b3);
+	}
+	rb_io_ungetbyte(io, b2);
+	break;
+    }
+    rb_io_ungetbyte(io, b1);
+    return 0;
+}
+
+/* Strip a leading BOM from io; when one is recognized, pass its encoding
+ * to io_encoding_set() together with io's current internal encoding. */
+static void
+io_set_encoding_by_bom(VALUE io) {
+    rb_io_t *fptr;
+    int bom_idx = io_strip_bom(io);
+    if (!bom_idx) return;	/* io_strip_bom() returns 0 when no BOM matched */
+    GetOpenFile(io, fptr);
+    io_encoding_set(fptr, rb_enc_from_encoding(rb_enc_from_index(bom_idx)),
+	    rb_io_internal_encoding(io), Qnil);
+}
+
 static VALUE
 rb_file_open_generic(VALUE io, VALUE filename, int oflags, int fmode, convconfig_t *convconfig, mode_t perm)
 {
@@ -4513,6 +4621,7 @@
     fptr->pathv = rb_str_new_frozen(filename);
     fptr->fd = rb_sysopen(fptr->pathv, oflags, perm);
     io_check_tty(fptr);
+    if (fmode & FMODE_STRIP_BOM) io_set_encoding_by_bom(io);
 
     return io;
 }
@@ -6250,6 +6359,7 @@
     else if (fileno(stderr) == fd)
 	fp->stdio_file = stderr;
 
+    if (fmode & FMODE_STRIP_BOM) io_set_encoding_by_bom(io);
     return io;
 }
 

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/

[前][次][番号順一覧][スレッド一覧]