ruby-changes:12306
From: naruse <ko1@a...>
Date: Thu, 9 Jul 2009 23:48:05 +0900 (JST)
Subject: [ruby-changes:12306] Ruby:r24001 (trunk): Set encoding and strip bom when modeenc string is "r:foo-bom"
naruse 2009-07-09 23:47:48 +0900 (Thu, 09 Jul 2009) New Revision: 24001 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=rev&revision=24001 Log: Set encoding and strip bom when modeenc string is "r:foo-bom" [ruby-dev:37236] * include/ruby/io.h (FMODE_STRIP_BOM): new constant. * io.c (io_encname_bom_p): judge whether the encoding name is *-bom or not. (parse_mode_enc): drop "-bom". (rb_io_modestr_fmode): set FMODE_STRIP_BOM if needed. (rb_io_extract_modeenc): ditto. (io_strip_bom): strip bom if exists. (io_set_encoding_by_bom): set encoding if there is bom. Modified files: trunk/ChangeLog trunk/include/ruby/io.h trunk/io.c Index: include/ruby/io.h =================================================================== --- include/ruby/io.h (revision 24000) +++ include/ruby/io.h (revision 24001) @@ -94,6 +94,7 @@ #define FMODE_TEXTMODE 0x00001000 #define FMODE_EOF 0x00002000 /* #define FMODE_PREP 0x00010000 */ +#define FMODE_STRIP_BOM 0x00100000 #define GetOpenFile(obj,fp) rb_io_check_closed((fp) = RFILE(rb_io_taint_check(obj))->fptr) Index: ChangeLog =================================================================== --- ChangeLog (revision 24000) +++ ChangeLog (revision 24001) @@ -1,3 +1,17 @@ +Thu Jul 9 23:28:48 2009 NARUSE, Yui <naruse@r...> + + * include/ruby/io.h (FMODE_STRIP_BOM): new constant. + + * io.c (io_encname_bom_p): judge whether the encoding name + is *-bom or not. + (parse_mode_enc): drop "-bom". + (rb_io_modestr_fmode): set FMODE_STRIP_BOM if needed. + (rb_io_extract_modeenc): ditto. + (io_strip_bom): strip bom if exists. + (io_set_encoding_by_bom): set encoding if there is bom. + Set encoding and strip bom when modeenc string is "r:foo-bom" + [ruby-dev:37236] + Thu Jul 9 21:56:59 2009 NARUSE, Yui <naruse@r...> * marshal.c (r_object0): replace \u by u when the regexp is Index: io.c =================================================================== --- io.c (revision 24000) +++ io.c (revision 24001) @@ -3934,11 +3934,26 @@ return NULL; /* not reached */ } +static int +io_encname_bom_p(const char *name, long len) { + if (len) { + if (len > 4 && strncasecmp(name + len - 4, "-bom", 4) == 0) + return 1; + } + else { + const char *p = strchr(name, ':'); + if (!p) p = name + strlen(name); + if (p - name > 4 && strncasecmp(p - 4, "-bom", 4) == 0) + return 1; + } + return 0; +} + int rb_io_modestr_fmode(const char *modestr) { int fmode = 0; - const char *m = modestr; + const char *m = modestr, *p = NULL; switch (*m++) { case 'r': @@ -3969,6 +3984,7 @@ default: goto error; case ':': + p = m; goto finished; } } @@ -3976,6 +3992,8 @@ finished: if ((fmode & FMODE_BINMODE) && (fmode & FMODE_TEXTMODE)) goto error; + if (p && io_encname_bom_p(p, 0)) + fmode |= FMODE_STRIP_BOM; return fmode; } @@ -4126,14 +4144,24 @@ if (len == 0 || len > ENCODING_MAXNAMELEN) idx = -1; else { + if (io_encname_bom_p(estr, len)) + len -= 4; memcpy(encname, estr, len); encname[len] = '\0'; estr = encname; idx = rb_enc_find_index(encname); } } - else + else { + long len = strlen(estr); + if (io_encname_bom_p(estr, len)) { + len -= 4; + memcpy(encname, estr, len); + encname[len] = '\0'; + estr = encname; + } idx = rb_enc_find_index(estr); + } if (idx >= 0) ext_enc = rb_enc_from_index(idx); @@ -4309,6 +4337,8 @@ if (p) { has_enc = 1; parse_mode_enc(p+1, &enc, &enc2); + if (io_encname_bom_p(p+1, 0)) + fmode |= FMODE_STRIP_BOM; } else { rb_encoding *e; @@ -4493,6 +4523,84 @@ fptr->mode |= FMODE_TTY|FMODE_DUPLEX; } +static VALUE rb_io_internal_encoding(VALUE); +static void io_encoding_set(rb_io_t *, VALUE, VALUE, VALUE); + +static int +io_strip_bom(VALUE io) { + int b1, b2, b3, b4; + switch (b1 = FIX2INT(rb_io_getbyte(io))) { + case 0xEF: + b2 = FIX2INT(rb_io_getbyte(io)); + if (b2 == 0xBB) { + b3 = FIX2INT(rb_io_getbyte(io)); + if (b3 == 0xBF) { + return rb_utf8_encindex(); + } + rb_io_ungetbyte(io, INT2FIX(b3)); + } + rb_io_ungetbyte(io, INT2FIX(b2)); + break; + + case 0xFE: + b2 = FIX2INT(rb_io_getbyte(io)); + if (b2 == 0xFF) { + return rb_enc_find_index("UTF-16BE"); + } + rb_io_ungetbyte(io, INT2FIX(b2)); + break; + + case 0xFF: + b2 = FIX2INT(rb_io_getbyte(io)); + if (b2 == 0xFF) { + b3 = FIX2INT(rb_io_getbyte(io)); + if (b3 == 0) { + b4 = FIX2INT(rb_io_getbyte(io)); + if (b4 == 0) { + return rb_enc_find_index("UTF-32LE"); + } + rb_io_ungetbyte(io, INT2FIX(b4)); + } + else { + return rb_enc_find_index("UTF-16LE"); + } + rb_io_ungetbyte(io, INT2FIX(b3)); + } + rb_io_ungetbyte(io, INT2FIX(b2)); + break; + + case 0: + b2 = FIX2INT(rb_io_getbyte(io)); + if (b2 == 0) { + b3 = FIX2INT(rb_io_getbyte(io)); + if (b3 == 0xFE) { + b4 = FIX2INT(rb_io_getbyte(io)); + if (b4 == 0xFF) { + return rb_enc_find_index("UTF-32BE"); + } + rb_io_ungetbyte(io, INT2FIX(b4)); + } + rb_io_ungetbyte(io, INT2FIX(b3)); + } + rb_io_ungetbyte(io, INT2FIX(b2)); + break; + } + rb_io_ungetbyte(io, INT2FIX(b1)); + return 0; +} + +static void +io_set_encoding_by_bom(VALUE io) { + int idx = io_strip_bom(io); + + if (idx) { + rb_io_t *fptr; + GetOpenFile(io, fptr); + io_encoding_set(fptr, rb_enc_from_encoding(rb_enc_from_index(idx)), + rb_io_internal_encoding(io), Qnil); + } +} + static VALUE rb_file_open_generic(VALUE io, VALUE filename, int oflags, int fmode, convconfig_t *convconfig, mode_t perm) { @@ -4513,6 +4621,7 @@ fptr->pathv = rb_str_new_frozen(filename); fptr->fd = rb_sysopen(fptr->pathv, oflags, perm); io_check_tty(fptr); + if (fmode & FMODE_STRIP_BOM) io_set_encoding_by_bom(io); return io; } @@ -6250,6 +6359,7 @@ else if (fileno(stderr) == fd) fp->stdio_file = stderr; + if (fmode & FMODE_STRIP_BOM) io_set_encoding_by_bom(io); return io; } -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml/