ruby-changes:3273
From: ko1@a...
Date: 28 Dec 2007 18:27:32 +0900
Subject: [ruby-changes:3273] duerst - Ruby:r14766 (trunk): Fri Dec 28 01:55:04 2007 Martin Duerst <duerst@i...>
duerst	2007-12-28 18:26:55 +0900 (Fri, 28 Dec 2007)

  New Revision: 14766

  Modified files:
    trunk/ChangeLog
    trunk/enc/trans/single_byte.c
    trunk/test/ruby/test_transcode.rb
    trunk/transcode.c
    trunk/transcode_data.h

  Log:
    Fri Dec 28 01:55:04 2007 Martin Duerst <duerst@i...>

    * transcode.c (transcode_dispatch): reverted some of the changes
      in r14746.

    * transcode.c, enc/trans/single_byte.c: Added conversions to/from
      US-ASCII and ASCII-8BIT (using data tables).

    * enc/trans/single_byte.c: Some spacing/ordering changes due to
      automatic data file generation.

    * transcode_data.h, transcode.c: Preliminary code for using
      micro-conversion functions.

    * test/ruby/test_transcode.rb: Added some tests for US-ASCII and
      ASCII-8BIT conversions.

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/transcode_data.h?r1=14766&r2=14765
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/test/ruby/test_transcode.rb?r1=14766&r2=14765
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/ChangeLog?r1=14766&r2=14765
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/transcode.c?r1=14766&r2=14765
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/enc/trans/single_byte.c?r1=14766&r2=14765

Index: ChangeLog
===================================================================
--- ChangeLog	(revision 14765)
+++ ChangeLog	(revision 14766)
@@ -1,3 +1,20 @@
+Fri Dec 28 01:55:04 2007 Martin Duerst <duerst@i...>
+
+ * transcode.c (transcode_dispatch): reverted some of the changes
+   in r14746.
+
+ * transcode.c, enc/trans/single_byte.c: Added conversions to/from
+   US-ASCII and ASCII-8BIT (using data tables).
+
+ * enc/trans/single_byte.c: Some spacing/ordering changes due to
+   automatic data file generation.
+
+ * transcode_data.h, transcode.c: Preliminary code for using
+   micro-conversion functions.
+
+ * test/ruby/test_transcode.rb: Added some tests for US-ASCII and
+   ASCII-8BIT conversions.
+
 Fri Dec 28 17:33:44 2007 Tanaka Akira <akr@f...>
 
  * time.c (make_time_t): verify mktime and timegm result.
Index: enc/trans/single_byte.c =================================================================== --- enc/trans/single_byte.c (revision 14765) +++ enc/trans/single_byte.c (revision 14766) @@ -1,6 +1,63 @@ #include "transcode_data.h" static const unsigned char +from_US_ASCII_offsets[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; +static const struct byte_lookup* const +from_US_ASCII_infos[2] = { + NOMAP, UNDEF, +}; +static const BYTE_LOOKUP +from_US_ASCII = { + /* used from from_US_ASCII */ + /* used from to_US_ASCII */ + /* used from to_ASCII_8BIT */ + /* used from from_ASCII_8BIT */ + from_US_ASCII_offsets, + from_US_ASCII_infos +}; + +static rb_transcoder +rb_from_US_ASCII = { + "US-ASCII", "UTF-8", &from_US_ASCII, 1, 0, + NULL, NULL, +}; + +static rb_transcoder +rb_to_US_ASCII = { + "UTF-8", "US-ASCII", &from_US_ASCII, 1, 1, + NULL, NULL, +}; + +static rb_transcoder +rb_from_ASCII_8BIT = { + "ASCII-8BIT", "UTF-8", &from_US_ASCII, 1, 0, + NULL, NULL, +}; + +static rb_transcoder +rb_to_ASCII_8BIT = { + "UTF-8", "ASCII-8BIT", &from_US_ASCII, 1, 1, + NULL, NULL, +}; + +static const unsigned char from_ISO_8859_1_offsets[256] = { /* used from from_ISO_8859_1 */ /* used from from_ISO_8859_2 */ @@ -69,6 +126,7 @@ 
from_ISO_8859_1_offsets, from_ISO_8859_1_infos }; + static rb_transcoder rb_from_ISO_8859_1 = { "ISO-8859-1", "UTF-8", &from_ISO_8859_1, 2, 0, @@ -167,6 +225,7 @@ to_ISO_8859_1_offsets, to_ISO_8859_1_infos }; + static rb_transcoder rb_to_ISO_8859_1 = { "UTF-8", "ISO-8859-1", &to_ISO_8859_1, 1, 1, @@ -214,6 +273,7 @@ from_ISO_8859_1_offsets, from_ISO_8859_2_infos }; + static rb_transcoder rb_from_ISO_8859_2 = { "ISO-8859-2", "UTF-8", &from_ISO_8859_2, 2, 0, @@ -370,6 +430,7 @@ to_ISO_8859_2_offsets, to_ISO_8859_2_infos }; + static rb_transcoder rb_to_ISO_8859_2 = { "UTF-8", "ISO-8859-2", &to_ISO_8859_2, 1, 1, @@ -434,6 +495,7 @@ from_ISO_8859_3_offsets, from_ISO_8859_3_infos }; + static rb_transcoder rb_from_ISO_8859_3 = { "ISO-8859-3", "UTF-8", &from_ISO_8859_3, 2, 0, @@ -565,6 +627,7 @@ to_ISO_8859_2_offsets, to_ISO_8859_3_infos }; + static rb_transcoder rb_to_ISO_8859_3 = { "UTF-8", "ISO-8859-3", &to_ISO_8859_3, 1, 1, @@ -612,6 +675,7 @@ from_ISO_8859_1_offsets, from_ISO_8859_4_infos }; + static rb_transcoder rb_from_ISO_8859_4 = { "ISO-8859-4", "UTF-8", &from_ISO_8859_4, 2, 0, @@ -747,6 +811,7 @@ to_ISO_8859_2_offsets, to_ISO_8859_4_infos }; + static rb_transcoder rb_to_ISO_8859_4 = { "UTF-8", "ISO-8859-4", &to_ISO_8859_4, 1, 1, @@ -826,6 +891,7 @@ from_ISO_8859_1_offsets, from_ISO_8859_5_infos }; + static rb_transcoder rb_from_ISO_8859_5 = { "ISO-8859-5", "UTF-8", &from_ISO_8859_5, 3, 0, @@ -977,6 +1043,7 @@ to_ISO_8859_5_offsets, to_ISO_8859_5_infos }; + static rb_transcoder rb_to_ISO_8859_5 = { "UTF-8", "ISO-8859-5", &to_ISO_8859_5, 1, 1, @@ -1032,6 +1099,7 @@ from_ISO_8859_6_offsets, from_ISO_8859_6_infos }; + static rb_transcoder rb_from_ISO_8859_6 = { "ISO-8859-6", "UTF-8", &from_ISO_8859_6, 2, 0, @@ -1138,6 +1206,7 @@ to_ISO_8859_6_offsets, to_ISO_8859_6_infos }; + static rb_transcoder rb_to_ISO_8859_6 = { "UTF-8", "ISO-8859-6", &to_ISO_8859_6, 1, 1, @@ -1235,6 +1304,7 @@ from_ISO_8859_7_offsets, from_ISO_8859_7_infos }; + static rb_transcoder 
rb_from_ISO_8859_7 = { "ISO-8859-7", "UTF-8", &from_ISO_8859_7, 3, 0, @@ -1421,6 +1491,7 @@ to_ISO_8859_7_offsets, to_ISO_8859_7_infos }; + static rb_transcoder rb_to_ISO_8859_7 = { "UTF-8", "ISO-8859-7", &to_ISO_8859_7, 1, 1, @@ -1501,6 +1572,7 @@ from_ISO_8859_8_offsets, from_ISO_8859_8_infos }; + static rb_transcoder rb_from_ISO_8859_8 = { "ISO-8859-8", "UTF-8", &from_ISO_8859_8, 3, 0, @@ -1646,6 +1718,7 @@ to_ISO_8859_8_offsets, to_ISO_8859_8_infos }; + static rb_transcoder rb_to_ISO_8859_8 = { "UTF-8", "ISO-8859-8", &to_ISO_8859_8, 1, 1, @@ -1693,6 +1766,7 @@ from_ISO_8859_1_offsets, from_ISO_8859_9_infos }; + static rb_transcoder rb_from_ISO_8859_9 = { "ISO-8859-9", "UTF-8", &from_ISO_8859_9, 2, 0, @@ -1795,6 +1869,7 @@ to_ISO_8859_9_offsets, to_ISO_8859_9_infos }; + static rb_transcoder rb_to_ISO_8859_9 = { "UTF-8", "ISO-8859-9", &to_ISO_8859_9, 1, 1, @@ -1874,6 +1949,7 @@ from_ISO_8859_1_offsets, from_ISO_8859_10_infos }; + static rb_transcoder rb_from_ISO_8859_10 = { "ISO-8859-10", "UTF-8", &from_ISO_8859_10, 3, 0, @@ -2031,6 +2107,7 @@ to_ISO_8859_10_offsets, to_ISO_8859_10_infos }; + static rb_transcoder rb_to_ISO_8859_10 = { "UTF-8", "ISO-8859-10", &to_ISO_8859_10, 1, 1, @@ -2125,6 +2202,7 @@ from_ISO_8859_11_offsets, from_ISO_8859_11_infos }; + static rb_transcoder rb_from_ISO_8859_11 = { "ISO-8859-11", "UTF-8", &from_ISO_8859_11, 3, 0, @@ -2258,6 +2336,7 @@ to_ISO_8859_11_offsets, to_ISO_8859_11_infos }; + static rb_transcoder rb_to_ISO_8859_11 = { "UTF-8", "ISO-8859-11", &to_ISO_8859_11, 1, 1, @@ -2337,6 +2416,7 @@ from_ISO_8859_1_offsets, from_ISO_8859_13_infos }; + static rb_transcoder rb_from_ISO_8859_13 = { "ISO-8859-13", "UTF-8", &from_ISO_8859_13, 3, 0, @@ -2481,6 +2561,7 @@ to_ISO_8859_10_offsets, to_ISO_8859_13_infos }; + static rb_transcoder rb_to_ISO_8859_13 = { "UTF-8", "ISO-8859-13", &to_ISO_8859_13, 1, 1, @@ -2560,6 +2641,7 @@ from_ISO_8859_1_offsets, from_ISO_8859_14_infos }; + static rb_transcoder rb_from_ISO_8859_14 = { "ISO-8859-14", 
"UTF-8", &from_ISO_8859_14, 3, 0, @@ -2781,6 +2863,7 @@ to_ISO_8859_14_offsets, to_ISO_8859_14_infos }; + static rb_transcoder rb_to_ISO_8859_14 = { "UTF-8", "ISO-8859-14", &to_ISO_8859_14, 1, 1, @@ -2860,6 +2943,7 @@ from_ISO_8859_1_offsets, from_ISO_8859_15_infos }; + static rb_transcoder rb_from_ISO_8859_15 = { "ISO-8859-15", "UTF-8", &from_ISO_8859_15, 3, 0, @@ -2979,6 +3063,7 @@ to_ISO_8859_15_offsets, to_ISO_8859_15_infos }; + static rb_transcoder rb_to_ISO_8859_15 = { "UTF-8", "ISO-8859-15", &to_ISO_8859_15, 1, 1, @@ -2988,33 +3073,37 @@ void Init_single_byte(void) { + rb_register_transcoder(&rb_from_US_ASCII); + rb_register_transcoder(&rb_to_US_ASCII); + rb_register_transcoder(&rb_from_ASCII_8BIT); + rb_register_transcoder(&rb_to_ASCII_8BIT); rb_register_transcoder(&rb_from_ISO_8859_1); + rb_register_transcoder(&rb_to_ISO_8859_1); rb_register_transcoder(&rb_from_ISO_8859_2); + rb_register_transcoder(&rb_to_ISO_8859_2); rb_register_transcoder(&rb_from_ISO_8859_3); + rb_register_transcoder(&rb_to_ISO_8859_3); rb_register_transcoder(&rb_from_ISO_8859_4); + rb_register_transcoder(&rb_to_ISO_8859_4); rb_register_transcoder(&rb_from_ISO_8859_5); + rb_register_transcoder(&rb_to_ISO_8859_5); rb_register_transcoder(&rb_from_ISO_8859_6); + rb_register_transcoder(&rb_to_ISO_8859_6); rb_register_transcoder(&rb_from_ISO_8859_7); + rb_register_transcoder(&rb_to_ISO_8859_7); rb_register_transcoder(&rb_from_ISO_8859_8); + rb_register_transcoder(&rb_to_ISO_8859_8); rb_register_transcoder(&rb_from_ISO_8859_9); + rb_register_transcoder(&rb_to_ISO_8859_9); rb_register_transcoder(&rb_from_ISO_8859_10); + rb_register_transcoder(&rb_to_ISO_8859_10); rb_register_transcoder(&rb_from_ISO_8859_11); + rb_register_transcoder(&rb_to_ISO_8859_11); rb_register_transcoder(&rb_from_ISO_8859_13); + rb_register_transcoder(&rb_to_ISO_8859_13); rb_register_transcoder(&rb_from_ISO_8859_14); + rb_register_transcoder(&rb_to_ISO_8859_14); rb_register_transcoder(&rb_from_ISO_8859_15); - 
rb_register_transcoder(&rb_to_ISO_8859_1); - rb_register_transcoder(&rb_to_ISO_8859_2); - rb_register_transcoder(&rb_to_ISO_8859_3); - rb_register_transcoder(&rb_to_ISO_8859_4); - rb_register_transcoder(&rb_to_ISO_8859_5); - rb_register_transcoder(&rb_to_ISO_8859_6); - rb_register_transcoder(&rb_to_ISO_8859_7); - rb_register_transcoder(&rb_to_ISO_8859_8); - rb_register_transcoder(&rb_to_ISO_8859_9); - rb_register_transcoder(&rb_to_ISO_8859_10); - rb_register_transcoder(&rb_to_ISO_8859_11); - rb_register_transcoder(&rb_to_ISO_8859_13); - rb_register_transcoder(&rb_to_ISO_8859_14); rb_register_transcoder(&rb_to_ISO_8859_15); } -/* Footprint (bytes): gross: 26788, saved: 3728, net: 23060 */ +/* Footprint (bytes): gross: 27876, saved: 4544, net: 23332 */ Index: transcode_data.h =================================================================== --- transcode_data.h (revision 14765) +++ transcode_data.h (revision 14766) @@ -27,25 +27,28 @@ #define PType (const BYTE_LOOKUP *) #endif -#define NOMAP (PType 0x01) /* single byte direct map */ -#define ONEbt (0x02) /* one byte payload */ -#define TWObt (0x03) /* two bytes payload */ -#define THREEbt (0x05) /* three bytes payload */ -#define FOURbt (0x06) /* four bytes payload, UTF-8 only, macros start at getBT0 */ -#define INVALID (PType 0x07) /* invalid byte sequence */ -#define UNDEF (PType 0x09) /* legal but undefined */ -#define ZERObt (PType 0x0A) /* zero bytes of payload, i.e. remove */ +#define NOMAP (PType 0x01) /* single byte direct map */ +#define ONEbt (0x02) /* one byte payload */ +#define TWObt (0x03) /* two bytes payload */ +#define THREEbt (0x05) /* three bytes payload */ +#define FOURbt (0x06) /* four bytes payload, UTF-8 only, macros start at getBT0 */ +#define INVALID (PType 0x07) /* invalid byte sequence */ +#define UNDEF (PType 0x09) /* legal but undefined */ +#define ZERObt (PType 0x0A) /* zero bytes of payload, i.e. 
remove */ +#define FUNii (PType 0x0B) /* function from info to info */ -#define o1(b1) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|ONEbt)) -#define o2(b1,b2) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|TWObt)) -#define o3(b1,b2,b3) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|(((unsigned char)(b3))<<24)|THREEbt)) -#define o4(b0,b1,b2,b3) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<< 8)|(((unsigned char)(b2))<<16)|(((unsigned char)(b3))<<24)|((((unsigned char)(b0))&0x07)<<5)|FOURbt)) +#define o1(b1) (PType((((unsigned char)(b1))<<8)|ONEbt)) +#define o2(b1,b2) (PType((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|TWObt)) +#define o3(b1,b2,b3) (PType((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|(((unsigned char)(b3))<<24)|THREEbt)) +#define o4(b0,b1,b2,b3) (PType((((unsigned char)(b1))<< 8)|(((unsigned char)(b2))<<16)|(((unsigned char)(b3))<<24)|((((unsigned char)(b0))&0x07)<<5)|FOURbt)) -#define getBT1(a) (((a)>> 8)&0xFF) -#define getBT2(a) (((a)>>16)&0xFF) -#define getBT3(a) (((a)>>24)&0xFF) -#define getBT0(a) ((((a)>> 5)&0x07)|0xF0) /* for UTF-8 only!!! */ +#define getBT1(a) (((a)>> 8)&0xFF) +#define getBT2(a) (((a)>>16)&0xFF) +#define getBT3(a) (((a)>>24)&0xFF) +#define getBT0(a) ((((a)>> 5)&0x07)|0xF0) /* for UTF-8 only!!! */ +#define o2FUNii(b1,b2) (PType((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|FUNii)) + /* do we need these??? 
maybe not, can be done with simple tables */ #define ONETRAIL /* legal but undefined if one more trailing UTF-8 */ #define TWOTRAIL /* legal but undefined if two more trailing UTF-8 */ @@ -70,6 +73,7 @@ struct rb_transcoder *, struct rb_transcoding *); void (*postprocessor)(char**, char**, char*, char*, struct rb_transcoder *, struct rb_transcoding *); + VALUE (*func_ii)(VALUE); /* function from info to info */ } rb_transcoder; void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib); Index: test/ruby/test_transcode.rb =================================================================== --- test/ruby/test_transcode.rb (revision 14765) +++ test/ruby/test_transcode.rb (revision 14766) @@ -26,6 +26,8 @@ assert_raise(ArgumentError) { 'abc'.encode!('foo', 'bar') } assert_raise(ArgumentError) { 'abc'.force_encoding('utf-8').encode('foo') } assert_raise(ArgumentError) { 'abc'.force_encoding('utf-8').encode!('foo') } + assert_raise(RuntimeError) { "\x80".encode('utf-8','ASCII-8BIT') } + assert_raise(RuntimeError) { "\x80".encode('utf-8','US-ASCII') } assert_raise(RuntimeError) { "\xA5".encode('utf-8','iso-8859-3') } end @@ -87,6 +89,7 @@ def test_ascii_range encodings = [ + 'US-ASCII', 'ASCII-8BIT', 'ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4', 'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8', 'ISO-8859-9', Index: transcode.c =================================================================== --- transcode.c (revision 14765) +++ transcode.c (revision 14766) @@ -89,6 +89,8 @@ static void init_transcoder_table(void) { + rb_declare_transcoder("US-ASCII", "UTF-8", "single_byte"); + rb_declare_transcoder("ASCII-8BIT", "UTF-8", "single_byte"); rb_declare_transcoder("ISO-8859-1", "UTF-8", "single_byte"); rb_declare_transcoder("ISO-8859-2", "UTF-8", "single_byte"); rb_declare_transcoder("ISO-8859-3", "UTF-8", "single_byte"); @@ -173,6 +175,7 @@ follow_byte: next_offset = next_table->base[next_byte]; next_info = 
(VALUE)next_table->info[next_offset]; + follow_info: switch (next_info & 0x1F) { case NOMAP: *out_p++ = next_byte; @@ -191,7 +194,7 @@ else goto invalid; } - next_table = next_table->info[next_offset]; + next_table = (const BYTE_LOOKUP *)next_info; goto follow_byte; /* maybe rewrite the following cases to use fallthrough???? */ case ZERObt: /* drop input */ @@ -210,6 +213,9 @@ *out_p++ = getBT2(next_info); *out_p++ = getBT3(next_info); continue; + case FUNii: + next_info = (VALUE)(*my_transcoder->func_ii)(next_info); + goto follow_info; case INVALID: goto invalid; case UNDEF: @@ -287,7 +293,7 @@ return -1; } if (from_enc && to_enc && rb_enc_asciicompat(from_enc) && rb_enc_asciicompat(to_enc)) { - if (to_encidx == 0 || ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) { + if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) { return to_encidx; } } @@ -295,25 +301,6 @@ return -1; } - if (from_encidx == 0) { - const char *p = RSTRING_PTR(str); - const char *e = p + RSTRING_LEN(str); - - while (p < e) { - int ret = rb_enc_precise_mbclen(p, e, to_enc); - int len = MBCLEN_CHARFOUND(ret); - - if (!len) { - rb_raise(rb_eArgError, "not fully converted, %d bytes left", e-p); - } - p += len; - } - if (to_encidx < 0) { - to_encidx = rb_define_dummy_encoding(to_e); - } - return to_encidx; - } - while (!final_encoding) { /* loop for multistep transcoding */ /* later, maybe use smaller intermediate strings for very long strings */ if (!(my_transcoder = transcode_dispatch(from_e, to_e))) { @@ -412,6 +399,7 @@ /* * call-seq: * str.encode(encoding) => str + * str.encode(to_encoding, from_encoding) => str * * With one argument, returns a copy of <i>str</i> transcoded * to encoding +encoding+. -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml