ruby-changes:6985
From: akr <ko1@a...>
Date: Tue, 12 Aug 2008 07:44:48 +0900 (JST)
Subject: [ruby-changes:6985] Ruby:r18503 (trunk): * transcode_data.h (rb_transcoder): add resetstate_func field for
akr 2008-08-12 07:44:23 +0900 (Tue, 12 Aug 2008) New Revision: 18503 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=rev&revision=18503 Log: * transcode_data.h (rb_transcoder): add resetstate_func field for resetting a state of stateful encoding. * enc/trans/iso2022.trans (rb_EUC_JP_to_ISO_2022_JP): specify finish_eucjp_to_iso2022jp for resetstate_func. * tool/transcode-tblgen.rb: specify NULL for resetstate_func. * transcode.c (output_replacement_character): call resetstate_func before appending the replacement character. Modified files: trunk/ChangeLog trunk/enc/trans/iso2022.trans trunk/test/ruby/test_transcode.rb trunk/tool/transcode-tblgen.rb trunk/transcode.c trunk/transcode_data.h Index: ChangeLog =================================================================== --- ChangeLog (revision 18502) +++ ChangeLog (revision 18503) @@ -1,3 +1,16 @@ +Tue Aug 12 07:41:13 2008 Tanaka Akira <akr@f...> + + * transcode_data.h (rb_transcoder): add resetstate_func field for + resetting a state of stateful encoding. + + * enc/trans/iso2022.trans (rb_EUC_JP_to_ISO_2022_JP): specify + finish_eucjp_to_iso2022jp for resetstate_func. + + * tool/transcode-tblgen.rb: specify NULL for resetstate_func. + + * transcode.c (output_replacement_character): call resetstate_func + before appending the replacement character. + Tue Aug 12 07:19:24 2008 Tanaka Akira <akr@f...> * transcode.c (get_replacement_character): extracted from Index: enc/trans/iso2022.trans =================================================================== --- enc/trans/iso2022.trans (revision 18502) +++ enc/trans/iso2022.trans (revision 18503) @@ -136,7 +136,8 @@ 1, /* input_unit_length */ 3, /* max_input */ 5, /* max_output */ - NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp, finish_eucjp_to_iso2022jp + NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp, + finish_eucjp_to_iso2022jp, finish_eucjp_to_iso2022jp }; void Index: transcode_data.h =================================================================== --- transcode_data.h (revision 18502) +++ transcode_data.h (revision 18503) @@ -95,6 +95,7 @@ VALUE (*func_si)(rb_transcoding*, const unsigned char*, size_t); /* start -> info */ int (*func_io)(rb_transcoding*, VALUE, const unsigned char*); /* info -> output */ int (*func_so)(rb_transcoding*, const unsigned char*, size_t, unsigned char*); /* start -> output */ + int (*resetstate_func)(rb_transcoding*, unsigned char*); /* -> output */ int (*finish_func)(rb_transcoding*, unsigned char*); /* -> output */ }; Index: tool/transcode-tblgen.rb =================================================================== --- tool/transcode-tblgen.rb (revision 18502) +++ tool/transcode-tblgen.rb (revision 18503) @@ -446,7 +446,7 @@ #{input_unit_length}, /* input_unit_length */ #{max_input}, /* max_input */ #{max_output}, /* max_output */ - NULL, NULL, NULL, NULL, NULL + NULL, NULL, NULL, NULL, NULL, NULL }; End tree_code + "\n" + transcoder_code Index: test/ruby/test_transcode.rb =================================================================== --- test/ruby/test_transcode.rb (revision 18502) +++ test/ruby/test_transcode.rb (revision 18503) @@ -303,6 +303,9 @@ "\xdc\x00".encode("EUC-JP", "UTF-16BE", :invalid=>:replace), "[ruby-dev:35776]") assert_equal("ab?cd?ef", "\0a\0b\xdc\x00\0c\0d\xdf\x00\0e\0f".encode("EUC-JP", "UTF-16BE", :invalid=>:replace)) + + assert_equal("\e$B!!\e(B?".force_encoding("ISO-2022-JP"), + "\xA1\xA1\xFF".encode("ISO-2022-JP", "EUC-JP", invalid: :replace)) end def test_undef_replace Index: transcode.c =================================================================== --- transcode.c (revision 18502) +++ transcode.c (revision 18503) @@ -292,19 +292,6 @@ } } -static void -output_replacement_character(unsigned char **out_pp, rb_encoding *enc) -{ - const char *replacement; - int len; - replacement = get_replacement_character(enc, &len); - - memcpy(*out_pp, replacement, len); - - *out_pp += len; - return; -} - /* * Transcoding engine logic */ @@ -818,6 +805,62 @@ *out_stop_ptr = *out_start_ptr + new_len; } +static void +output_replacement_character( + VALUE destination, + unsigned char *(*resize_destination)(VALUE, int, int), + rb_trans_t *ts, + unsigned char **out_start_ptr, + unsigned char **out_pos, + unsigned char **out_stop_ptr) + +{ + rb_transcoding *tc; + const rb_transcoder *tr; + int max_output; + rb_encoding *enc; + const char *replacement; + int len; + + tc = ts->elems[ts->num_trans-1].tc; + tr = tc->transcoder; + max_output = tr->max_output; + enc = rb_enc_find(tr->to_encoding); + + /* + * Assumption for stateful encoding: + * + * - The replacement character can be output on resetted state and doesn't + * change the state. + * - it is acceptable that extra state changing sequence if the replacement + * character contains a state changing sequence. + * + * Currently the replacement character for stateful encoding such as + * ISO-2022-JP is "?" and it has no state changing sequence. + * So the extra state changing sequence don't occur. + * + * Thease assumption may be removed in future. + * It needs to scan the replacement character to check + * state changing sequences in the replacement character. + */ + + if (tr->resetstate_func) { + if (*out_stop_ptr - *out_pos < max_output) + more_output_buffer(destination, resize_destination, ts, out_start_ptr, out_pos, out_stop_ptr); + *out_pos += tr->resetstate_func(tc, *out_pos); + } + + if (*out_stop_ptr - *out_pos < max_output) + more_output_buffer(destination, resize_destination, ts, out_start_ptr, out_pos, out_stop_ptr); + + replacement = get_replacement_character(enc, &len); + + memcpy(*out_pos, replacement, len); + + *out_pos += len; + return; +} + #if 1 static void transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, @@ -848,9 +891,7 @@ goto resume; } else if (opt&INVALID_REPLACE) { - if (out_stop - *out_pos < max_output) - more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop); - output_replacement_character(out_pos, rb_enc_find(to_encoding)); + output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop); goto resume; } rb_trans_close(ts); @@ -864,9 +905,7 @@ goto resume; } else if (opt&UNDEF_REPLACE) { - if (out_stop - *out_pos < max_output) - more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop); - output_replacement_character(out_pos, rb_enc_find(to_encoding)); + output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop); goto resume; } rb_trans_close(ts); @@ -931,9 +970,7 @@ break; } else if (opt&INVALID_REPLACE) { - if (out_stop - *out_pos < max_output) - more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop); - output_replacement_character(out_pos, rb_enc_find(to_encoding)); + output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop); break; } rb_trans_close(ts); @@ -948,9 +985,7 @@ break; } else if (opt&UNDEF_REPLACE) { - if (out_stop - *out_pos < max_output) - more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop); - output_replacement_character(out_pos, rb_enc_find(to_encoding)); + output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop); break; } rb_trans_close(ts); -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml/