ruby-changes:3115
From: ko1@a...
Date: 24 Dec 2007 22:51:41 +0900
Subject: [ruby-changes:3115] naruse - Ruby:r14607 (trunk): * transcode.c: register_functional_transcoder() added.
naruse 2007-12-24 22:51:19 +0900 (Mon, 24 Dec 2007) New Revision: 14607 Modified files: trunk/ChangeLog trunk/transcode.c trunk/transcode_data.h trunk/transcode_data_japanese.c Log: * transcode.c: register_functional_transcoder() added. (init_transcoder_table): register ISO-2022-JP. (str_transcode): add preprocessor and postprocessor. * transcode_data_japanese.c: add ISO-2022-JP support. * transcode_data.h: moved transcoder and transcoding definition from transcode.c. http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/transcode_data.h?r1=14607&r2=14606 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/ChangeLog?r1=14607&r2=14606 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/transcode_data_japanese.c?r1=14607&r2=14606 http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/transcode.c?r1=14607&r2=14606 Index: ChangeLog =================================================================== --- ChangeLog (revision 14606) +++ ChangeLog (revision 14607) @@ -1,3 +1,14 @@ +Mon Dec 24 22:46:42 2007 NARUSE, Yui <naruse@r...> + + * transocode.c: register_functional_transcoder() added. + (init_transcoder_table(: register ISO-2022-JP. + (str_transcode): add preprocessor and postprocessor. + + * transcode_data_japanese.c: add ISO-2022-JP support. + + * transcode_data.h: moved transcoder and transcoding difinition from + transcode.c. + Mon Dec 24 20:29:28 2007 Koichi Sasada <ko1@a...> * test/io/nonblock/test_flush.rb: fix test for 1.9. 
Index: transcode_data_japanese.c =================================================================== --- transcode_data_japanese.c (revision 14606) +++ transcode_data_japanese.c (revision 14607) @@ -23618,4 +23618,209 @@ to_EUC_JP_infos }; -/* Footprint (bytes): gross: 212680, saved: 50764, net: 161916 */ +#define ISO_2022_ENCODING(escseq, byte) ((escseq<<8)|byte) +enum ISO_2022_ESCSEQ { + ISO_2022_CZD = '!', + ISO_2022_C1D = '"', + ISO_2022_GZD4 = '(', + ISO_2022_G1D4 = ')', + ISO_2022_G2D4 = '*', + ISO_2022_G3D4 = '+', + ISO_2022_G1D6 = '-', + ISO_2022_G2D6 = '.', + ISO_2022_G3D6 = '/', + ISO_2022_GZDM4 = ISO_2022_ENCODING('$','('), + ISO_2022_G1DM4 = ISO_2022_ENCODING('$',')'), + ISO_2022_G2DM4 = ISO_2022_ENCODING('$','*'), + ISO_2022_G3DM4 = ISO_2022_ENCODING('$','+'), + ISO_2022_G1DM6 = ISO_2022_ENCODING('$','-'), + ISO_2022_G2DM6 = ISO_2022_ENCODING('$','.'), + ISO_2022_G3DM6 = ISO_2022_ENCODING('$','/'), + ISO_2022_DOCS = ISO_2022_ENCODING('%','I'), + ISO_2022_IRR = '&' +}; + + +#define ISO_2022_GZ_ASCII ISO_2022_ENCODING(ISO_2022_GZD4, 'B') +#define ISO_2022_GZ_JIS_X_0201_Katakana ISO_2022_ENCODING(ISO_2022_GZD4, 'I') +#define ISO_2022_GZ_JIS_X_0201_Roman ISO_2022_ENCODING(ISO_2022_GZD4, 'J') +#define ISO_2022_GZ_JIS_C_6226_1978 ISO_2022_ENCODING(ISO_2022_GZDM4,'@') +#define ISO_2022_GZ_JIS_X_0208_1983 ISO_2022_ENCODING(ISO_2022_GZDM4,'B') +#define ISO_2022_GZ_JIS_X_0212_1990 ISO_2022_ENCODING(ISO_2022_GZDM4,'D') +#define ISO_2022_GZ_JIS_X_0213_2000_1 ISO_2022_ENCODING(ISO_2022_GZDM4,'O') +#define ISO_2022_GZ_JIS_X_0213_2000_2 ISO_2022_ENCODING(ISO_2022_GZDM4,'P') +#define ISO_2022_GZ_JIS_X_0213_2004_1 ISO_2022_ENCODING(ISO_2022_GZDM4,'Q') + +static int +get_iso_2022_mode(char **in_pos) +{ + int new_mode; + char *in_p = *in_pos; + switch (*in_p++) + { + case '(': + switch (*in_p++) + { + case 'B': case 'I': case 'J': + new_mode = ISO_2022_ENCODING(ISO_2022_GZD4, *(in_p-1)); + break; + default: + rb_raise(rb_eRuntimeError /*change exception*/, "this mode is 
not supported (ESC ( %c)", *(in_p-1)); + break; + } + break; + case '$': + switch (*in_p++) + { + case '@': case 'A': case 'B': + new_mode = ISO_2022_ENCODING(ISO_2022_GZDM4, *(in_p-1)); + break; + case '(': + switch (*in_p++) + { + case 'D': case 'O': case 'P': case 'Q': + new_mode = ISO_2022_ENCODING(ISO_2022_GZDM4, *(in_p-1)); + break; + default: + rb_raise(rb_eRuntimeError /*change exception*/, "this mode is not supported (ESC $ ( %c)", *(in_p-1)); + break; + } + break; + default: + rb_raise(rb_eRuntimeError /*change exception*/, "this mode is not supported (ESC $ %c)", *(in_p-1)); + break; + } + break; + default: + rb_raise(rb_eRuntimeError /*change exception*/, "this mode is not supported (ESC %c)", *(in_p-1)); + break; + } + *in_pos = in_p; + return new_mode; +} + +void +from_iso_2022_jp_transcoder_preprocessor(char **in_pos, char **out_pos, + char *in_stop, char *out_stop, + transcoder *my_transcoder, + transcoding *my_transcoding) +{ + char *in_p = *in_pos, *out_p = *out_pos; + int cur_mode = ISO_2022_GZ_ASCII; + unsigned char c1; + char *out_s = out_stop - my_transcoder->max_output + 1; + while (in_p < in_stop) { + if (out_p >= out_s) { + int len = (out_p - *out_pos); + int new_len = (len + my_transcoder->max_output) * 2; + *out_pos = (*my_transcoding->flush_func)(my_transcoding, len, new_len); + out_p = *out_pos + len; + out_s = *out_pos + new_len - my_transcoder->max_output; + } + c1 = *in_p++; + if (c1 == 0x1B) { + cur_mode = get_iso_2022_mode(&in_p); + } else if (c1 == 0x1E || c1 == 0x1F) { + /* SHIFT */ + rb_raise(rb_eRuntimeError /*change exception*/, "shift is not supported"); + } else if (c1 >= 0x80) { + rb_raise(rb_eRuntimeError /*change exception*/, "illegal byte sequence"); + } else { + switch (cur_mode) { + case ISO_2022_GZ_ASCII: + case ISO_2022_GZ_JIS_X_0201_Roman: + *out_p++ = c1; + break; + case ISO_2022_GZ_JIS_X_0201_Katakana: + *out_p++ = 0x8E; + *out_p++ = c1 | 0x80; + break; + case ISO_2022_GZ_JIS_X_0212_1990: + *out_p++ = 0x8F; + case 
ISO_2022_GZ_JIS_C_6226_1978: + case ISO_2022_GZ_JIS_X_0208_1983: + *out_p++ = c1 | 0x80; + *out_p++ = *in_p++ | 0x80; + break; + } + } + } + /* cleanup */ + *in_pos = in_p; + *out_pos = out_p; +} + +static int +select_iso_2022_mode(char **out_pos, int new_mode) +{ + char *out_p = *out_pos; + *out_p++ = '\e'; + switch (new_mode>>8) + { + case ISO_2022_GZD4: + *out_p++ = new_mode >> 8; + *out_p++ = new_mode & 0x7F; + break; + case ISO_2022_GZDM4: + *out_p++ = new_mode >> 16; + if ((new_mode & 0x7F) != '@' && + (new_mode & 0x7F) != 'A' && + (new_mode & 0x7F) != 'B') + { + *out_p++ = (new_mode>>8) & 0x7F; + } + *out_p++ = new_mode & 0x7F; + break; + default: + rb_raise(rb_eRuntimeError /*change exception*/, "this mode is not supported."); + break; + } + *out_pos = out_p; + return new_mode; +} + +void +to_iso_2022_jp_transcoder_postprocessor(char **in_pos, char **out_pos, + char *in_stop, char *out_stop, + transcoder *my_transcoder, + transcoding *my_transcoding) +{ + char *in_p = *in_pos, *out_p = *out_pos; + int cur_mode = ISO_2022_GZ_ASCII, new_mode = 0; + unsigned char next_byte; + char *out_s = out_stop - my_transcoder->max_output + 1; + while (in_p < in_stop) { + if (out_p >= out_s) { + int len = (out_p - *out_pos); + int new_len = (len + my_transcoder->max_output) * 2; + *out_pos = (*my_transcoding->flush_func)(my_transcoding, len, new_len); + out_p = *out_pos + len; + out_s = *out_pos + new_len - my_transcoder->max_output; + } + next_byte = *in_p++; + if (next_byte < 0x80) { + new_mode = ISO_2022_GZ_ASCII; + } else if (next_byte == 0x8E) { + new_mode = ISO_2022_GZ_JIS_X_0201_Katakana; + next_byte = *in_p++; + } else if (next_byte == 0x8F) { + new_mode = ISO_2022_GZ_JIS_X_0212_1990; + next_byte = *in_p++; + } else { + new_mode = ISO_2022_GZ_JIS_X_0208_1983; + } + if (cur_mode != new_mode) + cur_mode = select_iso_2022_mode(&out_p, new_mode); + if (cur_mode < 0xFFFF) { + *out_p++ = next_byte & 0x7F; + } else { + *out_p++ = next_byte & 0x7F; + *out_p++ = *in_p++ & 
0x7F; + } + } + if (cur_mode != ISO_2022_GZ_ASCII) + cur_mode = select_iso_2022_mode(&out_p, ISO_2022_GZ_ASCII); + /* cleanup */ + *in_pos = in_p; + *out_pos = out_p; +} Index: transcode_data.h =================================================================== --- transcode_data.h (revision 14606) +++ transcode_data.h (revision 14607) @@ -1,3 +1,20 @@ +/********************************************************************** + + transcode_data.h - + + $Author$ + $Date$ + created at: Mon 10 Dec 2007 14:01:47 JST 2007 + + Copyright (C) 2007 Martin Duerst + +**********************************************************************/ + +#include "ruby/ruby.h" + +#ifndef RUBY_TRANSCODE_DATA_H +#define RUBY_TRANSCODE_DATA_H 1 + typedef unsigned char base_element; typedef struct byte_lookup { @@ -37,3 +54,25 @@ #define TWOTRAIL /* legal but undefined if two more trailing UTF-8 */ #define THREETRAIL /* legal but undefined if three more trailing UTF-8 */ +/* dynamic structure, one per conversion (similar to iconv_t) */ +/* may carry conversion state (e.g. 
for iso-2022-jp) */ +typedef struct transcoding { + VALUE ruby_string_dest; /* the String used as the conversion destination, + or NULL if something else is being converted */ + char *(*flush_func)(struct transcoding*, int, int); +} transcoding; + +/* static structure, one per supported encoding pair */ +typedef struct transcoder_st{ + const char *from_encoding; + const char *to_encoding; + const BYTE_LOOKUP *conv_tree_start; + int max_output; + int from_utf8; + void (*preprocessor)(char**, char**, char*, char*, + struct transcoder_st *transcoder, struct transcoding*); + void (*postprocessor)(char**, char**, char*, char*, + struct transcoder_st *transcoder, struct transcoding*); +} transcoder; + +#endif /* RUBY_TRANSCODE_DATA_H */ Index: transcode.c =================================================================== --- transcode.c (revision 14606) +++ transcode.c (revision 14607) @@ -60,22 +60,17 @@ extern const BYTE_LOOKUP rb_to_SHIFT_JIS; extern const BYTE_LOOKUP rb_to_EUC_JP; +extern void from_iso_2022_jp_transcoder_preprocessor(char**, char**, char*, char*, + struct transcoder_st *transcoder, struct transcoding*); +extern void to_iso_2022_jp_transcoder_postprocessor(char**, char**, char*, char*, + struct transcoder_st *transcoder, struct transcoding*); /* declarations probably need to go into separate header file, e.g. 
transcode.h */ -/* static structure, one per supported encoding pair */ -typedef struct { - const char *from_encoding; - const char *to_encoding; - const BYTE_LOOKUP *conv_tree_start; - int max_output; - int from_utf8; -} transcoder; - /* todo: dynamic structure, one per conversion (stream) */ /* in the future, add some mechanism for dynamically adding stuff here */ -#define MAX_TRANSCODERS 33 /* todo: fix: this number has to be adjusted by hand */ +#define MAX_TRANSCODERS 35 /* todo: fix: this number has to be adjusted by hand */ static transcoder transcoder_table[MAX_TRANSCODERS]; /* not sure why it's not possible to do relocatable initializations */ @@ -100,6 +95,29 @@ } static void +register_functional_transcoder(const char *from_e, const char *to_e, + const BYTE_LOOKUP *tree_start, int max_output, int from_utf8, + void (*preprocessor)(char**, char**, char*, char*, transcoder*, transcoding*), + void (*postprocessor)(char**, char**, char*, char*, transcoder*, transcoding*)) +{ + static int n = 0; + if (n >= MAX_TRANSCODERS) { + /* we are initializing, is it okay to use rb_raise here? 
*/ + rb_raise(rb_eRuntimeError /*change exception*/, "not enough transcoder slots"); + } + transcoder_table[n].from_encoding = from_e; + transcoder_table[n].to_encoding = to_e; + transcoder_table[n].conv_tree_start = tree_start; + transcoder_table[n].max_output = max_output; + transcoder_table[n].from_utf8 = from_utf8; + transcoder_table[n].conv_tree_start = tree_start; + transcoder_table[n].preprocessor = preprocessor; + transcoder_table[n].postprocessor = postprocessor; + + n++; +} + +static void init_transcoder_table(void) { register_transcoder("ISO-8859-1", "UTF-8", &rb_from_ISO_8859_1, 2, 0); @@ -135,6 +153,10 @@ register_transcoder("EUC-JP", "UTF-8", &rb_from_EUC_JP, 3, 0); register_transcoder("UTF-8", "SHIFT_JIS", &rb_to_SHIFT_JIS, 2, 1); register_transcoder("UTF-8", "EUC-JP", &rb_to_EUC_JP, 2, 1); + register_functional_transcoder("ISO-2022-JP", "UTF-8", &rb_from_EUC_JP, + 8, 0, &from_iso_2022_jp_transcoder_preprocessor, NULL); + register_functional_transcoder("UTF-8", "ISO-2022-JP", &rb_to_EUC_JP, + 8, 1, NULL, &to_iso_2022_jp_transcoder_postprocessor); register_transcoder(NULL, NULL, NULL, 0, 0); } @@ -165,15 +187,7 @@ return NULL; } -/* dynamic structure, one per conversion (similar to iconv_t) */ -/* may carry conversion state (e.g. 
for iso-2022-jp) */ -typedef struct transcoding { - VALUE ruby_string_dest; /* the String used as the conversion destination, - or NULL if something else is being converted */ - char *(*flush_func)(struct transcoding*, int, int); -} transcoding; - /* * Transcoding engine logic */ @@ -331,6 +345,23 @@ rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_e, to_e); } + if (my_transcoder->preprocessor) + { + fromp = sp = RSTRING_PTR(str); + slen = RSTRING_LEN(str); + blen = slen + 30; /* len + margin */ + dest = rb_str_tmp_new(blen); + bp = RSTRING_PTR(dest); + my_transcoding.ruby_string_dest = dest; + (*my_transcoder->preprocessor)(&fromp, &bp, (sp+slen), (bp+blen), my_transcoder, &my_transcoding); + if (fromp != sp+slen) { + rb_raise(rb_eArgError, "not fully converted, %d bytes left", sp+slen-fromp); + } + buf = RSTRING_PTR(dest); + *bp = '\0'; + rb_str_set_len(dest, bp - buf); + str = dest; + } fromp = sp = RSTRING_PTR(str); slen = RSTRING_LEN(str); blen = slen + 30; /* len + margin */ @@ -346,6 +377,23 @@ buf = RSTRING_PTR(dest); *bp = '\0'; rb_str_set_len(dest, bp - buf); + if (my_transcoder->postprocessor) + { + str = dest; + fromp = sp = RSTRING_PTR(str); + slen = RSTRING_LEN(str); + blen = slen + 30; /* len + margin */ + dest = rb_str_tmp_new(blen); + bp = RSTRING_PTR(dest); + my_transcoding.ruby_string_dest = dest; + (*my_transcoder->postprocessor)(&fromp, &bp, (sp+slen), (bp+blen), my_transcoder, &my_transcoding); + if (fromp != sp+slen) { + rb_raise(rb_eArgError, "not fully converted, %d bytes left", sp+slen-fromp); + } + buf = RSTRING_PTR(dest); + *bp = '\0'; + rb_str_set_len(dest, bp - buf); + } if (encoding_equal(my_transcoder->to_encoding, to_e)) { final_encoding = 1; -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml