[前][次][番号順一覧][スレッド一覧]

ruby-changes:25743

From: naruse <ko1@a...>
Date: Thu, 22 Nov 2012 17:47:45 +0900 (JST)
Subject: [ruby-changes:25743] naruse:r37800 (trunk): * ext/nkf/nkf-utf8: Merge b0a6577a521d1bba5e19853f95d5c4b9be1072b5.

naruse	2012-11-22 17:47:30 +0900 (Thu, 22 Nov 2012)

  New Revision: 37800

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=rev&revision=37800

  Log:
    * ext/nkf/nkf-utf8: Merge b0a6577a521d1bba5e19853f95d5c4b9be1072b5.
      Support JIS X 0213 and some bugfixes.

  Modified files:
    trunk/ChangeLog
    trunk/ext/nkf/nkf-utf8/nkf.c
    trunk/ext/nkf/nkf-utf8/nkf.h
    trunk/ext/nkf/nkf-utf8/utf8tbl.c
    trunk/ext/nkf/nkf-utf8/utf8tbl.h

Index: ChangeLog
===================================================================
--- ChangeLog	(revision 37799)
+++ ChangeLog	(revision 37800)
@@ -1,3 +1,8 @@
+Thu Nov 22 17:45:17 2012  NARUSE, Yui  <naruse@r...>
+
+	* ext/nkf/nkf-utf8: Merge b0a6577a521d1bba5e19853f95d5c4b9be1072b5.
+	  Support JIS X 0213 and some bugfixes.
+
 Thu Nov 22 17:39:37 2012  KOSAKI Motohiro  <kosaki.motohiro@g...>
 
 	* tool/gen_dummy_probes.rb: don't change #include, #if and #endif
@@ -136,7 +141,7 @@
 
 Tue Nov 20 21:22:44 2012  Masaki Suketa <masaki.suketa@n...>
 
-	* test/win32ole/test_win32ole_type.rb (test_implemented_ole_types): 
+	* test/win32ole/test_win32ole_type.rb (test_implemented_ole_types):
 	  IShellDispatch6 bundled in Windows 8. Thanks to phasis68 (Heesob
 	  Park).  [ruby-core:49580][Bug #7403]
 
@@ -440,7 +445,7 @@
 	* rational.c (read_num): ditto.
 
 Sun Nov 18 02:50:12 2012  Luis Lavena <luislavena@g...>
- 
+
 	* win32/file.c (replace_to_long_name): correct logic around wildcard
 	  characters detection and ensure wide-chars are used as pattern.
 	  [ruby-core:49451] [Bug #7374]
Index: ext/nkf/nkf-utf8/nkf.c
===================================================================
--- ext/nkf/nkf-utf8/nkf.c	(revision 37799)
+++ ext/nkf/nkf-utf8/nkf.c	(revision 37800)
@@ -21,7 +21,7 @@
  * 3. This notice may not be removed or altered from any source distribution.
  */
 #define NKF_VERSION "2.1.3"
-#define NKF_RELEASE_DATE "2012-09-13"
+#define NKF_RELEASE_DATE "2012-11-22"
 #define COPY_RIGHT \
     "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \
     "Copyright (C) 1996-2012, The nkf Project."
@@ -431,6 +431,8 @@
 #define nkf_char_unicode_bmp_p(c) ((c & VALUE_MASK) <= UNICODE_BMP_MAX)
 #define nkf_char_unicode_value_p(c) ((c & VALUE_MASK) <= UNICODE_MAX)
 
+#define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
+
 #ifdef NUMCHAR_OPTION
 static int numchar_f = FALSE;
 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
@@ -507,7 +509,7 @@
 /* process default */
 
 static nkf_char
-no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
+no_connection2(ARG_UNUSED nkf_char c2, ARG_UNUSED nkf_char c1, ARG_UNUSED nkf_char c0)
 {
     fprintf(stderr,"nkf internal module connection failure.\n");
     exit(EXIT_FAILURE);
@@ -621,7 +623,28 @@
     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
     0x00,0x00};
 
+/* X0201 kana to X0213 conversion table for han-daguten */
+/* 90-9F A0-DF */
+static const unsigned char ev_x0213[]= {
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x00,0x00,0x00,0x25,0x77,0x25,0x78,
+    0x25,0x79,0x25,0x7a,0x25,0x7b,0x00,0x00,
+    0x00,0x00,0x00,0x00,0x25,0x7c,0x00,0x00,
+    0x00,0x00,0x00,0x00,0x25,0x7d,0x00,0x00,
+    0x25,0x7e,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x00};
 
+
 /* X0208 kigou conversion table */
 /* 0x8140 - 0x819e */
 static const unsigned char fv[] = {
@@ -1288,6 +1311,7 @@
 	x0213_f = TRUE;
 #ifdef SHIFTJIS_CP932
 	cp51932_f = FALSE;
+	if (cp932inv_f == TRUE) cp932inv_f = FALSE;
 #endif
 	break;
     case EUC_JISX0213:
@@ -1358,6 +1382,7 @@
 #endif
 	break;
     case ISO_2022_JP_3:
+    case ISO_2022_JP_2004:
 	x0212_f = TRUE;
 	x0213_f = TRUE;
 #ifdef SHIFTJIS_CP932
@@ -1541,13 +1566,26 @@
 }
 #endif /* X0212_ENABLE */
 
+static int
+is_x0213_2_in_x0212(nkf_char c1)
+{
+    static const char x0213_2_table[] =
+	{0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1};
+    int ku = c1 - 0x20;
+    if (ku <= 15)
+	return x0213_2_table[ku]; /* 1, 3-5, 8, 12-15 */
+    if (78 <= ku && ku <= 94)
+	return 1;
+    return 0;
+}
+
 static nkf_char
 e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
 {
     nkf_char ndx;
     if (is_eucg3(c2)){
 	ndx = c2 & 0x7f;
-	if (x0213_f){
+	if (x0213_f && is_x0213_2_in_x0212(ndx)){
 	    if((0x21 <= ndx && ndx <= 0x2F)){
 		if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
 		if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
@@ -1593,7 +1631,7 @@
     static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
     if (0xFC < c1) return 1;
 #ifdef SHIFTJIS_CP932
-    if (!cp932inv_f && is_ibmext_in_sjis(c2)){
+    if (!cp932inv_f && !x0213_f && is_ibmext_in_sjis(c2)){
 	val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
 	if (val){
 	    c2 = val >> 8;
@@ -1696,7 +1734,7 @@
 	/* single byte */
 	wc = c1;
     }
-    else if (c1 <= 0xC3) {
+    else if (c1 <= 0xC1) {
 	/* trail byte or invalid */
 	return -1;
     }
@@ -1836,6 +1874,7 @@
 	    ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
 	    ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
 	    ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
+	    x0213_f ? utf8_to_euc_2bytes_x0213 :
 	    utf8_to_euc_2bytes;
 	ret =  unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
     }else if(c0 < 0xF0){
@@ -1903,6 +1942,7 @@
 	    ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
 	    ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
 	    ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
+	    x0213_f ? utf8_to_euc_3bytes_x0213 :
 	    utf8_to_euc_3bytes;
 	ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
     }else return -1;
@@ -1920,6 +1960,15 @@
 }
 
 #ifdef UTF8_OUTPUT_ENABLE
+#define X0213_SURROGATE_FIND(tbl, size, euc) do { \
+	int i; \
+	for (i = 0; i < size; i++) \
+	    if (tbl[i][0] == euc) { \
+		low = tbl[i][2]; \
+		break; \
+	    } \
+    } while (0)
+
 static nkf_char
 e2w_conv(nkf_char c2, nkf_char c1)
 {
@@ -1942,7 +1991,9 @@
 	}
 	c2 = (c2&0x7f) - 0x21;
 	if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
-	    p = x0212_to_utf8_2bytes[c2];
+	    p =
+		x0213_f ? x0212_to_utf8_2bytes_x0213[c2] :
+		x0212_to_utf8_2bytes[c2];
 	else
 	    return 0;
 #endif
@@ -1951,6 +2002,7 @@
 	c2 = (c2&0x7f) - 0x21;
 	if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
 	    p =
+		x0213_f ? euc_to_utf8_2bytes_x0213[c2] :
 		ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
 		ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
 		euc_to_utf8_2bytes_ms[c2];
@@ -1959,10 +2011,41 @@
     }
     if (!p) return 0;
     c1 = (c1 & 0x7f) - 0x21;
-    if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
-	return p[c1];
+    if (0<=c1 && c1<sizeof_euc_to_utf8_1byte) {
+	nkf_char val = p[c1];
+	if (x0213_f && 0xD800<=val && val<=0xDBFF) {
+	    nkf_char euc = (c2+0x21)<<8 | (c1+0x21);
+	    nkf_char low = 0;
+	    if (p==x0212_to_utf8_2bytes_x0213[c2]) {
+		X0213_SURROGATE_FIND(x0213_2_surrogate_table, sizeof_x0213_2_surrogate_table, euc);
+	    } else {
+		X0213_SURROGATE_FIND(x0213_1_surrogate_table, sizeof_x0213_1_surrogate_table, euc);
+	    }
+	    if (!low) return 0;
+	    return UTF16_TO_UTF32(val, low);
+	} else {
+	    return val;
+	}
+    }
     return 0;
 }
+
+static nkf_char
+e2w_combining(nkf_char comb, nkf_char c2, nkf_char c1)
+{
+    nkf_char euc;
+    int i;
+    for (i = 0; i < sizeof_x0213_combining_chars; i++)
+	if (x0213_combining_chars[i] == comb)
+	    break;
+    if (i >= sizeof_x0213_combining_chars)
+	return 0;
+    euc = (c2&0x7f)<<8 | (c1&0x7f);
+    for (i = 0; i < sizeof_x0213_combining_table; i++)
+	if (x0213_combining_table[i][0] == euc)
+	    return x0213_combining_table[i][1];
+    return 0;
+}
 #endif
 
 static nkf_char
@@ -2007,6 +2090,25 @@
 	}
     }
     else {
+	int i;
+	if (x0213_f) {
+	    c1 = (val >> 10) + NKF_INT32_C(0xD7C0);   /* high surrogate */
+	    c2 = (val & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
+	    for (i = 0; i < sizeof_x0213_1_surrogate_table; i++)
+		if (x0213_1_surrogate_table[i][1] == c1 && x0213_1_surrogate_table[i][2] == c2) {
+		    val = x0213_1_surrogate_table[i][0];
+		    *p2 = val >> 8;
+		    *p1 = val & 0xFF;
+		    return 0;
+		}
+	    for (i = 0; i < sizeof_x0213_2_surrogate_table; i++)
+		if (x0213_2_surrogate_table[i][1] == c1 && x0213_2_surrogate_table[i][2] == c2) {
+		    val = x0213_2_surrogate_table[i][0];
+		    *p2 = PREFIX_EUCG3 | (val >> 8);
+		    *p1 = val & 0xFF;
+		    return 0;
+		}
+	}
 	*p2 = 0;
 	*p1 = nkf_char_unicode_new(val);
     }
@@ -2079,7 +2181,7 @@
 }
 
 static nkf_char
-s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
+s_iconv(ARG_UNUSED nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0)
 {
     if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
 	if (iso2022jp_f && !x0201_f) {
@@ -2102,6 +2204,30 @@
     return 0;
 }
 
+static int
+x0213_wait_combining_p(nkf_char wc)
+{
+    int i;
+    for (i = 0; i < sizeof_x0213_combining_table; i++) {
+	if (x0213_combining_table[i][1] == wc) {
+	    return TRUE;
+	}
+    }
+    return FALSE;
+}
+
+static int
+x0213_combining_p(nkf_char wc)
+{
+    int i;
+    for (i = 0; i < sizeof_x0213_combining_chars; i++) {
+	if (x0213_combining_chars[i] == wc) {
+	    return TRUE;
+	}
+    }
+    return FALSE;
+}
+
 static nkf_char
 w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
 {
@@ -2169,6 +2295,8 @@
 	c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
 	c1 = 0;
     } else {
+	if (x0213_f && x0213_wait_combining_p(nkf_utf8_to_unicode(c1, c2, c3, c4)))
+	    return -3;
 	ret = w2e_conv(c1, c2, c3, &c1, &c2);
     }
     if (ret == 0){
@@ -2177,9 +2305,22 @@
     return ret;
 }
 
+static nkf_char
+w_iconv_nocombine(nkf_char c1, nkf_char c2, nkf_char c3)
+{
+    /* continue from the line below 'return -3;' in w_iconv() */
+    nkf_char ret = w2e_conv(c1, c2, c3, &c1, &c2);
+    if (ret == 0){
+	(*oconv)(c1, c2);
+    }
+    return ret;
+}
+
 #define NKF_ICONV_INVALID_CODE_RANGE -13
+#define NKF_ICONV_WAIT_COMBINING_CHAR -14
+#define NKF_ICONV_NOT_COMBINED -15
 static size_t
-unicode_iconv(nkf_char wc)
+unicode_iconv(nkf_char wc, int nocombine)
 {
     nkf_char c1, c2;
     int ret = 0;
@@ -2191,6 +2332,8 @@
 	/* unpaired surrogate */
 	return NKF_ICONV_INVALID_CODE_RANGE;
     }else if (wc < 0xFFFF) {
+	if (!nocombine && x0213_f && x0213_wait_combining_p(wc))
+	    return NKF_ICONV_WAIT_COMBINING_CHAR;
 	ret = w16e_conv(wc, &c2, &c1);
 	if (ret) return ret;
     }else if (wc < 0x10FFFF) {
@@ -2203,9 +2346,50 @@
     return 0;
 }
 
+static size_t
+unicode_iconv_combine(nkf_char wc, nkf_char wc2)
+{
+    nkf_char c1, c2;
+    int i;
+
+    if (wc2 < 0x80) {
+	return NKF_ICONV_NOT_COMBINED;
+    }else if ((wc2>>11) == 27) {
+	/* unpaired surrogate */
+	return NKF_ICONV_INVALID_CODE_RANGE;
+    }else if (wc2 < 0xFFFF) {
+	if (!x0213_combining_p(wc2))
+	    return NKF_ICONV_NOT_COMBINED;
+	for (i = 0; i < sizeof_x0213_combining_table; i++) {
+	    if (x0213_combining_table[i][1] == wc &&
+		x0213_combining_table[i][2] == wc2) {
+		c2 = x0213_combining_table[i][0] >> 8;
+		c1 = x0213_combining_table[i][0] & 0x7f;
+		(*oconv)(c2, c1);
+		return 0;
+	    }
+	}
+    }else if (wc2 < 0x10FFFF) {
+	return NKF_ICONV_NOT_COMBINED;
+    } else {
+	return NKF_ICONV_INVALID_CODE_RANGE;
+    }
+    return NKF_ICONV_NOT_COMBINED;
+}
+
+static nkf_char
+w_iconv_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4, nkf_char c5, nkf_char c6)
+{
+    nkf_char wc, wc2;
+    wc = nkf_utf8_to_unicode(c1, c2, c3, 0);
+    wc2 = nkf_utf8_to_unicode(c4, c5, c6, 0);
+    if (wc2 < 0)
+	return wc2;
+    return unicode_iconv_combine(wc, wc2);
+}
+
 #define NKF_ICONV_NEED_ONE_MORE_BYTE (size_t)-1
 #define NKF_ICONV_NEED_TWO_MORE_BYTES (size_t)-2
-#define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
 static size_t
 nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
 {
@@ -2234,33 +2418,63 @@
 	}
     }
 
-    return (*unicode_iconv)(wc);
+    return (*unicode_iconv)(wc, FALSE);
 }
 
+static size_t
+nkf_iconv_utf_16_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
+{
+    nkf_char wc, wc2;
+
+    if (input_endian == ENDIAN_BIG) {
+	if (0xD8 <= c3 && c3 <= 0xDB) {
+	    return NKF_ICONV_NOT_COMBINED;
+	} else {
+	    wc = c1 << 8 | c2;
+	    wc2 = c3 << 8 | c4;
+	}
+    } else {
+	if (0xD8 <= c2 && c2 <= 0xDB) {
+	    return NKF_ICONV_NOT_COMBINED;
+	} else {
+	    wc = c2 << 8 | c1;
+	    wc2 = c4 << 8 | c3;
+	}
+    }
+
+    return unicode_iconv_combine(wc, wc2);
+}
+
+static size_t
+nkf_iconv_utf_16_nocombine(nkf_char c1, nkf_char c2)
+{
+    nkf_char wc;
+    if (input_endian == ENDIAN_BIG)
+	wc = c1 << 8 | c2;
+    else
+	wc = c2 << 8 | c1;
+    return (*unicode_iconv)(wc, TRUE);
+}
+
 static nkf_char
-w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
+w_iconv16(nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0)
 {
     (*oconv)(c2, c1);
     return 16; /* different from w_iconv32 */
 }
 
 static nkf_char
-w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
+w_iconv32(nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0)
 {
     (*oconv)(c2, c1);
     return 32; /* different from w_iconv16 */
 }
 
-static size_t
-nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
+static nkf_char
+utf32_to_nkf_char(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
 {
     nkf_char wc;
 
-    if (c1 == EOF) {
-	(*oconv)(EOF, 0);
-	return 0;
-    }
-
     switch(input_endian){
     case ENDIAN_BIG:
 	wc = c2 << 16 | c3 << 8 | c4;
@@ -2277,9 +2491,49 @@
     default:
 	return NKF_ICONV_INVALID_CODE_RANGE;
     }
+    return wc;
+}
 
-    return (*unicode_iconv)(wc);
+static size_t
+nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
+{
+    nkf_char wc;
+
+    if (c1 == EOF) {
+	(*oconv)(EOF, 0);
+	return 0;
+    }
+
+    wc = utf32_to_nkf_char(c1, c2, c3, c4);
+    if (wc < 0)
+	return wc;
+
+    return (*unicode_iconv)(wc, FALSE);
 }
+
+static size_t
+nkf_iconv_utf_32_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4, nkf_char c5, nkf_char c6, nkf_char c7, nkf_char c8)
+{
+    nkf_char wc, wc2;
+
+    wc = utf32_to_nkf_char(c1, c2, c3, c4);
+    if (wc < 0)
+	return wc;
+    wc2 = utf32_to_nkf_char(c5, c6, c7, c8);
+    if (wc2 < 0)
+	return wc2;
+
+    return unicode_iconv_combine(wc, wc2);
+}
+
+static size_t
+nkf_iconv_utf_32_nocombine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
+{
+    nkf_char wc;
+
+    wc = utf32_to_nkf_char(c1, c2, c3, c4);
+    return (*unicode_iconv)(wc, TRUE);
+}
 #endif
 
 #define output_ascii_escape_sequence(mode) do { \
@@ -2533,11 +2787,19 @@
 }
 
 #ifdef UTF8_OUTPUT_ENABLE
+#define OUTPUT_UTF8(val) do { \
+	nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4); \
+	(*o_putc)(c1); \
+	if (c2) (*o_putc)(c2); \
+	if (c3) (*o_putc)(c3); \
+	if (c4) (*o_putc)(c4); \
+    } while (0)
+
 static void
 w_oconv(nkf_char c2, nkf_char c1)
 {
     nkf_char c3, c4;
-    nkf_char val;
+    nkf_char val, val2;
 
     if (output_bom_f) {
 	output_bom_f = FALSE;
@@ -2553,11 +2815,7 @@
 
     if (c2 == 0 && nkf_char_unicode_p(c1)){
 	val = c1 & VALUE_MASK;
-	nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
-	(*o_putc)(c1);
-	if (c2) (*o_putc)(c2);
-	if (c3) (*o_putc)(c3);
-	if (c4) (*o_putc)(c4);
+	OUTPUT_UTF8(val);
 	return;
     }
 
@@ -2566,27 +2824,46 @@
     } else {
 	val = e2w_conv(c2, c1);
 	if (val){
-	    nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
-	    (*o_putc)(c1);
-	    if (c2) (*o_putc)(c2);
-	    if (c3) (*o_putc)(c3);
-	    if (c4) (*o_putc)(c4);
+	    val2 = e2w_combining(val, c2, c1);
+	    if (val2)
+		OUTPUT_UTF8(val2);
+	    OUTPUT_UTF8(val);
 	}
     }
 }
 
+#define OUTPUT_UTF16_BYTES(c1, c2) do { \
+	if (output_endian == ENDIAN_LITTLE){ \
+	    (*o_putc)(c1); \
+	    (*o_putc)(c2); \
+	}else{ \
+	    (*o_putc)(c2); \
+	    (*o_putc)(c1); \
+	} \
+    } while (0)
+
+#define OUTPUT_UTF16(val) do { \
+	if (nkf_char_unicode_bmp_p(val)) { \
+	    c2 = (val >> 8) & 0xff; \
+	    c1 = val & 0xff; \
+	    OUTPUT_UTF16_BYTES(c1, c2); \
+	} else { \
+	    val &= VALUE_MASK; \
+	    if (val <= UNICODE_MAX) { \
+		c2 = (val >> 10) + NKF_INT32_C(0xD7C0);   /* high surrogate */ \
+		c1 = (val & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */ \
+		OUTPUT_UTF16_BYTES(c2 & 0xff, (c2 >> 8) & 0xff); \
+		OUTPUT_UTF16_BYTES(c1 & 0xff, (c1 >> 8) & 0xff); \
+	    } \
+	} \
+    } while (0)
+
 static void
 w_oconv16(nkf_char c2, nkf_char c1)
 {
     if (output_bom_f) {
 	output_bom_f = FALSE;
-	if (output_endian == ENDIAN_LITTLE){
-	    (*o_putc)(0xFF);
-	    (*o_putc)(0xFE);
-	}else{
-	    (*o_putc)(0xFE);
-	    (*o_putc)(0xFF);
-	}
+	OUTPUT_UTF16_BYTES(0xFF, 0xFE);
     }
 
     if (c2 == EOF) {
@@ -2595,44 +2872,34 @@
     }
 
     if (c2 == 0 && nkf_char_unicode_p(c1)) {
-	if (nkf_char_unicode_bmp_p(c1)) {
-	    c2 = (c1 >> 8) & 0xff;
-	    c1 &= 0xff;
-	} else {
-	    c1 &= VALUE_MASK;
-	    if (c1 <= UNICODE_MAX) {
-		c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0);   /* high surrogate */
-		c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
-		if (output_endian == ENDIAN_LITTLE){
-		    (*o_putc)(c2 & 0xff);
-		    (*o_putc)((c2 >> 8) & 0xff);
-		    (*o_putc)(c1 & 0xff);
-		    (*o_putc)((c1 >> 8) & 0xff);
-		}else{
-		    (*o_putc)((c2 >> 8) & 0xff);
-		    (*o_putc)(c2 & 0xff);
-		    (*o_putc)((c1 >> 8) & 0xff);
-		    (*o_putc)(c1 & 0xff);
-		}
-	    }
-	    return;
-	}
+	OUTPUT_UTF16(c1);
     } else if (c2) {
-	nkf_char val = e2w_conv(c2, c1);
-	c2 = (val >> 8) & 0xff;
-	c1 = val & 0xff;
+	nkf_char val, val2;
+	val = e2w_conv(c2, c1);
 	if (!val) return;
+	val2 = e2w_combining(val, c2, c1);
+	if (val2)
+	    OUTPUT_UTF16(val2);
+	OUTPUT_UTF16(val);
+    } else {
+	OUTPUT_UTF16_BYTES(c1, c2);
     }
-
-    if (output_endian == ENDIAN_LITTLE){
-	(*o_putc)(c1);
-	(*o_putc)(c2);
-    }else{
-	(*o_putc)(c2);
-	(*o_putc)(c1);
-    }
 }
 
+#define OUTPUT_UTF32(c) do { \
+	if (output_endian == ENDIAN_LITTLE){ \
+	    (*o_putc)( (c)        & 0xFF); \
+	    (*o_putc)(((c) >>  8) & 0xFF); \
+	    (*o_putc)(((c) >> 16) & 0xFF); \
+	    (*o_putc)(0); \
+	}else{ \
+	    (*o_putc)(0); \
+	    (*o_putc)(((c) >> 16) & 0xFF); \
+	    (*o_putc)(((c) >>  8) & 0xFF); \
+	    (*o_putc)( (c)        & 0xFF); \
+	} \
+    } while (0)
+
 static void
 w_oconv32(nkf_char c2, nkf_char c1)
 {
@@ -2661,20 +2928,15 @@
     } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
 	c1 &= VALUE_MASK;
     } else if (c2) {
-	c1 = e2w_conv(c2, c1);
-	if (!c1) return;
+	nkf_char val, val2;
+	val = e2w_conv(c2, c1);
+	if (!val) return;
+	val2 = e2w_combining(val, c2, c1);
+	if (val2)
+	    OUTPUT_UTF32(val2);
+	c1 = val;
     }
-    if (output_endian == ENDIAN_LITTLE){
-	(*o_putc)( c1        & 0xFF);
-	(*o_putc)((c1 >>  8) & 0xFF);
-	(*o_putc)((c1 >> 16) & 0xFF);
-	(*o_putc)(0);
-    }else{
-	(*o_putc)(0);
-	(*o_putc)((c1 >> 16) & 0xFF);
-	(*o_putc)((c1 >>  8) & 0xFF);
-	(*o_putc)( c1        & 0xFF);
-    }
+    OUTPUT_UTF32(c1);
 }
 #endif
 
@@ -2683,7 +2945,8 @@
 #define SCORE_DEPEND   (SCORE_KANA << 1)     /* MD Characters */
 #define SCORE_CP932    (SCORE_DEPEND << 1)   /* IBM extended characters */
 #define SCORE_X0212    (SCORE_CP932 << 1)    /* JIS X 0212 */
-#define SCORE_NO_EXIST (SCORE_X0212 << 1)    /* Undefined Characters */
+#define SCORE_X0213    (SCORE_X0212 << 1)    /* JIS X 0213 */
+#define SCORE_NO_EXIST (SCORE_X0213 << 1)    /* Undefined Characters */
 #define SCORE_iMIME    (SCORE_NO_EXIST << 1) /* MIME selected */
 #define SCORE_ERROR    (SCORE_iMIME << 1) /* Error */
 
@@ -2693,16 +2956,37 @@
     0, 0, 0, 0,
     0, 0, 0, 0,
     0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
-    SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
+    SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_X0213,
 };
 
 static const nkf_char score_table_F0[] = {
     SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
-    SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
+    SCORE_L2, SCORE_DEPEND, SCORE_X0 (... truncated)

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/

[前][次][番号順一覧][スレッド一覧]