ruby-changes:2640

akr	2007-12-08 11:50:43 +0900 (Sat, 08 Dec 2007)

  New Revision: 14131

  Modified files:
    trunk/ChangeLog
    trunk/encoding.c
    trunk/ext/tk/sample/tkextlib/vu/canvSticker2.rb
    trunk/include/ruby/encoding.h
    trunk/include/ruby/regex.h
    trunk/parse.y
    trunk/re.c
    trunk/string.c
    trunk/test/ruby/test_m17n.rb
    trunk/test/ruby/test_regexp.rb

  Log:
    * encoding.c (rb_enc_mbclen): make it never fail.
      (rb_enc_nth): don't check the return value of rb_enc_mbclen.
      (rb_enc_strlen): ditto.
      (rb_enc_precise_mbclen): return needmore(1) if e <= p.
      (rb_enc_get_ascii): new function for extracting ASCII character.
    
    * include/ruby/encoding.h (rb_enc_get_ascii): declared.
    
    * include/ruby/regex.h (ismbchar): removed.
    
    * re.c (rb_reg_expr_str): use rb_enc_get_ascii.
      (unescape_escaped_nonascii): use rb_enc_precise_mbclen to determine
      the termination of escaped non-ASCII character.
      (unescape_nonascii): use rb_enc_precise_mbclen.
      (rb_reg_quote): use rb_enc_get_ascii.
      (rb_reg_regsub): use rb_enc_get_ascii.
    
    * string.c (rb_str_reverse): don't check the return value of
      rb_enc_mbclen.
      (rb_str_split_m): don't call rb_enc_mbclen with e <= p.
    
    * parse.y (is_identchar): use ISASCII.
      (parser_ismbchar): removed.
      (parser_precise_mbclen): new macro.
      (parser_isascii): new macro.
      (parser_tokadd_mbchar): use parser_precise_mbclen to check invalid
      character precisely.
      (parser_tokadd_string): use parser_isascii.
      (parser_yylex): ditto.
      (is_special_global_name): don't call is_identchar with e <= p.
      (rb_enc_symname_p): ditto.
    
      [ruby-dev:32455]
    
    * ext/tk/sample/tkextlib/vu/canvSticker2.rb: remove coding cookie
      because the encoding is not UTF-8.  [ruby-dev:32475]


  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/string.c?r1=14131&r2=14130
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/parse.y?r1=14131&r2=14130
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/ChangeLog?r1=14131&r2=14130
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/include/ruby/encoding.h?r1=14131&r2=14130
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/test/ruby/test_regexp.rb?r1=14131&r2=14130
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/ext/tk/sample/tkextlib/vu/canvSticker2.rb?r1=14131&r2=14130
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/re.c?r1=14131&r2=14130
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/include/ruby/regex.h?r1=14131&r2=14130
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/encoding.c?r1=14131&r2=14130
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/test/ruby/test_m17n.rb?r1=14131&r2=14130

Index: encoding.c
===================================================================
--- encoding.c	(revision 14130)
+++ encoding.c	(revision 14131)
@@ -459,7 +459,6 @@
 	for (c=0; p<e && nth--; c++) {
 	    int n = rb_enc_mbclen(p, e, enc);
 
-	    if (n == 0) return 0;
 	    p += n;
 	}
     }
@@ -478,7 +477,6 @@
     for (c=0; p<e; c++) {
 	int n = rb_enc_mbclen(p, e, enc);
 
-	if (n == 0) return -1;
 	p += n;
     }
     return c;
@@ -487,19 +485,39 @@
 int
 rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
 {
-    int n = ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
-    if (n == 0) {
-	rb_raise(rb_eArgError, "invalid mbstring sequence");
-    }
-    return n;
+    int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
+    if (MBCLEN_CHARFOUND(n))
+        return n;
+    else
+        return 1;
 }
 
 int
 rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
 {
+    if (e <= p)
+        return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
     return ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
 }
 
+int rb_enc_get_ascii(const char *p, const char *e, rb_encoding *enc)
+{
+    int c, l;
+    if (e <= p)
+        return -1;
+    if (rb_enc_asciicompat(enc)) {
+        c = (unsigned char)*p;
+        return ISASCII(c) ? c : -1;
+    }
+    l = rb_enc_precise_mbclen(p, e, enc);
+    if (!MBCLEN_CHARFOUND(l))
+        return -1;
+    c = rb_enc_codepoint(p, e, enc);
+    if (rb_enc_isascii(c, enc))
+        return c;
+    return -1;
+}
+
 int
 rb_enc_codelen(int c, rb_encoding *enc)
 {
Index: include/ruby/regex.h
===================================================================
--- include/ruby/regex.h	(revision 14130)
+++ include/ruby/regex.h	(revision 14131)
@@ -29,7 +29,6 @@
 
 ONIG_EXTERN OnigEncoding    OnigEncDefaultCharEncoding;
 
-#define ismbchar(p, e, enc) (mbclen((p),(e),(enc)) != 1)
 #define mbclen(p,e,enc)  rb_enc_mbclen((p),(e),(enc))
 
 #endif /* ifndef ONIG_RUBY_M17N */
Index: include/ruby/encoding.h
===================================================================
--- include/ruby/encoding.h	(revision 14130)
+++ include/ruby/encoding.h	(revision 14131)
@@ -77,6 +77,9 @@
 #define MBCLEN_INVALID(ret)       ONIGENC_MBCLEN_INVALID(ret)
 #define MBCLEN_NEEDMORE(ret)      ONIGENC_MBCLEN_NEEDMORE(ret)
 
+/* ptr,endptr,encoding -> 0x00..0x7f, -1 */
+int rb_enc_get_ascii(const char*, const char *, rb_encoding*);
+
 /* code,encoding -> codelen */
 int rb_enc_codelen(int, rb_encoding*);
 
Index: re.c
===================================================================
--- re.c	(revision 14130)
+++ re.c	(revision 14131)
@@ -218,10 +218,12 @@
     rb_encoding *enc = rb_enc_get(str);
     const char *p, *pend;
     int need_escape = 0;
+    int c;
 
     p = s; pend = p + len;
     while (p<pend) {
-	if (*p == '/' || (!rb_enc_isprint(*p, enc) && !ismbchar(p, pend, enc))) {
+        c = rb_enc_get_ascii(p, pend, enc);
+	if (c == '/' || (c != -1 && !rb_enc_isprint(c, enc))) {
 	    need_escape = 1;
 	    break;
 	}
@@ -233,29 +235,31 @@
     else {
 	p = s;
 	while (p<pend) {
-	    if (*p == '\\') {
+            c = rb_enc_get_ascii(p, pend, enc);
+	    if (c == '\\') {
 		int n = mbclen(p+1, pend, enc) + 1;
 		rb_str_buf_cat(str, p, n);
 		p += n;
 		continue;
 	    }
-	    else if (*p == '/') {
+	    else if (c == '/') {
 		char c = '\\';
 		rb_str_buf_cat(str, &c, 1);
 		rb_str_buf_cat(str, p, 1);
 	    }
-	    else if (ismbchar(p, pend, enc)) {
-	    	rb_str_buf_cat(str, p, mbclen(p, pend, enc));
-		p += mbclen(p, pend, enc);
+	    else if (c == -1) {
+                int l = mbclen(p, pend, enc);
+	    	rb_str_buf_cat(str, p, l);
+		p += l;
 		continue;
 	    }
-	    else if (rb_enc_isprint(*p, enc)) {
+	    else if (rb_enc_isprint(c, enc)) {
 		rb_str_buf_cat(str, p, 1);
 	    }
-	    else if (!rb_enc_isspace(*p, enc)) {
+	    else if (!rb_enc_isspace(c, enc)) {
 		char b[8];
 
-		sprintf(b, "\\%03o", *p & 0377);
+		sprintf(b, "\\%03o", c);
 		rb_str_buf_cat(str, b, 4);
 	    }
 	    else {
@@ -1377,6 +1381,7 @@
     char *chbuf = ALLOCA_N(char, chmaxlen);
     int chlen = 0;
     int byte;
+    int l;
 
     memset(chbuf, 0, chmaxlen);
 
@@ -1386,7 +1391,8 @@
     }
 
     chbuf[chlen++] = byte;
-    while (chlen < chmaxlen && chlen != mbclen(chbuf, chbuf+chlen, enc)) {
+    while (chlen < chmaxlen &&
+           MBCLEN_NEEDMORE(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) {
         byte = read_escaped_byte(&p, end, err);
         if (byte == -1) {
             return -1;
@@ -1394,11 +1400,11 @@
         chbuf[chlen++] = byte;
     }
 
-    if (chlen != mbclen(chbuf, chbuf+chlen, enc)) {
+    l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc);
+    if (MBCLEN_INVALID(l)) {
         strcpy(err, "invalid multibyte escape");
         return -1;
     }
-
     if (1 < chlen || (chbuf[0] & 0x80)) {
         rb_str_buf_cat(buf, chbuf, chlen);
 
@@ -1515,13 +1521,12 @@
     char smallbuf[2];
 
     while (p < end) {
-        int chlen = mbclen(p, end, enc);
+        int chlen = rb_enc_precise_mbclen(p, end, enc);
+        if (!MBCLEN_CHARFOUND(chlen)) {
+            strcpy(err, "invalid multibyte character");
+            return -1;
+        }
         if (1 < chlen || (*p & 0x80)) {
-            if (end < p + chlen) {
-                strcpy(err, "too short multibyte character");
-                return -1;
-            }
-            /* xxx: validate the non-ascii character */
             rb_str_buf_cat(buf, p, chlen);
             p += chlen;
             if (*encp == 0)
@@ -2093,8 +2098,8 @@
     s = RSTRING_PTR(str);
     send = s + RSTRING_LEN(str);
     for (; s < send; s++) {
-	c = *s;
-	if (ismbchar(s, send, enc)) {
+        c = rb_enc_get_ascii(s, send, enc);
+	if (c == -1) {
 	    int n = mbclen(s, send, enc);
 
 	    while (n-- && s < send)
@@ -2129,8 +2134,8 @@
     t += s - RSTRING_PTR(str);
 
     for (; s < send; s++) {
-	c = *s;
-	if (ismbchar(s, send, enc)) {
+        c = rb_enc_get_ascii(s, send, enc);
+	if (c == -1) {
 	    int n = mbclen(s, send, enc);
 
 	    while (n-- && s < send)
@@ -2397,13 +2402,14 @@
     e = s + RSTRING_LEN(str);
 
     while (s < e) {
+        int c = rb_enc_get_ascii(s, e, enc);
 	char *ss = s++;
 
-	if (ismbchar(ss, e, enc)) {
+	if (c == -1) {
 	    s += mbclen(ss, e, enc) - 1;
 	    continue;
 	}
-	if (*ss != '\\' || s == e) continue;
+	if (c != '\\' || s == e) continue;
 
 	if (!val) {
 	    val = rb_str_buf_new(ss-p);
Index: ChangeLog
===================================================================
--- ChangeLog	(revision 14130)
+++ ChangeLog	(revision 14131)
@@ -1,3 +1,42 @@
+Sat Dec  8 11:06:29 2007  Tanaka Akira  <akr@f...>
+
+	* encoding.c (rb_enc_mbclen): make it never fail.
+	  (rb_enc_nth): don't check the return value of rb_enc_mbclen.
+	  (rb_enc_strlen): ditto.
+	  (rb_enc_precise_mbclen): return needmore(1) if e <= p.
+	  (rb_enc_get_ascii): new function for extracting ASCII character.
+
+	* include/ruby/encoding.h (rb_enc_get_ascii): declared.
+
+	* include/ruby/regex.h (ismbchar): removed.
+
+	* re.c (rb_reg_expr_str): use rb_enc_get_ascii.
+	  (unescape_escaped_nonascii): use rb_enc_precise_mbclen to determine
+	  the termination of escaped non-ASCII character.
+	  (unescape_nonascii): use rb_enc_precise_mbclen.
+	  (rb_reg_quote): use rb_enc_get_ascii.
+	  (rb_reg_regsub): use rb_enc_get_ascii.
+
+	* string.c (rb_str_reverse) don't check the return value of
+	  rb_enc_mbclen.
+	  (rb_str_split_m): don't call rb_enc_mbclen with e <= p.
+
+	* parse.y (is_identchar): use ISASCII.
+	  (parser_ismbchar): removed.
+	  (parser_precise_mbclen): new macro.
+	  (parser_isascii): new macro.
+	  (parser_tokadd_mbchar): use parser_precise_mbclen to check invalid
+	  character precisely.
+	  (parser_tokadd_string): use parser_isascii.
+	  (parser_yylex): ditto.
+	  (is_special_global_name): don't call is_identchar with e <= p.
+	  (rb_enc_symname_p): ditto.
+
+	  [ruby-dev:32455]
+
+	* ext/tk/sample/tkextlib/vu/canvSticker2.rb: remove coding cookie
+	  because the encoding is not UTF-8.  [ruby-dev:32475]
+
 Fri Dec  7 20:21:35 2007  GOTOU Yuuzou  <gotoyuzo@n...>
 
 	* ext/openssl/lib/net/ftptls.rb, ext/openssl/lib/net/telnets.rb:
Index: string.c
===================================================================
--- string.c	(revision 14130)
+++ string.c	(revision 14131)
@@ -2725,9 +2725,6 @@
 	    while (s < e) {
 		int clen = rb_enc_mbclen(s, e, enc);
 
-		if (clen == 0) {
-		    rb_raise(rb_eArgError, "invalid mbstring sequence");
-		}
 		p -= clen;
 		memcpy(p, s, clen);
 		s += clen;
@@ -4079,7 +4076,10 @@
 		    beg = start;
 		}
 		else {
-		    start += rb_enc_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc);
+                    if (RSTRING_PTR(str)+start == RSTRING_END(str))
+                        start++;
+                    else
+                        start += rb_enc_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc);
 		    last_null = 1;
 		    continue;
 		}
Index: parse.y
===================================================================
--- parse.y	(revision 14130)
+++ parse.y	(revision 14131)
@@ -4583,10 +4583,12 @@
 #endif
 
 #define parser_mbclen()  mbclen((lex_p-1),lex_pend,parser->enc)
-#define is_identchar(p,e,enc) (rb_enc_isalnum(*p,enc) || (*p) == '_' || ismbchar(p,e,enc))
-#define parser_ismbchar() ismbchar((lex_p-1), lex_pend, parser->enc)
+#define parser_precise_mbclen()  rb_enc_precise_mbclen((lex_p-1),lex_pend,parser->enc)
+#define is_identchar(p,e,enc) (rb_enc_isalnum(*p,enc) || (*p) == '_' || !ISASCII(*p))
 #define parser_is_identchar() (!parser->eofp && is_identchar((lex_p-1),lex_pend,parser->enc))
 
+#define parser_isascii() ISASCII(*(lex_p-1))
+
 static int
 parser_yyerror(struct parser_params *parser, const char *msg)
 {
@@ -5305,8 +5307,8 @@
 static int
 parser_tokadd_mbchar(struct parser_params *parser, int c)
 {
-    int len = parser_mbclen();
-    if (len <= 0 || lex_p + len - 1 > lex_pend) {
+    int len = parser_precise_mbclen();
+    if (!MBCLEN_CHARFOUND(len)) {
 	compile_error(PARSER_ARG "illegal multibyte char");
 	return -1;
     }
@@ -5414,7 +5416,7 @@
 		}
 	    }
 	}
-	else if (parser_ismbchar()) {
+	else if (!parser_isascii()) {
 	    has_nonascii = 1;
 	    if (enc != *encp) {
 		mixed_error(enc, *encp);
@@ -6306,7 +6308,7 @@
 	}
 	newtok();
 	enc = parser->enc;
-	if (parser_ismbchar()) {
+	if (!parser_isascii()) {
 	    if (tokadd_mbchar(c) == -1) return 0;
 	}
 	else if ((rb_enc_isalnum(c, parser->enc) || c == '_') &&
@@ -6889,7 +6891,7 @@
 	    }
 	    else {
 		term = nextc();
-		if (rb_enc_isalnum(term, parser->enc) || parser_ismbchar()) {
+		if (rb_enc_isalnum(term, parser->enc) || !parser_isascii()) {
 		    yyerror("unknown type of %string");
 		    return 0;
 		}
@@ -8693,7 +8695,7 @@
 	break;
       case '-':
 	++m;
-	if (is_identchar(m, e, enc)) {
+	if (m < e && is_identchar(m, e, enc)) {
 	    if (!ISASCII(*m)) mb = 1;
 	    m += rb_enc_mbclen(m, e, enc);
 	}
@@ -8776,9 +8778,9 @@
       default:
 	localid = !rb_enc_isupper(*m, enc);
       id:
-	if (*m != '_' && !rb_enc_isalpha(*m, enc) && !ismbchar(m, e, enc))
+	if (m >= e || (*m != '_' && !rb_enc_isalpha(*m, enc) && ISASCII(*m)))
 		  return Qfalse;
-	while (is_identchar(m, e, enc)) m += rb_enc_mbclen(m, e, enc);
+	while (m < e && is_identchar(m, e, enc)) m += rb_enc_mbclen(m, e, enc);
 	if (localid) {
 	    switch (*m) {
 	      case '!': case '?': case '=': ++m;
Index: ext/tk/sample/tkextlib/vu/canvSticker2.rb
===================================================================
--- ext/tk/sample/tkextlib/vu/canvSticker2.rb	(revision 14130)
+++ ext/tk/sample/tkextlib/vu/canvSticker2.rb	(revision 14131)
@@ -1,5 +1,4 @@
 #!/usr/bin/env ruby
-# -*- coding: utf-8 -*-
 
 require 'tk'
 require 'tkextlib/vu/charts'
Index: test/ruby/test_m17n.rb
===================================================================
--- test/ruby/test_m17n.rb	(revision 14130)
+++ test/ruby/test_m17n.rb	(revision 14131)
@@ -77,8 +77,8 @@
     assert_raise(SyntaxError) { eval('/\xc2/u') }
     assert_raise(SyntaxError) { eval('/\xe0\x80/u') }
     assert_raise(SyntaxError) { eval('/\xf0\x80\x80/u') }
-    #assert_raise(SyntaxError) { eval('/\xf8\x80\x80\x80/u') }
-    #assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') }
+    assert_raise(SyntaxError) { eval('/\xf8\x80\x80\x80/u') }
+    assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') }
 
     # raw 8bit
     assert_raise(SyntaxError) { eval("/\xfe/e") }
@@ -87,7 +87,7 @@
     # invalid suffix
     assert_raise(SyntaxError) { eval('/\xc2\xff/u') }
     assert_raise(SyntaxError) { eval('/\xc2 /u') }
-    #assert_raise(SyntaxError) { eval('/\xc2\x20/u') }
+    assert_raise(SyntaxError) { eval('/\xc2\x20/u') }
   end
 
   def assert_regexp_generic_encoding(r)
Index: test/ruby/test_regexp.rb
===================================================================
--- test/ruby/test_regexp.rb	(revision 14130)
+++ test/ruby/test_regexp.rb	(revision 14131)
@@ -20,7 +20,7 @@
 
   def test_yoshidam_net_20041111_2
     assert_raise(RegexpError) do
-      s = "[\xFF-\xFF]"
+      s = "[\xFF-\xFF]".force_encoding("utf-8")
       Regexp.new(s, nil, "u")
     end
   end
@@ -42,8 +42,8 @@
     assert_equal :ok, begin
       Regexp.union(
         "a",
-        Regexp.new("\x80".force_encoding("euc-jp")),
-        Regexp.new("\x80".force_encoding("utf-8")))
+        Regexp.new("\xc2\xa1".force_encoding("euc-jp")),
+        Regexp.new("\xc2\xa1".force_encoding("utf-8")))
       :ng
     rescue ArgumentError
       :ok

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml