ruby-changes:4205
From: ko1@a...
Date: Wed, 5 Mar 2008 22:55:24 +0900 (JST)
Subject: [ruby-changes:4205] naruse - Ruby:r15695 (trunk): * string.c (is_utf8_lead_byte, count_utf8_lead_bytes_with_ulong):
naruse 2008-03-05 22:54:36 +0900 (Wed, 05 Mar 2008)

  New Revision: 15695

  Modified files:
    trunk/ChangeLog
    trunk/string.c

  Log:
    * string.c (is_utf8_lead_byte, count_utf8_lead_bytes_with_ulong):
      defined for UTF-8 optimization.

    * string.c (str_strlen): use is_utf8_lead_byte and
      count_utf8_lead_bytes_with_ulong.

    * string.c (str_utf8_nth) ditto.

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/string.c?r1=15695&r2=15694&diff_format=u
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/ChangeLog?r1=15695&r2=15694&diff_format=u

Index: ChangeLog
===================================================================
--- ChangeLog (revision 15694)
+++ ChangeLog (revision 15695)
@@ -1,3 +1,13 @@
+Wed Mar 05 22:49:20 2008 NARUSE, Yui <naruse@r...>
+
+        * string.c (is_utf8_lead_byte, count_utf8_lead_bytes_with_ulong):
+          defined for UTF-8 optimization.
+
+        * string.c (str_strlen): use is_utf8_lead_byte and
+          count_utf8_lead_bytes_with_ulong.
+
+        * string.c (str_utf8_nth) ditto.
+
 Wed Mar 5 17:53:01 2008 Nobuyoshi Nakada <nobu@r...>
 
         * file.c (rb_file_flock): returns false on EAGAIN if non-blocking.
Index: string.c
===================================================================
--- string.c (revision 15694)
+++ string.c (revision 15695)
@@ -755,6 +755,24 @@
     return c;
 }
 
+#ifdef NONASCII_MASK
+#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
+static inline const long
+count_utf8_lead_bytes_with_ulong(const unsigned long *s)
+{
+    unsigned long d = *s;
+    d |= ~(d>>1);
+    d >>= 6;
+    d &= NONASCII_MASK >> 3;
+    d += (d>>8);
+    d += (d>>16);
+#if NONASCII_MASK == 0x8080808080808080UL
+    d += (d>>32);
+#endif
+    return (long)(d&0xF);
+}
+#endif
+
 static long
 str_strlen(VALUE str, rb_encoding *enc)
 {
@@ -774,26 +792,19 @@
         const VALUE lowbits = sizeof(unsigned long) - 1;
         s = (const unsigned long*)(~lowbits & ((VALUE)p + lowbits));
         t = (const unsigned long*)(~lowbits & (VALUE)e);
-        for (len=0; p<(const char *)s; p++) {
-            if (((*p)&0xC0) != 0x80) len++;
+        while (p < (const char *)s) {
+            if (is_utf8_lead_byte(*p)) len++;
+            p++;
         }
         while (s < t) {
-            unsigned long d = *s;
-            d = ~d | (d<<1);
-            d &= NONASCII_MASK;
-            d >>= 7;
-            d += (d>>8);
-            d += (d>>16);
-#if NONASCII_MASK == 0x8080808080808080UL
-            d = d + (d>>32);
-#endif
-            len += (long)(d&0xF);
+            len += count_utf8_lead_bytes_with_ulong(s);
             s++;
         }
-        p = (const char *)t;
+        p = (const char *)s;
     }
-    for (; p<e; p++) {
-        if (((*p)&0xC0) != 0x80) len++;
+    while (p < e) {
+        if (is_utf8_lead_byte(*p)) len++;
+        p++;
     }
     return len;
 }
@@ -1162,33 +1173,22 @@
         const VALUE lowbits = sizeof(unsigned long) - 1;
         s = (const unsigned long*)(~lowbits & ((VALUE)p + lowbits));
         t = (const unsigned long*)(~lowbits & (VALUE)e);
-        for (; p<(const char *)s && 0<nth; p++) {
-            if (((*p)&0xC0) != 0x80) nth--;
+        while (p < (const char *)s) {
+            if (is_utf8_lead_byte(*p)) nth--;
+            p++;
         }
         while (s < t) {
-            unsigned long d = *s++;
-            d = ~d | (d<<1);
-            d &= NONASCII_MASK;
-            d >>= 7;
-            d += (d>>8);
-            d += (d>>16);
-#if NONASCII_MASK == 0x8080808080808080UL
-            d += (d>>32);
-#endif
-            nth -= (long)(d&0xF);
-            if (nth < 8) {
-                t = s;
-                break;
-            }
+            nth -= count_utf8_lead_bytes_with_ulong(s);
+            if (nth < sizeof(long)) break;
+            s++;
         }
-        p = (char *)t;
+        p = (char *)s;
     }
     if (0 < nth) {
         while (p < e) {
-            if (((*p)&0xC0) != 0x80) {
+            if (is_utf8_lead_byte(*p)) {
                 nth--;
-                if (nth < 0)
-                    break;
+                if (nth < 0) break;
             }
             p++;
         }
--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/