ruby-changes:30887

glass	2013-09-18 23:34:04 +0900 (Wed, 18 Sep 2013)

  New Revision: 42966

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=rev&revision=42966

  Log:
    * string.c (rb_str_enumerate_lines): make String#each_line and
      #lines not raise an invalid byte sequence error when they are
      called with an argument. The patch also improves performance.
      [ruby-dev:47549] [Bug #8698]
    
    * test/ruby/test_m17n_comb.rb (test_str_each_line): remove
      assertions which check that String#each_line and #lines will
      raise an error if the receiver includes an invalid byte sequence.

  Modified files:
    trunk/ChangeLog
    trunk/string.c
    trunk/test/ruby/test_m17n_comb.rb
Index: ChangeLog
===================================================================
--- ChangeLog	(revision 42965)
+++ ChangeLog	(revision 42966)
@@ -1,3 +1,14 @@ https://github.com/ruby/ruby/blob/trunk/ChangeLog#L1
+Wed Sep 18 23:14:58 2013  Masaki Matsushita  <glass.saga@g...>
+
+	* string.c (rb_str_enumerate_lines): make String#each_line and
+	  #lines not raise invalid byte sequence error when it is called
+	  with an argument. The patch also causes performance improvement.
+	  [ruby-dev:47549] [Bug #8698]
+
+	* test/ruby/test_m17n_comb.rb (test_str_each_line): remove
+	  assertions which check that String#each_line and #lines will
+	  raise an error if the receiver includes invalid byte sequence.
+
 Wed Sep 18 16:32:15 2013  Nobuyoshi Nakada  <nobu@r...>
 
 	* proc.c (mnew_from_me): allocate structs after allocated wrapper
Index: string.c
===================================================================
--- string.c	(revision 42965)
+++ string.c	(revision 42966)
@@ -6352,21 +6352,17 @@ static VALUE https://github.com/ruby/ruby/blob/trunk/string.c#L6352
 rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, int wantarray)
 {
     rb_encoding *enc;
-    VALUE rs;
-    unsigned int newline;
-    const char *p, *pend, *s, *ptr;
-    long len, rslen;
-    VALUE line;
-    int n;
-    VALUE orig = str;
+    VALUE line, rs, orig = str;
+    const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
+    long pos, len, rslen;
+    int paragraph_mode = 0;
+
     VALUE UNINITIALIZED_VAR(ary);
 
-    if (argc == 0) {
+    if (argc == 0)
 	rs = rb_rs;
-    }
-    else {
+    else
 	rb_scan_args(argc, argv, "01", &rs);
-    }
 
     if (rb_block_given_p()) {
 	if (wantarray) {
@@ -6396,76 +6392,63 @@ rb_str_enumerate_lines(int argc, VALUE * https://github.com/ruby/ruby/blob/trunk/string.c#L6392
 	    return orig;
 	}
     }
+
     str = rb_str_new4(str);
-    ptr = p = s = RSTRING_PTR(str);
-    pend = p + RSTRING_LEN(str);
+    ptr = subptr = RSTRING_PTR(str);
+    pend = RSTRING_END(str);
     len = RSTRING_LEN(str);
     StringValue(rs);
-    if (rs == rb_default_rs) {
-	enc = rb_enc_get(str);
-	while (p < pend) {
-	    char *p0;
+    rslen = RSTRING_LEN(rs);
 
-	    p = memchr(p, '\n', pend - p);
-	    if (!p) break;
-	    p0 = rb_enc_left_char_head(s, p, pend, enc);
-	    if (!rb_enc_is_newline(p0, pend, enc)) {
-		p++;
-		continue;
-	    }
-	    p = p0 + rb_enc_mbclen(p0, pend, enc);
-	    line = rb_str_subseq(str, s - ptr, p - s);
-	    if (wantarray)
-		rb_ary_push(ary, line);
-	    else
-		rb_yield(line);
-	    str_mod_check(str, ptr, len);
-	    s = p;
-	}
-	goto finish;
-    }
+    if (rs == rb_default_rs)
+	enc = rb_enc_get(str);
+    else
+	enc = rb_enc_check(str, rs);
 
-    enc = rb_enc_check(str, rs);
-    rslen = RSTRING_LEN(rs);
     if (rslen == 0) {
-	newline = '\n';
+	rsptr = "\n\n";
+	rslen = 2;
+	paragraph_mode = 1;
     }
     else {
-	newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
+	rsptr = RSTRING_PTR(rs);
     }
 
-    while (p < pend) {
-	unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
-
-      again:
-	if (rslen == 0 && c == newline) {
-	    p += n;
-	    if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
-		goto again;
-	    }
-	    while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
-		p += n;
+    if ((rs == rb_default_rs || paragraph_mode) && !rb_enc_asciicompat(enc)) {
+	rs = rb_str_new(rsptr, rslen);
+	rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
+	rsptr = RSTRING_PTR(rs);
+	rslen = RSTRING_LEN(rs);
+    }
+
+    while (subptr < pend) {
+	pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
+	if (pos < 0) break;
+	hit = subptr + pos;
+	adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
+	if (hit != adjusted) {
+	    subptr = adjusted;
+	    continue;
+	}
+	subend = hit + rslen;
+	if (paragraph_mode) {
+	    while (subend < pend && rb_enc_is_newline(subend, pend, enc)) {
+		subend += rb_enc_mbclen(subend, pend, enc);
 	    }
-	    p -= n;
 	}
-	if (c == newline &&
-	    (rslen <= 1 ||
-	     (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
-	    const char *pp = p + (rslen ? rslen : n);
-	    line = rb_str_subseq(str, s - ptr, pp - s);
-	    if (wantarray)
-		rb_ary_push(ary, line);
-	    else
-		rb_yield(line);
+	line = rb_str_subseq(str, subptr - ptr, subend - subptr);
+	if (wantarray) {
+	    rb_ary_push(ary, line);
+	}
+	else {
+	    rb_yield(line);
 	    str_mod_check(str, ptr, len);
-	    s = pp;
 	}
-	p += n;
+	subptr = subend;
     }
 
-  finish:
-    if (s != pend) {
-	line = rb_str_subseq(str, s - ptr, pend - s);
+    if (subptr != pend) {
+	line = rb_str_subseq(str, subptr - ptr, pend - subptr);
 	if (wantarray)
 	    rb_ary_push(ary, line);
 	else
Index: test/ruby/test_m17n_comb.rb
===================================================================
--- test/ruby/test_m17n_comb.rb	(revision 42965)
+++ test/ruby/test_m17n_comb.rb	(revision 42966)
@@ -798,17 +798,12 @@ class TestM17NComb < Test::Unit::TestCas https://github.com/ruby/ruby/blob/trunk/test/ruby/test_m17n_comb.rb#L798
 
   def test_str_each_line
     combination(STRINGS, STRINGS) {|s1, s2|
-      if !s1.valid_encoding? || !s2.valid_encoding?
-        assert_raise(ArgumentError, Encoding::CompatibilityError) { s1.each_line(s2) {} }
-        next
-      end
       if !s1.ascii_only? && !s2.ascii_only? && s1.encoding != s2.encoding
         assert_raise(Encoding::CompatibilityError) { s1.each_line(s2) {} }
         next
       end
       lines = []
       enccall(s1, :each_line, s2) {|line|
-        assert(line.valid_encoding?)
         assert_equal(s1.encoding, line.encoding)
         lines << line
       }

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/