ruby-changes:15147

jeg2	2010-03-23 23:59:25 +0900 (Tue, 23 Mar 2010)

  New Revision: 27025

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi?view=rev&revision=27025

  Log:
    * lib/csv.rb: Incorporating the fixes from the recent
      FasterCSV releases:  1.5.2 and 1.5.3.  [ruby-core:25038]

  Modified files:
    trunk/ChangeLog
    trunk/lib/csv.rb
    trunk/test/csv/test_csv_parsing.rb
    trunk/test/csv/test_interface.rb
    trunk/test/csv/test_table.rb

Index: ChangeLog
===================================================================
--- ChangeLog	(revision 27024)
+++ ChangeLog	(revision 27025)
@@ -1,3 +1,8 @@
+Tue Mar 23 23:58:51 2010  James Edward Gray II  <jeg2@r...>
+
+	* lib/csv.rb: Incorporating the fixes from the recent
+	  FasterCSV releases:  1.5.2 and 1.5.3.  [ruby-core:25038]
+
 Tue Mar 23 18:35:46 2010  Shugo Maeda  <shugo@r...>
 
 	* object.c (rb_obj_singleton_class): new method
Index: lib/csv.rb
===================================================================
--- lib/csv.rb	(revision 27024)
+++ lib/csv.rb	(revision 27025)
@@ -826,8 +826,12 @@
     # Returns the table as a complete CSV String.  Headers will be listed first,
     # then all of the field rows.
     #
+    # This method assumes you want the Table.headers(), unless you explicitly
+    # pass <tt>:write_headers => false</tt>.
+    # 
     def to_csv(options = Hash.new)
-      @table.inject([headers.to_csv(options)]) do |rows, row|
+      wh = options.fetch(:write_headers, true)
+      @table.inject(wh ? [headers.to_csv(options)] : [ ]) do |rows, row|
         if row.header_row?
           rows
         else
@@ -1808,66 +1812,95 @@
     # it can take multiple calls to <tt>@io.gets()</tt> to get a full line,
     # because of \r and/or \n characters embedded in quoted fields
     #
+    in_extended_col = false
+    csv             = Array.new
+
     loop do
       # add another read to the line
-      (line += @io.gets(@row_sep)) rescue return nil
-      # copy the line so we can chop it up in parsing
-      parse =  line.dup
+      unless parse = @io.gets(@row_sep)
+        return nil
+      end
+
       parse.sub!(@parsers[:line_end], "")
 
-      #
-      # I believe a blank line should be an <tt>Array.new</tt>, not Ruby 1.8
-      # CSV's <tt>[nil]</tt>
-      #
-      if parse.empty?
-        @lineno += 1
-        if @skip_blanks
-          line = ""
-          next
-        elsif @unconverted_fields
-          return add_unconverted_fields(Array.new, Array.new)
-        elsif @use_headers
-          return self.class::Row.new(Array.new, Array.new)
-        else
-          return Array.new
+      if csv.empty?
+        #
+        # I believe a blank line should be an <tt>Array.new</tt>, not Ruby 1.8
+        # CSV's <tt>[nil]</tt>
+        #
+        if parse.empty?
+          @lineno += 1
+          if @skip_blanks
+            next
+          elsif @unconverted_fields
+            return add_unconverted_fields(Array.new, Array.new)
+          elsif @use_headers
+            return self.class::Row.new(Array.new, Array.new)
+          else
+            return Array.new
+          end
         end
       end
 
-      #
-      # shave leading empty fields if needed, because the main parser chokes
-      # on these
-      #
-      csv = if parse.sub!(@parsers[:leading_fields], "")
-        [nil] * ($&.length / @col_sep.length)
-      else
-        Array.new
-      end
-      #
-      # then parse the main fields with a hyper-tuned Regexp from
-      # Mastering Regular Expressions, Second Edition
-      #
-      parse.gsub!(@parsers[:csv_row]) do
-        csv << if $1.nil?     # we found an unquoted field
-          if $2.empty?        # switch empty unquoted fields to +nil+...
-            nil               # for Ruby 1.8 CSV compatibility
+      parts =  parse.split(@col_sep, -1)
+      csv   << nil if parts.empty?
+
+      # This loop is the hot path of csv parsing. Some things may be non-dry
+      # for a reason. Make sure to benchmark when refactoring.
+      parts.each do |part|
+        if in_extended_col
+          # If we are continuing a previous column
+          if part[-1] == @quote_char && part.count(@quote_char) % 2 != 0
+            # extended column ends
+            csv.last << part[0..-2]
+            raise MalformedCSVError if csv.last =~ @parsers[:stray_quote]
+            csv.last.gsub!(@quote_char * 2, @quote_char)
+            in_extended_col = false
           else
-            # I decided to take a strict approach to CSV parsing...
-            if $2.count(@parsers[:return_newline]).zero?  # verify correctness
-              $2
-            else
-              # or throw an Exception
-              raise MalformedCSVError, "Unquoted fields do not allow " +
-                                       "\\r or \\n (line #{lineno + 1})."
-            end
+            csv.last << part
+            csv.last << @col_sep
           end
-        else                  # we found a quoted field...
-          $1.gsub(@quote_char * 2, @quote_char)  # unescape contents
+        elsif part[0] == @quote_char
+          # If we are staring a new quoted column
+          if part[-1] != @quote_char || part.count(@quote_char) % 2 != 0
+            # start an extended column
+            csv             << part[1..-1]
+            csv.last        << @col_sep
+            in_extended_col =  true
+          else
+            # regular quoted column
+            csv << part[1..-2]
+            raise MalformedCSVError if csv.last =~ @parsers[:stray_quote]
+            csv.last.gsub!(@quote_char * 2, @quote_char)
+          end
+        elsif part =~ @parsers[:quote_or_nl]
+          # Unquoted field with bad characters.
+          if part =~ @parsers[:nl_or_lf]
+            raise MalformedCSVError, "Unquoted fields do not allow " +
+                                     "\\r or \\n (line #{lineno + 1})."
+          else
+            raise MalformedCSVError, "Illegal quoting on line #{lineno + 1}."
+          end
+        else
+          # Regular ole unquoted field.
+          csv << (part.empty? ? nil : part)
         end
-        ""  # gsub!'s replacement, clear the field
       end
 
-      # if parse is empty?(), we found all the fields on the line...
-      if parse.empty?
+      # Replace tacked on @col_sep with @row_sep if we are still in an extended
+      # column.
+      csv[-1][-1] = @row_sep if in_extended_col
+
+      if in_extended_col
+        # if we're at eof?(), a quoted field wasn't closed...
+        if @io.eof?
+          raise MalformedCSVError,
+                "Unclosed quoted field on line #{lineno + 1}."
+        elsif @field_size_limit and csv.last.size >= @field_size_limit
+          raise MalformedCSVError, "Field size exceeded on line #{lineno + 1}."
+        end
+        # otherwise, we need to loop and pull some more data to complete the row
+      else
         @lineno += 1
 
         # save fields unconverted fields, if needed...
@@ -1886,15 +1919,6 @@
         # return the results
         break csv
       end
-      # if we're not empty?() but at eof?(), a quoted field wasn't closed...
-      if @io.eof?
-        raise MalformedCSVError, "Unclosed quoted field on line #{lineno + 1}."
-      elsif parse =~ @parsers[:bad_field]
-        raise MalformedCSVError, "Illegal quoting on line #{lineno + 1}."
-      elsif @field_size_limit and parse.length >= @field_size_limit
-        raise MalformedCSVError, "Field size exceeded on line #{lineno + 1}."
-      end
-      # otherwise, we need to loop and pull some more data to complete the row
     end
   end
   alias_method :gets,     :shift
@@ -2046,33 +2070,11 @@
     esc_row_sep = escape_re(@row_sep)
     esc_quote   = escape_re(@quote_char)
     @parsers = {
-      # for empty leading fields
-      leading_fields: encode_re("\\A(?:", esc_col_sep, ")+"),
-      # The Primary Parser
-      csv_row:        encode_re(
-        "\\G(?:\\A|", esc_col_sep, ")",                # anchor the match
-        "(?:", esc_quote,                              # find quoted fields
-               "((?>[^", esc_quote, "]*)",             # "unrolling the loop"
-               "(?>", esc_quote * 2,                   # double for escaping
-               "[^", esc_quote, "]*)*)",
-               esc_quote,
-               "|",                                    # ... or ...
-               "([^", esc_quote, esc_col_sep, "]*))",  # unquoted fields
-        "(?=", esc_col_sep, "|\\z)"                    # ensure field is ended
-      ),
-      # a test for unescaped quotes
-      bad_field:      encode_re(
-        "\\A", esc_col_sep, "?",                   # an optional comma
-        "(?:", esc_quote,                          # a quoted field
-               "(?>[^", esc_quote, "]*)",          # "unrolling the loop"
-               "(?>", esc_quote * 2,               # double for escaping
-               "[^", esc_quote, "]*)*",
-               esc_quote,                          # the closing quote
-               "[^", esc_quote, "]",               # an extra character
-               "|",                                # ... or ...
-               "[^", esc_quote, esc_col_sep, "]+", # an unquoted field
-               esc_quote, ")"                      # an extra quote
-      ),
+      # for detecting parse errors
+      quote_or_nl:    encode_re("[", esc_quote, "\r\n]"),
+      nl_or_lf:       encode_re("[\r\n]"),
+      stray_quote:    encode_re( "[^", esc_quote, "]", esc_quote,
+                                 "[^", esc_quote, "]" ),
       # safer than chomp!()
       line_end:       encode_re(esc_row_sep, "\\z"),
       # illegal unquoted characters
Index: test/csv/test_csv_parsing.rb
===================================================================
--- test/csv/test_csv_parsing.rb	(revision 27024)
+++ test/csv/test_csv_parsing.rb	(revision 27025)
@@ -115,6 +115,18 @@
     assert_equal(Array.new, CSV.parse_line("\n1,2,3\n"))
   end
 
+  def test_non_regex_edge_cases
+    # An early version of the non-regex parser fails this test
+    [ [ "foo,\"foo,bar,baz,foo\",\"foo\"",
+        ["foo", "foo,bar,baz,foo", "foo"] ] ].each do |edge_case|
+      assert_equal(edge_case.last, CSV.parse_line(edge_case.first))
+    end
+
+    assert_raise(CSV::MalformedCSVError) do
+      CSV.parse_line("1,\"23\"4\"5\", 6")
+    end
+  end
+
   def test_malformed_csv
     assert_raise(CSV::MalformedCSVError) do
       CSV.parse_line("1,2\r,3", row_sep: "\n")
Index: test/csv/test_interface.rb
===================================================================
--- test/csv/test_interface.rb	(revision 27024)
+++ test/csv/test_interface.rb	(revision 27025)
@@ -75,6 +75,11 @@
     assert_equal(%w{1 2 3}, row)
   end
 
+  def test_parse_line_with_empty_lines
+    assert_equal(nil,       CSV.parse_line(""))  # to signal eof
+    assert_equal(Array.new, CSV.parse_line("\n1,2,3"))
+  end
+  
   def test_read_and_readlines
     assert_equal( @expected,
                   CSV.read(@path, col_sep: "\t", row_sep: "\r\n") )
@@ -167,7 +172,7 @@
       csv << lines.first.keys
       lines.each { |line| csv << line }
     end
-    CSV.open( @path, "w", headers:           true,
+    CSV.open( @path, "r", headers:           true,
                           converters:        :all,
                           header_converters: :symbol ) do |csv|
       csv.each { |line| assert_equal(lines.shift, line.to_hash) }
Index: test/csv/test_table.rb
===================================================================
--- test/csv/test_table.rb	(revision 27024)
+++ test/csv/test_table.rb	(revision 27025)
@@ -253,6 +253,8 @@
     # with options
     assert_equal( csv.gsub(",", "|").gsub("\n", "\r\n"),
                   @table.to_csv(col_sep: "|", row_sep: "\r\n") )
+    assert_equal( csv.lines.to_a[1..-1].join,
+                  @table.to_csv(:write_headers => false) )
 
     # with headers
     assert_equal(csv, @header_table.to_csv)

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/