[前][次][番号順一覧][スレッド一覧]

ruby-changes:70510

From: adamroyjones <ko1@a...>
Date: Fri, 24 Dec 2021 14:41:17 +0900 (JST)
Subject: [ruby-changes:70510] c70dc3cafb (master): [ruby/csv] Add handling for ambiguous parsing options (https://github.com/ruby/csv/pull/226)

https://git.ruby-lang.org/ruby.git/commit/?id=c70dc3cafb

From c70dc3cafb29d89d0377677ead346495183db47e Mon Sep 17 00:00:00 2001
From: adamroyjones <10088591+adamroyjones@u...>
Date: Thu, 18 Nov 2021 21:20:09 +0000
Subject: [ruby/csv] Add handling for ambiguous parsing options
 (https://github.com/ruby/csv/pull/226)

GitHub: fix GH-225

With Ruby 3.0.2 and csv 3.2.1, the file

```ruby
require "csv"
File.open("example.tsv", "w") { |f| f.puts("foo\t\tbar") }
CSV.read("example.tsv", col_sep: "\t", strip: true)
```

produces the error

```
lib/csv/parser.rb:935:in `parse_quotable_robust': TODO: Meaningful
message in line 1. (CSV::MalformedCSVError)
```

However, the CSV in this example is not malformed; instead, ambiguous
options were provided to the parser. It is not obvious (to me) whether
the string should be parsed as

- `["foo\t\tbar"]`,
- `["foo", "bar"]`,
- `["foo", "", "bar"]`, or
- `["foo", nil, "bar"]`.

This commit adds code that raises an exception when this situation is
encountered. Specifically, it checks if the column separator either ends
with or starts with the characters that would be stripped away.

This commit also adds unit tests and updates the documentation.

https://github.com/ruby/csv/commit/cc317dd42d
---
 lib/csv.rb                   | 13 +++++++------
 lib/csv/parser.rb            | 23 +++++++++++++++++++++++
 test/csv/parse/test_strip.rb | 29 +++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 6 deletions(-)

diff --git a/lib/csv.rb b/lib/csv.rb
index 42e99435cbf..06a490f34c6 100644
--- a/lib/csv.rb
+++ b/lib/csv.rb
@@ -330,6 +330,7 @@ using CSV::MatchP if CSV.const_defined?(:MatchP) https://github.com/ruby/ruby/blob/trunk/lib/csv.rb#L330
 #     liberal_parsing:    false,
 #     nil_value:          nil,
 #     empty_value:        "",
+#     strip:              false,
 #     # For generating.
 #     write_headers:      nil,
 #     quote_empty:        true,
@@ -337,7 +338,6 @@ using CSV::MatchP if CSV.const_defined?(:MatchP) https://github.com/ruby/ruby/blob/trunk/lib/csv.rb#L338
 #     write_converters:   nil,
 #     write_nil_value:    nil,
 #     write_empty_value:  "",
-#     strip:              false,
 #   }
 #
 # ==== Options for Parsing
@@ -355,8 +355,9 @@ using CSV::MatchP if CSV.const_defined?(:MatchP) https://github.com/ruby/ruby/blob/trunk/lib/csv.rb#L355
 # - +header_converters+: Specifies the header converters to be used.
 # - +skip_blanks+: Specifies whether blanks lines are to be ignored.
 # - +skip_lines+: Specifies how comments lines are to be recognized.
-# - +strip+: Specifies whether leading and trailing whitespace are
-#   to be stripped from fields..
+# - +strip+: Specifies whether leading and trailing whitespace are to be
+#   stripped from fields. This must be compatible with +col_sep+; if it is not,
+#   then an +ArgumentError+ exception will be raised.
 # - +liberal_parsing+: Specifies whether \CSV should attempt to parse
 #   non-compliant data.
 # - +nil_value+: Specifies the object that is to be substituted for each null (no-text) field.
@@ -935,6 +936,7 @@ class CSV https://github.com/ruby/ruby/blob/trunk/lib/csv.rb#L936
     liberal_parsing:    false,
     nil_value:          nil,
     empty_value:        "",
+    strip:              false,
     # For generating.
     write_headers:      nil,
     quote_empty:        true,
@@ -942,7 +944,6 @@ class CSV https://github.com/ruby/ruby/blob/trunk/lib/csv.rb#L944
     write_converters:   nil,
     write_nil_value:    nil,
     write_empty_value:  "",
-    strip:              false,
   }.freeze
 
   class << self
@@ -1760,11 +1761,11 @@ class CSV https://github.com/ruby/ruby/blob/trunk/lib/csv.rb#L1761
                  encoding: nil,
                  nil_value: nil,
                  empty_value: "",
+                 strip: false,
                  quote_empty: true,
                  write_converters: nil,
                  write_nil_value: nil,
-                 write_empty_value: "",
-                 strip: false)
+                 write_empty_value: "")
     raise ArgumentError.new("Cannot parse nil as CSV") if data.nil?
 
     if data.is_a?(String)
diff --git a/lib/csv/parser.rb b/lib/csv/parser.rb
index 3334acfbdd7..f87db3bb126 100644
--- a/lib/csv/parser.rb
+++ b/lib/csv/parser.rb
@@ -361,6 +361,7 @@ class CSV https://github.com/ruby/ruby/blob/trunk/lib/csv/parser.rb#L361
       prepare_skip_lines
       prepare_strip
       prepare_separators
+      validate_strip_and_col_sep_options
       prepare_quoted
       prepare_unquoted
       prepare_line
@@ -531,6 +532,28 @@ class CSV https://github.com/ruby/ruby/blob/trunk/lib/csv/parser.rb#L532
       @not_line_end = Regexp.new("[^\r\n]+".encode(@encoding))
     end
 
+    # This method verifies that there are no (obvious) ambiguities with the
+    # provided +col_sep+ and +strip+ parsing options. For example, if +col_sep+
+    # and +strip+ were both equal to +\t+, then there would be no clear way to
+    # parse the input.
+    def validate_strip_and_col_sep_options
+      return unless @strip
+
+      if @strip.is_a?(String)
+        if @column_separator.start_with?(@strip) || @column_separator.end_with?(@strip)
+          raise ArgumentError,
+                "The provided strip (#{@escaped_strip}) and " \
+                "col_sep (#{@escaped_column_separator}) options are incompatible."
+        end
+      else
+        if Regexp.new("\\A[#{@escaped_strip}]|[#{@escaped_strip}]\\z").match?(@column_separator)
+          raise ArgumentError,
+                "The provided strip (true) and " \
+                "col_sep (#{@escaped_column_separator}) options are incompatible."
+        end
+      end
+    end
+
     def prepare_quoted
       if @quote_character
         @quotes = Regexp.new(@escaped_quote_character +
diff --git a/test/csv/parse/test_strip.rb b/test/csv/parse/test_strip.rb
index 3564fcb3ba5..c5e35209cc1 100644
--- a/test/csv/parse/test_strip.rb
+++ b/test/csv/parse/test_strip.rb
@@ -80,4 +80,33 @@ class TestCSVParseStrip < Test::Unit::TestCase https://github.com/ruby/ruby/blob/trunk/test/csv/parse/test_strip.rb#L80
                            %Q{"a" ,"b " \r\n},
                            strip: true))
   end
+
+  def test_col_sep_incompatible_true
+    message = "The provided strip (true) and " \
+              "col_sep (\\t) options are incompatible."
+    assert_raise_with_message(ArgumentError, message) do
+      CSV.parse_line(%Q{"a"\t"b"\n},
+                     col_sep: "\t",
+                     strip: true)
+    end
+  end
+
+  def test_col_sep_incompatible_string
+    message = "The provided strip (\\t) and " \
+              "col_sep (\\t) options are incompatible."
+    assert_raise_with_message(ArgumentError, message) do
+      CSV.parse_line(%Q{"a"\t"b"\n},
+                     col_sep: "\t",
+                     strip: "\t")
+    end
+  end
+
+  def test_col_sep_compatible_string
+    assert_equal(
+      ["a", "b"],
+      CSV.parse_line(%Q{\va\tb\v\n},
+                     col_sep: "\t",
+                     strip: "\v")
+    )
+  end
 end
-- 
cgit v1.2.1


--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/

[前][次][番号順一覧][スレッド一覧]