| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227 |
- """
- Tests that comments are properly handled during parsing
- for all of the parsers defined in parsers.py
- """
- from io import StringIO
- import numpy as np
- import pytest
- from pandas import DataFrame
- import pandas._testing as tm
- @pytest.mark.parametrize("na_values", [None, ["NaN"]])
- def test_comment(all_parsers, na_values):
- parser = all_parsers
- data = """A,B,C
- 1,2.,4.#hello world
- 5.,NaN,10.0
- """
- expected = DataFrame(
- [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
- )
- if parser.engine == "pyarrow":
- msg = "The 'comment' option is not supported with the 'pyarrow' engine"
- with pytest.raises(ValueError, match=msg):
- parser.read_csv(StringIO(data), comment="#", na_values=na_values)
- return
- result = parser.read_csv(StringIO(data), comment="#", na_values=na_values)
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "read_kwargs", [{}, {"lineterminator": "*"}, {"delim_whitespace": True}]
- )
- def test_line_comment(all_parsers, read_kwargs, request):
- parser = all_parsers
- data = """# empty
- A,B,C
- 1,2.,4.#hello world
- #ignore this line
- 5.,NaN,10.0
- """
- warn = None
- depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
- if read_kwargs.get("delim_whitespace"):
- data = data.replace(",", " ")
- warn = FutureWarning
- elif read_kwargs.get("lineterminator"):
- data = data.replace("\n", read_kwargs.get("lineterminator"))
- read_kwargs["comment"] = "#"
- if parser.engine == "pyarrow":
- if "lineterminator" in read_kwargs:
- msg = (
- "The 'lineterminator' option is not supported with the 'pyarrow' engine"
- )
- else:
- msg = "The 'comment' option is not supported with the 'pyarrow' engine"
- with pytest.raises(ValueError, match=msg):
- with tm.assert_produces_warning(
- warn, match=depr_msg, check_stacklevel=False
- ):
- parser.read_csv(StringIO(data), **read_kwargs)
- return
- elif parser.engine == "python" and read_kwargs.get("lineterminator"):
- msg = r"Custom line terminators not supported in python parser \(yet\)"
- with pytest.raises(ValueError, match=msg):
- with tm.assert_produces_warning(
- warn, match=depr_msg, check_stacklevel=False
- ):
- parser.read_csv(StringIO(data), **read_kwargs)
- return
- with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False):
- result = parser.read_csv(StringIO(data), **read_kwargs)
- expected = DataFrame(
- [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
- )
- tm.assert_frame_equal(result, expected)
- def test_comment_skiprows(all_parsers):
- parser = all_parsers
- data = """# empty
- random line
- # second empty line
- 1,2,3
- A,B,C
- 1,2.,4.
- 5.,NaN,10.0
- """
- # This should ignore the first four lines (including comments).
- expected = DataFrame(
- [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
- )
- if parser.engine == "pyarrow":
- msg = "The 'comment' option is not supported with the 'pyarrow' engine"
- with pytest.raises(ValueError, match=msg):
- parser.read_csv(StringIO(data), comment="#", skiprows=4)
- return
- result = parser.read_csv(StringIO(data), comment="#", skiprows=4)
- tm.assert_frame_equal(result, expected)
- def test_comment_header(all_parsers):
- parser = all_parsers
- data = """# empty
- # second empty line
- 1,2,3
- A,B,C
- 1,2.,4.
- 5.,NaN,10.0
- """
- # Header should begin at the second non-comment line.
- expected = DataFrame(
- [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
- )
- if parser.engine == "pyarrow":
- msg = "The 'comment' option is not supported with the 'pyarrow' engine"
- with pytest.raises(ValueError, match=msg):
- parser.read_csv(StringIO(data), comment="#", header=1)
- return
- result = parser.read_csv(StringIO(data), comment="#", header=1)
- tm.assert_frame_equal(result, expected)
- def test_comment_skiprows_header(all_parsers):
- parser = all_parsers
- data = """# empty
- # second empty line
- # third empty line
- X,Y,Z
- 1,2,3
- A,B,C
- 1,2.,4.
- 5.,NaN,10.0
- """
- # Skiprows should skip the first 4 lines (including comments),
- # while header should start from the second non-commented line,
- # starting with line 5.
- expected = DataFrame(
- [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
- )
- if parser.engine == "pyarrow":
- msg = "The 'comment' option is not supported with the 'pyarrow' engine"
- with pytest.raises(ValueError, match=msg):
- parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1)
- return
- result = parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1)
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("comment_char", ["#", "~", "&", "^", "*", "@"])
- def test_custom_comment_char(all_parsers, comment_char):
- parser = all_parsers
- data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"
- if parser.engine == "pyarrow":
- msg = "The 'comment' option is not supported with the 'pyarrow' engine"
- with pytest.raises(ValueError, match=msg):
- parser.read_csv(
- StringIO(data.replace("#", comment_char)), comment=comment_char
- )
- return
- result = parser.read_csv(
- StringIO(data.replace("#", comment_char)), comment=comment_char
- )
- expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("header", ["infer", None])
- def test_comment_first_line(all_parsers, header):
- # see gh-4623
- parser = all_parsers
- data = "# notes\na,b,c\n# more notes\n1,2,3"
- if header is None:
- expected = DataFrame({0: ["a", "1"], 1: ["b", "2"], 2: ["c", "3"]})
- else:
- expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
- if parser.engine == "pyarrow":
- msg = "The 'comment' option is not supported with the 'pyarrow' engine"
- with pytest.raises(ValueError, match=msg):
- parser.read_csv(StringIO(data), comment="#", header=header)
- return
- result = parser.read_csv(StringIO(data), comment="#", header=header)
- tm.assert_frame_equal(result, expected)
- def test_comment_char_in_default_value(all_parsers, request):
- # GH#34002
- if all_parsers.engine == "c":
- reason = "see gh-34002: works on the python engine but not the c engine"
- # NA value containing comment char is interpreted as comment
- request.applymarker(pytest.mark.xfail(reason=reason, raises=AssertionError))
- parser = all_parsers
- data = (
- "# this is a comment\n"
- "col1,col2,col3,col4\n"
- "1,2,3,4#inline comment\n"
- "4,5#,6,10\n"
- "7,8,#N/A,11\n"
- )
- if parser.engine == "pyarrow":
- msg = "The 'comment' option is not supported with the 'pyarrow' engine"
- with pytest.raises(ValueError, match=msg):
- parser.read_csv(StringIO(data), comment="#", na_values="#N/A")
- return
- result = parser.read_csv(StringIO(data), comment="#", na_values="#N/A")
- expected = DataFrame(
- {
- "col1": [1, 4, 7],
- "col2": [2, 5, 8],
- "col3": [3.0, np.nan, np.nan],
- "col4": [4.0, np.nan, 11.0],
- }
- )
- tm.assert_frame_equal(result, expected)
|