test_comment.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227
  1. """
  2. Tests that comments are properly handled during parsing
  3. for all of the parsers defined in parsers.py
  4. """
  5. from io import StringIO
  6. import numpy as np
  7. import pytest
  8. from pandas import DataFrame
  9. import pandas._testing as tm
  @pytest.mark.parametrize("na_values", [None, ["NaN"]])
  def test_comment(all_parsers, na_values):
      """Check that a trailing ``#`` comment on a data row is dropped.

      Runs once with the default NA handling and once with an explicit
      ``na_values`` list. For the pyarrow engine the test instead asserts
      that passing ``comment`` raises ``ValueError``.
      """
      parser = all_parsers
      csv_text = "A,B,C\n1,2.,4.#hello world\n5.,NaN,10.0\n"
      # Same keyword arguments are used for both the failing and passing path.
      read_opts = {"comment": "#", "na_values": na_values}

      if parser.engine == "pyarrow":
          unsupported = (
              "The 'comment' option is not supported with the 'pyarrow' engine"
          )
          with pytest.raises(ValueError, match=unsupported):
              parser.read_csv(StringIO(csv_text), **read_opts)
          return

      expected = DataFrame(
          [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
      )
      result = parser.read_csv(StringIO(csv_text), **read_opts)
      tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize(
      "read_kwargs", [{}, {"lineterminator": "*"}, {"delim_whitespace": True}]
  )
  def test_line_comment(all_parsers, read_kwargs, request):
      """Check that whole-line and trailing ``#`` comments are ignored.

      Parameters
      ----------
      all_parsers : fixture
          Parser wrapper exposing ``engine`` and ``read_csv``.
      read_kwargs : dict
          Extra ``read_csv`` keywords; selects the plain, custom line
          terminator, or whitespace-delimited variant of the input.
      request : fixture
          Not used in this test body; kept so the signature is unchanged.

      The pyarrow engine, and the python engine combined with a custom
      ``lineterminator``, are expected to raise ``ValueError``.
      """
      parser = all_parsers
      data = (
          "# empty\n"
          "A,B,C\n"
          "1,2.,4.#hello world\n"
          "#ignore this line\n"
          "5.,NaN,10.0\n"
      )
      warn = None
      depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

      lineterminator = read_kwargs.get("lineterminator")
      if read_kwargs.get("delim_whitespace"):
          data = data.replace(",", " ")
          warn = FutureWarning
      elif lineterminator:
          data = data.replace("\n", lineterminator)

      # Build a new dict rather than assigning into ``read_kwargs``: the
      # parametrized dict object is reused for every parser variant, so an
      # in-place write would leak state from one invocation into the next.
      kwargs = {**read_kwargs, "comment": "#"}

      # Work out which ValueError (if any) this engine/kwargs pair must raise.
      error_msg = None
      if parser.engine == "pyarrow":
          if "lineterminator" in kwargs:
              error_msg = (
                  "The 'lineterminator' option is not supported "
                  "with the 'pyarrow' engine"
              )
          else:
              error_msg = (
                  "The 'comment' option is not supported with the 'pyarrow' engine"
              )
      elif parser.engine == "python" and lineterminator:
          error_msg = r"Custom line terminators not supported in python parser \(yet\)"

      if error_msg is not None:
          with pytest.raises(ValueError, match=error_msg):
              with tm.assert_produces_warning(
                  warn, match=depr_msg, check_stacklevel=False
              ):
                  parser.read_csv(StringIO(data), **kwargs)
          return

      with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False):
          result = parser.read_csv(StringIO(data), **kwargs)

      expected = DataFrame(
          [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
      )
      tm.assert_frame_equal(result, expected)
  def test_comment_skiprows(all_parsers):
      """Check ``skiprows=4`` together with ``comment="#"``.

      The first four lines of the input (comment lines included) should be
      ignored, leaving ``A,B,C`` as the header. For the pyarrow engine the
      test instead asserts that passing ``comment`` raises ``ValueError``.
      """
      parser = all_parsers
      csv_text = (
          "# empty\n"
          "random line\n"
          "# second empty line\n"
          "1,2,3\n"
          "A,B,C\n"
          "1,2.,4.\n"
          "5.,NaN,10.0\n"
      )
      read_opts = {"comment": "#", "skiprows": 4}

      if parser.engine == "pyarrow":
          unsupported = (
              "The 'comment' option is not supported with the 'pyarrow' engine"
          )
          with pytest.raises(ValueError, match=unsupported):
              parser.read_csv(StringIO(csv_text), **read_opts)
          return

      expected = DataFrame(
          [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
      )
      result = parser.read_csv(StringIO(csv_text), **read_opts)
      tm.assert_frame_equal(result, expected)
  def test_comment_header(all_parsers):
      """Check ``header=1`` together with ``comment="#"``.

      The header should be taken from the second non-comment line
      (``A,B,C``). For the pyarrow engine the test instead asserts that
      passing ``comment`` raises ``ValueError``.
      """
      parser = all_parsers
      csv_text = (
          "# empty\n"
          "# second empty line\n"
          "1,2,3\n"
          "A,B,C\n"
          "1,2.,4.\n"
          "5.,NaN,10.0\n"
      )
      read_opts = {"comment": "#", "header": 1}

      if parser.engine == "pyarrow":
          unsupported = (
              "The 'comment' option is not supported with the 'pyarrow' engine"
          )
          with pytest.raises(ValueError, match=unsupported):
              parser.read_csv(StringIO(csv_text), **read_opts)
          return

      expected = DataFrame(
          [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
      )
      result = parser.read_csv(StringIO(csv_text), **read_opts)
      tm.assert_frame_equal(result, expected)
  def test_comment_skiprows_header(all_parsers):
      """Check ``skiprows=4`` and ``header=1`` together with ``comment="#"``.

      ``skiprows`` should drop the first four lines (comment lines
      included); ``header`` should then pick the second non-commented line
      counting from line 5, i.e. ``A,B,C``. For the pyarrow engine the test
      instead asserts that passing ``comment`` raises ``ValueError``.
      """
      parser = all_parsers
      csv_text = (
          "# empty\n"
          "# second empty line\n"
          "# third empty line\n"
          "X,Y,Z\n"
          "1,2,3\n"
          "A,B,C\n"
          "1,2.,4.\n"
          "5.,NaN,10.0\n"
      )
      read_opts = {"comment": "#", "skiprows": 4, "header": 1}

      if parser.engine == "pyarrow":
          unsupported = (
              "The 'comment' option is not supported with the 'pyarrow' engine"
          )
          with pytest.raises(ValueError, match=unsupported):
              parser.read_csv(StringIO(csv_text), **read_opts)
          return

      expected = DataFrame(
          [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
      )
      result = parser.read_csv(StringIO(csv_text), **read_opts)
      tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize("comment_char", ["#", "~", "&", "^", "*", "@"])
  def test_custom_comment_char(all_parsers, comment_char):
      """Check that characters other than ``#`` work as the comment marker.

      The template input uses ``#``; it is rewritten to use ``comment_char``
      before parsing. For the pyarrow engine the test instead asserts that
      passing ``comment`` raises ``ValueError``.
      """
      parser = all_parsers
      template = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"
      # Substitute the parametrized marker once and reuse the text below.
      csv_text = template.replace("#", comment_char)

      if parser.engine == "pyarrow":
          unsupported = (
              "The 'comment' option is not supported with the 'pyarrow' engine"
          )
          with pytest.raises(ValueError, match=unsupported):
              parser.read_csv(StringIO(csv_text), comment=comment_char)
          return

      result = parser.read_csv(StringIO(csv_text), comment=comment_char)
      expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
      tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize("header", ["infer", None])
  def test_comment_first_line(all_parsers, header):
      """Check parsing when the very first line is a comment (see gh-4623).

      With ``header=None`` both non-comment lines are expected as string
      data rows; with ``header="infer"`` the first non-comment line is
      expected to become the column labels. For the pyarrow engine the test
      instead asserts that passing ``comment`` raises ``ValueError``.
      """
      parser = all_parsers
      csv_text = "# notes\na,b,c\n# more notes\n1,2,3"
      read_opts = {"comment": "#", "header": header}

      expected = (
          DataFrame({0: ["a", "1"], 1: ["b", "2"], 2: ["c", "3"]})
          if header is None
          else DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
      )

      if parser.engine == "pyarrow":
          unsupported = (
              "The 'comment' option is not supported with the 'pyarrow' engine"
          )
          with pytest.raises(ValueError, match=unsupported):
              parser.read_csv(StringIO(csv_text), **read_opts)
          return

      result = parser.read_csv(StringIO(csv_text), **read_opts)
      tm.assert_frame_equal(result, expected)
  def test_comment_char_in_default_value(all_parsers, request):
      """Check an NA marker that itself starts with the comment char (GH#34002).

      ``"#N/A"`` is passed as ``na_values`` while ``"#"`` is the comment
      character. The c engine is marked as an expected failure; for the
      pyarrow engine the test instead asserts that passing ``comment``
      raises ``ValueError``.
      """
      if all_parsers.engine == "c":
          # NA value containing comment char is interpreted as comment
          xfail_reason = "see gh-34002: works on the python engine but not the c engine"
          request.applymarker(
              pytest.mark.xfail(reason=xfail_reason, raises=AssertionError)
          )

      parser = all_parsers
      csv_lines = [
          "# this is a comment\n",
          "col1,col2,col3,col4\n",
          "1,2,3,4#inline comment\n",
          "4,5#,6,10\n",
          "7,8,#N/A,11\n",
      ]
      csv_text = "".join(csv_lines)
      read_opts = {"comment": "#", "na_values": "#N/A"}

      if parser.engine == "pyarrow":
          unsupported = (
              "The 'comment' option is not supported with the 'pyarrow' engine"
          )
          with pytest.raises(ValueError, match=unsupported):
              parser.read_csv(StringIO(csv_text), **read_opts)
          return

      result = parser.read_csv(StringIO(csv_text), **read_opts)
      expected = DataFrame(
          {
              "col1": [1, 4, 7],
              "col2": [2, 5, 8],
              "col3": [3.0, np.nan, np.nan],
              "col4": [4.0, np.nan, 11.0],
          }
      )
      tm.assert_frame_equal(result, expected)