test_skiprows.py 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. """
  2. Tests that skipped rows are properly handled during
  3. parsing for all of the parsers defined in parsers.py
  4. """
  5. from datetime import datetime
  6. from io import StringIO
  7. import numpy as np
  8. import pytest
  9. from pandas.errors import EmptyDataError
  10. from pandas import (
  11. DataFrame,
  12. Index,
  13. )
  14. import pandas._testing as tm
# Mark that attaches the ``pyarrow_xfail`` fixture to a test.  The fixture is
# defined outside this file; presumably it turns the test into an expected
# failure when the pyarrow engine is in use -- confirm against the conftest.
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")

# Module-wide mark: ignore the DeprecationWarning whose message starts with
# "Passing a BlockManager to DataFrame" for every test in this file.
pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
  19. @xfail_pyarrow # ValueError: skiprows argument must be an integer
  20. @pytest.mark.parametrize("skiprows", [list(range(6)), 6])
  21. def test_skip_rows_bug(all_parsers, skiprows):
  22. # see gh-505
  23. parser = all_parsers
  24. text = """#foo,a,b,c
  25. #foo,a,b,c
  26. #foo,a,b,c
  27. #foo,a,b,c
  28. #foo,a,b,c
  29. #foo,a,b,c
  30. 1/1/2000,1.,2.,3.
  31. 1/2/2000,4,5,6
  32. 1/3/2000,7,8,9
  33. """
  34. result = parser.read_csv(
  35. StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True
  36. )
  37. index = Index(
  38. [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0
  39. )
  40. expected = DataFrame(
  41. np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index
  42. )
  43. tm.assert_frame_equal(result, expected)
  44. @xfail_pyarrow # ValueError: skiprows argument must be an integer
  45. def test_deep_skip_rows(all_parsers):
  46. # see gh-4382
  47. parser = all_parsers
  48. data = "a,b,c\n" + "\n".join(
  49. [",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)]
  50. )
  51. condensed_data = "a,b,c\n" + "\n".join(
  52. [",".join([str(i), str(i + 1), str(i + 2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]]
  53. )
  54. result = parser.read_csv(StringIO(data), skiprows=[6, 8])
  55. condensed_result = parser.read_csv(StringIO(condensed_data))
  56. tm.assert_frame_equal(result, condensed_result)
  57. @xfail_pyarrow # AssertionError: DataFrame are different
  58. def test_skip_rows_blank(all_parsers):
  59. # see gh-9832
  60. parser = all_parsers
  61. text = """#foo,a,b,c
  62. #foo,a,b,c
  63. #foo,a,b,c
  64. #foo,a,b,c
  65. 1/1/2000,1.,2.,3.
  66. 1/2/2000,4,5,6
  67. 1/3/2000,7,8,9
  68. """
  69. data = parser.read_csv(
  70. StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True
  71. )
  72. index = Index(
  73. [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0
  74. )
  75. expected = DataFrame(
  76. np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index
  77. )
  78. tm.assert_frame_equal(data, expected)
  79. @pytest.mark.parametrize(
  80. "data,kwargs,expected",
  81. [
  82. (
  83. """id,text,num_lines
  84. 1,"line 11
  85. line 12",2
  86. 2,"line 21
  87. line 22",2
  88. 3,"line 31",1""",
  89. {"skiprows": [1]},
  90. DataFrame(
  91. [[2, "line 21\nline 22", 2], [3, "line 31", 1]],
  92. columns=["id", "text", "num_lines"],
  93. ),
  94. ),
  95. (
  96. "a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~",
  97. {"quotechar": "~", "skiprows": [2]},
  98. DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]),
  99. ),
  100. (
  101. (
  102. "Text,url\n~example\n "
  103. "sentence\n one~,url1\n~"
  104. "example\n sentence\n two~,url2\n~"
  105. "example\n sentence\n three~,url3"
  106. ),
  107. {"quotechar": "~", "skiprows": [1, 3]},
  108. DataFrame([["example\n sentence\n two", "url2"]], columns=["Text", "url"]),
  109. ),
  110. ],
  111. )
  112. @xfail_pyarrow # ValueError: skiprows argument must be an integer
  113. def test_skip_row_with_newline(all_parsers, data, kwargs, expected):
  114. # see gh-12775 and gh-10911
  115. parser = all_parsers
  116. result = parser.read_csv(StringIO(data), **kwargs)
  117. tm.assert_frame_equal(result, expected)
  118. @xfail_pyarrow # ValueError: skiprows argument must be an integer
  119. def test_skip_row_with_quote(all_parsers):
  120. # see gh-12775 and gh-10911
  121. parser = all_parsers
  122. data = """id,text,num_lines
  123. 1,"line '11' line 12",2
  124. 2,"line '21' line 22",2
  125. 3,"line '31' line 32",1"""
  126. exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]]
  127. expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])
  128. result = parser.read_csv(StringIO(data), skiprows=[1])
  129. tm.assert_frame_equal(result, expected)
  130. @pytest.mark.parametrize(
  131. "data,exp_data",
  132. [
  133. (
  134. """id,text,num_lines
  135. 1,"line \n'11' line 12",2
  136. 2,"line \n'21' line 22",2
  137. 3,"line \n'31' line 32",1""",
  138. [[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]],
  139. ),
  140. (
  141. """id,text,num_lines
  142. 1,"line '11\n' line 12",2
  143. 2,"line '21\n' line 22",2
  144. 3,"line '31\n' line 32",1""",
  145. [[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]],
  146. ),
  147. (
  148. """id,text,num_lines
  149. 1,"line '11\n' \r\tline 12",2
  150. 2,"line '21\n' \r\tline 22",2
  151. 3,"line '31\n' \r\tline 32",1""",
  152. [[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]],
  153. ),
  154. ],
  155. )
  156. @xfail_pyarrow # ValueError: skiprows argument must be an integer
  157. def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data):
  158. # see gh-12775 and gh-10911
  159. parser = all_parsers
  160. result = parser.read_csv(StringIO(data), skiprows=[1])
  161. expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])
  162. tm.assert_frame_equal(result, expected)
  163. @xfail_pyarrow # ValueError: The 'delim_whitespace' option is not supported
  164. @pytest.mark.parametrize(
  165. "lineterminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR"
  166. )
  167. def test_skiprows_lineterminator(all_parsers, lineterminator, request):
  168. # see gh-9079
  169. parser = all_parsers
  170. data = "\n".join(
  171. [
  172. "SMOSMANIA ThetaProbe-ML2X ",
  173. "2007/01/01 01:00 0.2140 U M ",
  174. "2007/01/01 02:00 0.2141 M O ",
  175. "2007/01/01 04:00 0.2142 D M ",
  176. ]
  177. )
  178. expected = DataFrame(
  179. [
  180. ["2007/01/01", "01:00", 0.2140, "U", "M"],
  181. ["2007/01/01", "02:00", 0.2141, "M", "O"],
  182. ["2007/01/01", "04:00", 0.2142, "D", "M"],
  183. ],
  184. columns=["date", "time", "var", "flag", "oflag"],
  185. )
  186. if parser.engine == "python" and lineterminator == "\r":
  187. mark = pytest.mark.xfail(reason="'CR' not respect with the Python parser yet")
  188. request.applymarker(mark)
  189. data = data.replace("\n", lineterminator)
  190. depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
  191. with tm.assert_produces_warning(
  192. FutureWarning, match=depr_msg, check_stacklevel=False
  193. ):
  194. result = parser.read_csv(
  195. StringIO(data),
  196. skiprows=1,
  197. delim_whitespace=True,
  198. names=["date", "time", "var", "flag", "oflag"],
  199. )
  200. tm.assert_frame_equal(result, expected)
  201. @xfail_pyarrow # AssertionError: DataFrame are different
  202. def test_skiprows_infield_quote(all_parsers):
  203. # see gh-14459
  204. parser = all_parsers
  205. data = 'a"\nb"\na\n1'
  206. expected = DataFrame({"a": [1]})
  207. result = parser.read_csv(StringIO(data), skiprows=2)
  208. tm.assert_frame_equal(result, expected)
  209. @xfail_pyarrow # ValueError: skiprows argument must be an integer
  210. @pytest.mark.parametrize(
  211. "kwargs,expected",
  212. [
  213. ({}, DataFrame({"1": [3, 5]})),
  214. ({"header": 0, "names": ["foo"]}, DataFrame({"foo": [3, 5]})),
  215. ],
  216. )
  217. def test_skip_rows_callable(all_parsers, kwargs, expected):
  218. parser = all_parsers
  219. data = "a\n1\n2\n3\n4\n5"
  220. result = parser.read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs)
  221. tm.assert_frame_equal(result, expected)
  222. @xfail_pyarrow # ValueError: skiprows argument must be an integer
  223. def test_skip_rows_callable_not_in(all_parsers):
  224. parser = all_parsers
  225. data = "0,a\n1,b\n2,c\n3,d\n4,e"
  226. expected = DataFrame([[1, "b"], [3, "d"]])
  227. result = parser.read_csv(
  228. StringIO(data), header=None, skiprows=lambda x: x not in [1, 3]
  229. )
  230. tm.assert_frame_equal(result, expected)
  231. @xfail_pyarrow # ValueError: skiprows argument must be an integer
  232. def test_skip_rows_skip_all(all_parsers):
  233. parser = all_parsers
  234. data = "a\n1\n2\n3\n4\n5"
  235. msg = "No columns to parse from file"
  236. with pytest.raises(EmptyDataError, match=msg):
  237. parser.read_csv(StringIO(data), skiprows=lambda x: True)
  238. @xfail_pyarrow # ValueError: skiprows argument must be an integer
  239. def test_skip_rows_bad_callable(all_parsers):
  240. msg = "by zero"
  241. parser = all_parsers
  242. data = "a\n1\n2\n3\n4\n5"
  243. with pytest.raises(ZeroDivisionError, match=msg):
  244. parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0)
  245. @xfail_pyarrow # ValueError: skiprows argument must be an integer
  246. def test_skip_rows_and_n_rows(all_parsers):
  247. # GH#44021
  248. data = """a,b
  249. 1,a
  250. 2,b
  251. 3,c
  252. 4,d
  253. 5,e
  254. 6,f
  255. 7,g
  256. 8,h
  257. """
  258. parser = all_parsers
  259. result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6])
  260. expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]})
  261. tm.assert_frame_equal(result, expected)
  262. @xfail_pyarrow
  263. def test_skip_rows_with_chunks(all_parsers):
  264. # GH 55677
  265. data = """col_a
  266. 10
  267. 20
  268. 30
  269. 40
  270. 50
  271. 60
  272. 70
  273. 80
  274. 90
  275. 100
  276. """
  277. parser = all_parsers
  278. reader = parser.read_csv(
  279. StringIO(data), engine=parser, skiprows=lambda x: x in [1, 4, 5], chunksize=4
  280. )
  281. df1 = next(reader)
  282. df2 = next(reader)
  283. tm.assert_frame_equal(df1, DataFrame({"col_a": [20, 30, 60, 70]}))
  284. tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6]))