test_header.py 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733
  1. """
  2. Tests that the file header is properly handled or inferred
  3. during parsing for all of the parsers defined in parsers.py
  4. """
  5. from collections import namedtuple
  6. from io import StringIO
  7. import numpy as np
  8. import pytest
  9. from pandas.errors import ParserError
  10. from pandas import (
  11. DataFrame,
  12. Index,
  13. MultiIndex,
  14. )
  15. import pandas._testing as tm
# Applied to every test in this module: ignore the DeprecationWarning whose
# message starts with "Passing a BlockManager to DataFrame".
pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

# Decorators that attach the "pyarrow_xfail" / "pyarrow_skip" fixtures to a test.
# NOTE(review): both fixtures are defined outside this file; presumably they
# xfail / skip the test when the pyarrow engine is selected -- confirm in conftest.
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
  21. @xfail_pyarrow # TypeError: an integer is required
  22. def test_read_with_bad_header(all_parsers):
  23. parser = all_parsers
  24. msg = r"but only \d+ lines in file"
  25. with pytest.raises(ValueError, match=msg):
  26. s = StringIO(",,")
  27. parser.read_csv(s, header=[10])
  28. def test_negative_header(all_parsers):
  29. # see gh-27779
  30. parser = all_parsers
  31. data = """1,2,3,4,5
  32. 6,7,8,9,10
  33. 11,12,13,14,15
  34. """
  35. with pytest.raises(
  36. ValueError,
  37. match="Passing negative integer to header is invalid. "
  38. "For no header, use header=None instead",
  39. ):
  40. parser.read_csv(StringIO(data), header=-1)
  41. @pytest.mark.parametrize("header", [([-1, 2, 4]), ([-5, 0])])
  42. def test_negative_multi_index_header(all_parsers, header):
  43. # see gh-27779
  44. parser = all_parsers
  45. data = """1,2,3,4,5
  46. 6,7,8,9,10
  47. 11,12,13,14,15
  48. """
  49. with pytest.raises(
  50. ValueError, match="cannot specify multi-index header with negative integers"
  51. ):
  52. parser.read_csv(StringIO(data), header=header)
  53. @pytest.mark.parametrize("header", [True, False])
  54. def test_bool_header_arg(all_parsers, header):
  55. # see gh-6114
  56. parser = all_parsers
  57. data = """\
  58. MyColumn
  59. a
  60. b
  61. a
  62. b"""
  63. msg = "Passing a bool to header is invalid"
  64. with pytest.raises(TypeError, match=msg):
  65. parser.read_csv(StringIO(data), header=header)
  66. @xfail_pyarrow # AssertionError: DataFrame are different
  67. def test_header_with_index_col(all_parsers):
  68. parser = all_parsers
  69. data = """foo,1,2,3
  70. bar,4,5,6
  71. baz,7,8,9
  72. """
  73. names = ["A", "B", "C"]
  74. result = parser.read_csv(StringIO(data), names=names)
  75. expected = DataFrame(
  76. [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
  77. index=["foo", "bar", "baz"],
  78. columns=["A", "B", "C"],
  79. )
  80. tm.assert_frame_equal(result, expected)
  81. def test_header_not_first_line(all_parsers):
  82. parser = all_parsers
  83. data = """got,to,ignore,this,line
  84. got,to,ignore,this,line
  85. index,A,B,C,D
  86. foo,2,3,4,5
  87. bar,7,8,9,10
  88. baz,12,13,14,15
  89. """
  90. data2 = """index,A,B,C,D
  91. foo,2,3,4,5
  92. bar,7,8,9,10
  93. baz,12,13,14,15
  94. """
  95. result = parser.read_csv(StringIO(data), header=2, index_col=0)
  96. expected = parser.read_csv(StringIO(data2), header=0, index_col=0)
  97. tm.assert_frame_equal(result, expected)
  98. @xfail_pyarrow # TypeError: an integer is required
  99. def test_header_multi_index(all_parsers):
  100. parser = all_parsers
  101. data = """\
  102. C0,,C_l0_g0,C_l0_g1,C_l0_g2
  103. C1,,C_l1_g0,C_l1_g1,C_l1_g2
  104. C2,,C_l2_g0,C_l2_g1,C_l2_g2
  105. C3,,C_l3_g0,C_l3_g1,C_l3_g2
  106. R0,R1,,,
  107. R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
  108. R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
  109. R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
  110. R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
  111. R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
  112. """
  113. result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1])
  114. data_gen_f = lambda r, c: f"R{r}C{c}"
  115. data = [[data_gen_f(r, c) for c in range(3)] for r in range(5)]
  116. index = MultiIndex.from_arrays(
  117. [[f"R_l0_g{i}" for i in range(5)], [f"R_l1_g{i}" for i in range(5)]],
  118. names=["R0", "R1"],
  119. )
  120. columns = MultiIndex.from_arrays(
  121. [
  122. [f"C_l0_g{i}" for i in range(3)],
  123. [f"C_l1_g{i}" for i in range(3)],
  124. [f"C_l2_g{i}" for i in range(3)],
  125. [f"C_l3_g{i}" for i in range(3)],
  126. ],
  127. names=["C0", "C1", "C2", "C3"],
  128. )
  129. expected = DataFrame(data, columns=columns, index=index)
  130. tm.assert_frame_equal(result, expected)
  131. @pytest.mark.parametrize(
  132. "kwargs,msg",
  133. [
  134. (
  135. {"index_col": ["foo", "bar"]},
  136. (
  137. "index_col must only contain "
  138. "row numbers when specifying "
  139. "a multi-index header"
  140. ),
  141. ),
  142. (
  143. {"index_col": [0, 1], "names": ["foo", "bar"]},
  144. ("cannot specify names when specifying a multi-index header"),
  145. ),
  146. (
  147. {"index_col": [0, 1], "usecols": ["foo", "bar"]},
  148. ("cannot specify usecols when specifying a multi-index header"),
  149. ),
  150. ],
  151. )
  152. def test_header_multi_index_invalid(all_parsers, kwargs, msg):
  153. data = """\
  154. C0,,C_l0_g0,C_l0_g1,C_l0_g2
  155. C1,,C_l1_g0,C_l1_g1,C_l1_g2
  156. C2,,C_l2_g0,C_l2_g1,C_l2_g2
  157. C3,,C_l3_g0,C_l3_g1,C_l3_g2
  158. R0,R1,,,
  159. R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
  160. R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
  161. R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
  162. R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
  163. R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
  164. """
  165. parser = all_parsers
  166. with pytest.raises(ValueError, match=msg):
  167. parser.read_csv(StringIO(data), header=[0, 1, 2, 3], **kwargs)
# Two-field namedtuple used in the parametrized ``names`` lists of the
# test_header_multi_index_common_format* tests, alongside plain tuples.
_TestTuple = namedtuple("_TestTuple", ["first", "second"])
  169. @xfail_pyarrow # TypeError: an integer is required
  170. @pytest.mark.parametrize(
  171. "kwargs",
  172. [
  173. {"header": [0, 1]},
  174. {
  175. "skiprows": 3,
  176. "names": [
  177. ("a", "q"),
  178. ("a", "r"),
  179. ("a", "s"),
  180. ("b", "t"),
  181. ("c", "u"),
  182. ("c", "v"),
  183. ],
  184. },
  185. {
  186. "skiprows": 3,
  187. "names": [
  188. _TestTuple("a", "q"),
  189. _TestTuple("a", "r"),
  190. _TestTuple("a", "s"),
  191. _TestTuple("b", "t"),
  192. _TestTuple("c", "u"),
  193. _TestTuple("c", "v"),
  194. ],
  195. },
  196. ],
  197. )
  198. def test_header_multi_index_common_format1(all_parsers, kwargs):
  199. parser = all_parsers
  200. expected = DataFrame(
  201. [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
  202. index=["one", "two"],
  203. columns=MultiIndex.from_tuples(
  204. [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
  205. ),
  206. )
  207. data = """,a,a,a,b,c,c
  208. ,q,r,s,t,u,v
  209. ,,,,,,
  210. one,1,2,3,4,5,6
  211. two,7,8,9,10,11,12"""
  212. result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
  213. tm.assert_frame_equal(result, expected)
  214. @xfail_pyarrow # TypeError: an integer is required
  215. @pytest.mark.parametrize(
  216. "kwargs",
  217. [
  218. {"header": [0, 1]},
  219. {
  220. "skiprows": 2,
  221. "names": [
  222. ("a", "q"),
  223. ("a", "r"),
  224. ("a", "s"),
  225. ("b", "t"),
  226. ("c", "u"),
  227. ("c", "v"),
  228. ],
  229. },
  230. {
  231. "skiprows": 2,
  232. "names": [
  233. _TestTuple("a", "q"),
  234. _TestTuple("a", "r"),
  235. _TestTuple("a", "s"),
  236. _TestTuple("b", "t"),
  237. _TestTuple("c", "u"),
  238. _TestTuple("c", "v"),
  239. ],
  240. },
  241. ],
  242. )
  243. def test_header_multi_index_common_format2(all_parsers, kwargs):
  244. parser = all_parsers
  245. expected = DataFrame(
  246. [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
  247. index=["one", "two"],
  248. columns=MultiIndex.from_tuples(
  249. [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
  250. ),
  251. )
  252. data = """,a,a,a,b,c,c
  253. ,q,r,s,t,u,v
  254. one,1,2,3,4,5,6
  255. two,7,8,9,10,11,12"""
  256. result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
  257. tm.assert_frame_equal(result, expected)
  258. @xfail_pyarrow # TypeError: an integer is required
  259. @pytest.mark.parametrize(
  260. "kwargs",
  261. [
  262. {"header": [0, 1]},
  263. {
  264. "skiprows": 2,
  265. "names": [
  266. ("a", "q"),
  267. ("a", "r"),
  268. ("a", "s"),
  269. ("b", "t"),
  270. ("c", "u"),
  271. ("c", "v"),
  272. ],
  273. },
  274. {
  275. "skiprows": 2,
  276. "names": [
  277. _TestTuple("a", "q"),
  278. _TestTuple("a", "r"),
  279. _TestTuple("a", "s"),
  280. _TestTuple("b", "t"),
  281. _TestTuple("c", "u"),
  282. _TestTuple("c", "v"),
  283. ],
  284. },
  285. ],
  286. )
  287. def test_header_multi_index_common_format3(all_parsers, kwargs):
  288. parser = all_parsers
  289. expected = DataFrame(
  290. [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
  291. index=["one", "two"],
  292. columns=MultiIndex.from_tuples(
  293. [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
  294. ),
  295. )
  296. expected = expected.reset_index(drop=True)
  297. data = """a,a,a,b,c,c
  298. q,r,s,t,u,v
  299. 1,2,3,4,5,6
  300. 7,8,9,10,11,12"""
  301. result = parser.read_csv(StringIO(data), index_col=None, **kwargs)
  302. tm.assert_frame_equal(result, expected)
  303. @xfail_pyarrow # TypeError: an integer is required
  304. def test_header_multi_index_common_format_malformed1(all_parsers):
  305. parser = all_parsers
  306. expected = DataFrame(
  307. np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
  308. index=Index([1, 7]),
  309. columns=MultiIndex(
  310. levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]],
  311. codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
  312. names=["a", "q"],
  313. ),
  314. )
  315. data = """a,a,a,b,c,c
  316. q,r,s,t,u,v
  317. 1,2,3,4,5,6
  318. 7,8,9,10,11,12"""
  319. result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
  320. tm.assert_frame_equal(expected, result)
  321. @xfail_pyarrow # TypeError: an integer is required
  322. def test_header_multi_index_common_format_malformed2(all_parsers):
  323. parser = all_parsers
  324. expected = DataFrame(
  325. np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
  326. index=Index([1, 7]),
  327. columns=MultiIndex(
  328. levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]],
  329. codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
  330. names=[None, "q"],
  331. ),
  332. )
  333. data = """,a,a,b,c,c
  334. q,r,s,t,u,v
  335. 1,2,3,4,5,6
  336. 7,8,9,10,11,12"""
  337. result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
  338. tm.assert_frame_equal(expected, result)
  339. @xfail_pyarrow # TypeError: an integer is required
  340. def test_header_multi_index_common_format_malformed3(all_parsers):
  341. parser = all_parsers
  342. expected = DataFrame(
  343. np.array([[3, 4, 5, 6], [9, 10, 11, 12]], dtype="int64"),
  344. index=MultiIndex(levels=[[1, 7], [2, 8]], codes=[[0, 1], [0, 1]]),
  345. columns=MultiIndex(
  346. levels=[["a", "b", "c"], ["s", "t", "u", "v"]],
  347. codes=[[0, 1, 2, 2], [0, 1, 2, 3]],
  348. names=[None, "q"],
  349. ),
  350. )
  351. data = """,a,a,b,c,c
  352. q,r,s,t,u,v
  353. 1,2,3,4,5,6
  354. 7,8,9,10,11,12"""
  355. result = parser.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1])
  356. tm.assert_frame_equal(expected, result)
  357. @xfail_pyarrow # TypeError: an integer is required
  358. def test_header_multi_index_blank_line(all_parsers):
  359. # GH 40442
  360. parser = all_parsers
  361. data = [[None, None], [1, 2], [3, 4]]
  362. columns = MultiIndex.from_tuples([("a", "A"), ("b", "B")])
  363. expected = DataFrame(data, columns=columns)
  364. data = "a,b\nA,B\n,\n1,2\n3,4"
  365. result = parser.read_csv(StringIO(data), header=[0, 1])
  366. tm.assert_frame_equal(expected, result)
  367. @pytest.mark.parametrize(
  368. "data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)]
  369. )
  370. def test_header_names_backward_compat(all_parsers, data, header, request):
  371. # see gh-2539
  372. parser = all_parsers
  373. if parser.engine == "pyarrow" and header is not None:
  374. mark = pytest.mark.xfail(reason="DataFrame.columns are different")
  375. request.applymarker(mark)
  376. expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), names=["a", "b", "c"])
  377. result = parser.read_csv(StringIO(data), names=["a", "b", "c"], header=header)
  378. tm.assert_frame_equal(result, expected)
  379. @skip_pyarrow # CSV parse error: Empty CSV file or block: cannot infer
  380. @pytest.mark.parametrize("kwargs", [{}, {"index_col": False}])
  381. def test_read_only_header_no_rows(all_parsers, kwargs):
  382. # See gh-7773
  383. parser = all_parsers
  384. expected = DataFrame(columns=["a", "b", "c"])
  385. result = parser.read_csv(StringIO("a,b,c"), **kwargs)
  386. tm.assert_frame_equal(result, expected)
  387. @pytest.mark.parametrize(
  388. "kwargs,names",
  389. [
  390. ({}, [0, 1, 2, 3, 4]),
  391. (
  392. {"names": ["foo", "bar", "baz", "quux", "panda"]},
  393. ["foo", "bar", "baz", "quux", "panda"],
  394. ),
  395. ],
  396. )
  397. def test_no_header(all_parsers, kwargs, names):
  398. parser = all_parsers
  399. data = """1,2,3,4,5
  400. 6,7,8,9,10
  401. 11,12,13,14,15
  402. """
  403. expected = DataFrame(
  404. [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], columns=names
  405. )
  406. result = parser.read_csv(StringIO(data), header=None, **kwargs)
  407. tm.assert_frame_equal(result, expected)
  408. @pytest.mark.parametrize("header", [["a", "b"], "string_header"])
  409. def test_non_int_header(all_parsers, header):
  410. # see gh-16338
  411. msg = "header must be integer or list of integers"
  412. data = """1,2\n3,4"""
  413. parser = all_parsers
  414. with pytest.raises(ValueError, match=msg):
  415. parser.read_csv(StringIO(data), header=header)
  416. @xfail_pyarrow # TypeError: an integer is required
  417. def test_singleton_header(all_parsers):
  418. # see gh-7757
  419. data = """a,b,c\n0,1,2\n1,2,3"""
  420. parser = all_parsers
  421. expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
  422. result = parser.read_csv(StringIO(data), header=[0])
  423. tm.assert_frame_equal(result, expected)
  424. @xfail_pyarrow # TypeError: an integer is required
  425. @pytest.mark.parametrize(
  426. "data,expected",
  427. [
  428. (
  429. "A,A,A,B\none,one,one,two\n0,40,34,0.1",
  430. DataFrame(
  431. [[0, 40, 34, 0.1]],
  432. columns=MultiIndex.from_tuples(
  433. [("A", "one"), ("A", "one.1"), ("A", "one.2"), ("B", "two")]
  434. ),
  435. ),
  436. ),
  437. (
  438. "A,A,A,B\none,one,one.1,two\n0,40,34,0.1",
  439. DataFrame(
  440. [[0, 40, 34, 0.1]],
  441. columns=MultiIndex.from_tuples(
  442. [("A", "one"), ("A", "one.1"), ("A", "one.1.1"), ("B", "two")]
  443. ),
  444. ),
  445. ),
  446. (
  447. "A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1",
  448. DataFrame(
  449. [[0, 40, 34, 0.1, 0.1]],
  450. columns=MultiIndex.from_tuples(
  451. [
  452. ("A", "one"),
  453. ("A", "one.1"),
  454. ("A", "one.1.1"),
  455. ("B", "two"),
  456. ("B", "two.1"),
  457. ]
  458. ),
  459. ),
  460. ),
  461. ],
  462. )
  463. def test_mangles_multi_index(all_parsers, data, expected):
  464. # see gh-18062
  465. parser = all_parsers
  466. result = parser.read_csv(StringIO(data), header=[0, 1])
  467. tm.assert_frame_equal(result, expected)
  468. @xfail_pyarrow # TypeError: an integer is requireds
  469. @pytest.mark.parametrize("index_col", [None, [0]])
  470. @pytest.mark.parametrize(
  471. "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])]
  472. )
  473. def test_multi_index_unnamed(all_parsers, index_col, columns):
  474. # see gh-23687
  475. #
  476. # When specifying a multi-index header, make sure that
  477. # we don't error just because one of the rows in our header
  478. # has ALL column names containing the string "Unnamed". The
  479. # correct condition to check is whether the row contains
  480. # ALL columns that did not have names (and instead were given
  481. # placeholder ones).
  482. parser = all_parsers
  483. header = [0, 1]
  484. if index_col is None:
  485. data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n"
  486. else:
  487. data = ",".join([""] + (columns or ["", ""])) + "\n,0,1\n0,2,3\n1,4,5\n"
  488. result = parser.read_csv(StringIO(data), header=header, index_col=index_col)
  489. exp_columns = []
  490. if columns is None:
  491. columns = ["", "", ""]
  492. for i, col in enumerate(columns):
  493. if not col: # Unnamed.
  494. col = f"Unnamed: {i if index_col is None else i + 1}_level_0"
  495. exp_columns.append(col)
  496. columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"]))
  497. expected = DataFrame([[2, 3], [4, 5]], columns=columns)
  498. tm.assert_frame_equal(result, expected)
  499. @skip_pyarrow # CSV parse error: Expected 2 columns, got 3
  500. def test_names_longer_than_header_but_equal_with_data_rows(all_parsers):
  501. # GH#38453
  502. parser = all_parsers
  503. data = """a, b
  504. 1,2,3
  505. 5,6,4
  506. """
  507. result = parser.read_csv(StringIO(data), header=0, names=["A", "B", "C"])
  508. expected = DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 4]})
  509. tm.assert_frame_equal(result, expected)
  510. @xfail_pyarrow # TypeError: an integer is required
  511. def test_read_csv_multiindex_columns(all_parsers):
  512. # GH#6051
  513. parser = all_parsers
  514. s1 = "Male, Male, Male, Female, Female\nR, R, L, R, R\n.86, .67, .88, .78, .81"
  515. s2 = (
  516. "Male, Male, Male, Female, Female\n"
  517. "R, R, L, R, R\n"
  518. ".86, .67, .88, .78, .81\n"
  519. ".86, .67, .88, .78, .82"
  520. )
  521. mi = MultiIndex.from_tuples(
  522. [
  523. ("Male", "R"),
  524. (" Male", " R"),
  525. (" Male", " L"),
  526. (" Female", " R"),
  527. (" Female", " R.1"),
  528. ]
  529. )
  530. expected = DataFrame(
  531. [[0.86, 0.67, 0.88, 0.78, 0.81], [0.86, 0.67, 0.88, 0.78, 0.82]], columns=mi
  532. )
  533. df1 = parser.read_csv(StringIO(s1), header=[0, 1])
  534. tm.assert_frame_equal(df1, expected.iloc[:1])
  535. df2 = parser.read_csv(StringIO(s2), header=[0, 1])
  536. tm.assert_frame_equal(df2, expected)
  537. @xfail_pyarrow # TypeError: an integer is required
  538. def test_read_csv_multi_header_length_check(all_parsers):
  539. # GH#43102
  540. parser = all_parsers
  541. case = """row11,row12,row13
  542. row21,row22, row23
  543. row31,row32
  544. """
  545. with pytest.raises(
  546. ParserError, match="Header rows must have an equal number of columns."
  547. ):
  548. parser.read_csv(StringIO(case), header=[0, 2])
  549. @skip_pyarrow # CSV parse error: Expected 3 columns, got 2
  550. def test_header_none_and_implicit_index(all_parsers):
  551. # GH#22144
  552. parser = all_parsers
  553. data = "x,1,5\ny,2\nz,3\n"
  554. result = parser.read_csv(StringIO(data), names=["a", "b"], header=None)
  555. expected = DataFrame(
  556. {"a": [1, 2, 3], "b": [5, np.nan, np.nan]}, index=["x", "y", "z"]
  557. )
  558. tm.assert_frame_equal(result, expected)
  559. @skip_pyarrow # regex mismatch "CSV parse error: Expected 2 columns, got "
  560. def test_header_none_and_implicit_index_in_second_row(all_parsers):
  561. # GH#22144
  562. parser = all_parsers
  563. data = "x,1\ny,2,5\nz,3\n"
  564. with pytest.raises(ParserError, match="Expected 2 fields in line 2, saw 3"):
  565. parser.read_csv(StringIO(data), names=["a", "b"], header=None)
  566. def test_header_none_and_on_bad_lines_skip(all_parsers):
  567. # GH#22144
  568. parser = all_parsers
  569. data = "x,1\ny,2,5\nz,3\n"
  570. result = parser.read_csv(
  571. StringIO(data), names=["a", "b"], header=None, on_bad_lines="skip"
  572. )
  573. expected = DataFrame({"a": ["x", "z"], "b": [1, 3]})
  574. tm.assert_frame_equal(result, expected)
  575. @xfail_pyarrow # TypeError: an integer is requireds
  576. def test_header_missing_rows(all_parsers):
  577. # GH#47400
  578. parser = all_parsers
  579. data = """a,b
  580. 1,2
  581. """
  582. msg = r"Passed header=\[0,1,2\], len of 3, but only 2 lines in file"
  583. with pytest.raises(ValueError, match=msg):
  584. parser.read_csv(StringIO(data), header=[0, 1, 2])
  585. # ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine
  586. @xfail_pyarrow
  587. def test_header_multiple_whitespaces(all_parsers):
  588. # GH#54931
  589. parser = all_parsers
  590. data = """aa bb(1,1) cc(1,1)
  591. 0 2 3.5"""
  592. result = parser.read_csv(StringIO(data), sep=r"\s+")
  593. expected = DataFrame({"aa": [0], "bb(1,1)": 2, "cc(1,1)": 3.5})
  594. tm.assert_frame_equal(result, expected)
  595. # ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine
  596. @xfail_pyarrow
  597. def test_header_delim_whitespace(all_parsers):
  598. # GH#54918
  599. parser = all_parsers
  600. data = """a,b
  601. 1,2
  602. 3,4
  603. """
  604. depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
  605. with tm.assert_produces_warning(
  606. FutureWarning, match=depr_msg, check_stacklevel=False
  607. ):
  608. result = parser.read_csv(StringIO(data), delim_whitespace=True)
  609. expected = DataFrame({"a,b": ["1,2", "3,4"]})
  610. tm.assert_frame_equal(result, expected)
  611. def test_usecols_no_header_pyarrow(pyarrow_parser_only):
  612. parser = pyarrow_parser_only
  613. data = """
  614. a,i,x
  615. b,j,y
  616. """
  617. result = parser.read_csv(
  618. StringIO(data),
  619. header=None,
  620. usecols=[0, 1],
  621. dtype="string[pyarrow]",
  622. dtype_backend="pyarrow",
  623. engine="pyarrow",
  624. )
  625. expected = DataFrame([["a", "i"], ["b", "j"]], dtype="string[pyarrow]")
  626. tm.assert_frame_equal(result, expected)