test_readers.py 61 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735
  1. from __future__ import annotations
  2. from datetime import (
  3. datetime,
  4. time,
  5. )
  6. from functools import partial
  7. from io import BytesIO
  8. import os
  9. from pathlib import Path
  10. import platform
  11. import re
  12. from urllib.error import URLError
  13. from zipfile import BadZipFile
  14. import numpy as np
  15. import pytest
  16. from pandas.compat import is_platform_windows
  17. import pandas.util._test_decorators as td
  18. import pandas as pd
  19. from pandas import (
  20. DataFrame,
  21. Index,
  22. MultiIndex,
  23. Series,
  24. read_csv,
  25. )
  26. import pandas._testing as tm
  27. if is_platform_windows():
  28. pytestmark = pytest.mark.single_cpu
  29. read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"]
  30. engine_params = [
  31. # Add any engines to test here
  32. # When defusedxml is installed it triggers deprecation warnings for
  33. # xlrd and openpyxl, so catch those here
  34. pytest.param(
  35. "xlrd",
  36. marks=[
  37. td.skip_if_no("xlrd"),
  38. ],
  39. ),
  40. pytest.param(
  41. "openpyxl",
  42. marks=[
  43. td.skip_if_no("openpyxl"),
  44. ],
  45. ),
  46. pytest.param(
  47. None,
  48. marks=[
  49. td.skip_if_no("xlrd"),
  50. ],
  51. ),
  52. pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")),
  53. pytest.param("odf", marks=td.skip_if_no("odf")),
  54. pytest.param("calamine", marks=td.skip_if_no("python_calamine")),
  55. ]
  56. def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool:
  57. """
  58. Filter out invalid (engine, ext) pairs instead of skipping, as that
  59. produces 500+ pytest.skips.
  60. """
  61. engine = engine.values[0]
  62. if engine == "openpyxl" and read_ext == ".xls":
  63. return False
  64. if engine == "odf" and read_ext != ".ods":
  65. return False
  66. if read_ext == ".ods" and engine not in {"odf", "calamine"}:
  67. return False
  68. if engine == "pyxlsb" and read_ext != ".xlsb":
  69. return False
  70. if read_ext == ".xlsb" and engine not in {"pyxlsb", "calamine"}:
  71. return False
  72. if engine == "xlrd" and read_ext != ".xls":
  73. return False
  74. return True
  75. def _transfer_marks(engine, read_ext):
  76. """
  77. engine gives us a pytest.param object with some marks, read_ext is just
  78. a string. We need to generate a new pytest.param inheriting the marks.
  79. """
  80. values = engine.values + (read_ext,)
  81. new_param = pytest.param(values, marks=engine.marks)
  82. return new_param
  83. @pytest.fixture(
  84. params=[
  85. _transfer_marks(eng, ext)
  86. for eng in engine_params
  87. for ext in read_ext_params
  88. if _is_valid_engine_ext_pair(eng, ext)
  89. ],
  90. ids=str,
  91. )
  92. def engine_and_read_ext(request):
  93. """
  94. Fixture for Excel reader engine and read_ext, only including valid pairs.
  95. """
  96. return request.param
  97. @pytest.fixture
  98. def engine(engine_and_read_ext):
  99. engine, read_ext = engine_and_read_ext
  100. return engine
  101. @pytest.fixture
  102. def read_ext(engine_and_read_ext):
  103. engine, read_ext = engine_and_read_ext
  104. return read_ext
  105. @pytest.fixture
  106. def df_ref(datapath):
  107. """
  108. Obtain the reference data from read_csv with the Python engine.
  109. """
  110. filepath = datapath("io", "data", "csv", "test1.csv")
  111. df_ref = read_csv(filepath, index_col=0, parse_dates=True, engine="python")
  112. return df_ref
  113. def get_exp_unit(read_ext: str, engine: str | None) -> str:
  114. return "ns"
  115. def adjust_expected(expected: DataFrame, read_ext: str, engine: str) -> None:
  116. expected.index.name = None
  117. unit = get_exp_unit(read_ext, engine)
  118. # error: "Index" has no attribute "as_unit"
  119. expected.index = expected.index.as_unit(unit) # type: ignore[attr-defined]
  120. def xfail_datetimes_with_pyxlsb(engine, request):
  121. if engine == "pyxlsb":
  122. request.applymarker(
  123. pytest.mark.xfail(
  124. reason="Sheets containing datetimes not supported by pyxlsb"
  125. )
  126. )
  127. class TestReaders:
  128. @pytest.fixture(autouse=True)
  129. def cd_and_set_engine(self, engine, datapath, monkeypatch):
  130. """
  131. Change directory and set engine for read_excel calls.
  132. """
  133. func = partial(pd.read_excel, engine=engine)
  134. monkeypatch.chdir(datapath("io", "data", "excel"))
  135. monkeypatch.setattr(pd, "read_excel", func)
  136. def test_engine_used(self, read_ext, engine, monkeypatch):
  137. # GH 38884
  138. def parser(self, *args, **kwargs):
  139. return self.engine
  140. monkeypatch.setattr(pd.ExcelFile, "parse", parser)
  141. expected_defaults = {
  142. "xlsx": "openpyxl",
  143. "xlsm": "openpyxl",
  144. "xlsb": "pyxlsb",
  145. "xls": "xlrd",
  146. "ods": "odf",
  147. }
  148. with open("test1" + read_ext, "rb") as f:
  149. result = pd.read_excel(f)
  150. if engine is not None:
  151. expected = engine
  152. else:
  153. expected = expected_defaults[read_ext[1:]]
  154. assert result == expected
  155. def test_engine_kwargs(self, read_ext, engine):
  156. # GH#52214
  157. expected_defaults = {
  158. "xlsx": {"foo": "abcd"},
  159. "xlsm": {"foo": 123},
  160. "xlsb": {"foo": "True"},
  161. "xls": {"foo": True},
  162. "ods": {"foo": "abcd"},
  163. }
  164. if engine in {"xlrd", "pyxlsb"}:
  165. msg = re.escape(r"open_workbook() got an unexpected keyword argument 'foo'")
  166. elif engine == "odf":
  167. msg = re.escape(r"load() got an unexpected keyword argument 'foo'")
  168. else:
  169. msg = re.escape(r"load_workbook() got an unexpected keyword argument 'foo'")
  170. if engine is not None:
  171. with pytest.raises(TypeError, match=msg):
  172. pd.read_excel(
  173. "test1" + read_ext,
  174. sheet_name="Sheet1",
  175. index_col=0,
  176. engine_kwargs=expected_defaults[read_ext[1:]],
  177. )
  178. def test_usecols_int(self, read_ext):
  179. # usecols as int
  180. msg = "Passing an integer for `usecols`"
  181. with pytest.raises(ValueError, match=msg):
  182. pd.read_excel(
  183. "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=3
  184. )
  185. # usecols as int
  186. with pytest.raises(ValueError, match=msg):
  187. pd.read_excel(
  188. "test1" + read_ext,
  189. sheet_name="Sheet2",
  190. skiprows=[1],
  191. index_col=0,
  192. usecols=3,
  193. )
  194. def test_usecols_list(self, request, engine, read_ext, df_ref):
  195. xfail_datetimes_with_pyxlsb(engine, request)
  196. expected = df_ref[["B", "C"]]
  197. adjust_expected(expected, read_ext, engine)
  198. df1 = pd.read_excel(
  199. "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=[0, 2, 3]
  200. )
  201. df2 = pd.read_excel(
  202. "test1" + read_ext,
  203. sheet_name="Sheet2",
  204. skiprows=[1],
  205. index_col=0,
  206. usecols=[0, 2, 3],
  207. )
  208. # TODO add index to xls file)
  209. tm.assert_frame_equal(df1, expected)
  210. tm.assert_frame_equal(df2, expected)
  211. def test_usecols_str(self, request, engine, read_ext, df_ref):
  212. xfail_datetimes_with_pyxlsb(engine, request)
  213. expected = df_ref[["A", "B", "C"]]
  214. adjust_expected(expected, read_ext, engine)
  215. df2 = pd.read_excel(
  216. "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A:D"
  217. )
  218. df3 = pd.read_excel(
  219. "test1" + read_ext,
  220. sheet_name="Sheet2",
  221. skiprows=[1],
  222. index_col=0,
  223. usecols="A:D",
  224. )
  225. # TODO add index to xls, read xls ignores index name ?
  226. tm.assert_frame_equal(df2, expected)
  227. tm.assert_frame_equal(df3, expected)
  228. expected = df_ref[["B", "C"]]
  229. adjust_expected(expected, read_ext, engine)
  230. df2 = pd.read_excel(
  231. "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C,D"
  232. )
  233. df3 = pd.read_excel(
  234. "test1" + read_ext,
  235. sheet_name="Sheet2",
  236. skiprows=[1],
  237. index_col=0,
  238. usecols="A,C,D",
  239. )
  240. # TODO add index to xls file
  241. tm.assert_frame_equal(df2, expected)
  242. tm.assert_frame_equal(df3, expected)
  243. df2 = pd.read_excel(
  244. "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C:D"
  245. )
  246. df3 = pd.read_excel(
  247. "test1" + read_ext,
  248. sheet_name="Sheet2",
  249. skiprows=[1],
  250. index_col=0,
  251. usecols="A,C:D",
  252. )
  253. tm.assert_frame_equal(df2, expected)
  254. tm.assert_frame_equal(df3, expected)
  255. @pytest.mark.parametrize(
  256. "usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]]
  257. )
  258. def test_usecols_diff_positional_int_columns_order(
  259. self, request, engine, read_ext, usecols, df_ref
  260. ):
  261. xfail_datetimes_with_pyxlsb(engine, request)
  262. expected = df_ref[["A", "C"]]
  263. adjust_expected(expected, read_ext, engine)
  264. result = pd.read_excel(
  265. "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=usecols
  266. )
  267. tm.assert_frame_equal(result, expected)
  268. @pytest.mark.parametrize("usecols", [["B", "D"], ["D", "B"]])
  269. def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_ref):
  270. expected = df_ref[["B", "D"]]
  271. expected.index = range(len(expected))
  272. result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols=usecols)
  273. tm.assert_frame_equal(result, expected)
  274. def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref):
  275. xfail_datetimes_with_pyxlsb(engine, request)
  276. expected = df_ref
  277. adjust_expected(expected, read_ext, engine)
  278. result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0)
  279. tm.assert_frame_equal(result, expected)
  280. def test_usecols_excel_range_str(self, request, engine, read_ext, df_ref):
  281. xfail_datetimes_with_pyxlsb(engine, request)
  282. expected = df_ref[["C", "D"]]
  283. adjust_expected(expected, read_ext, engine)
  284. result = pd.read_excel(
  285. "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,D:E"
  286. )
  287. tm.assert_frame_equal(result, expected)
  288. def test_usecols_excel_range_str_invalid(self, read_ext):
  289. msg = "Invalid column name: E1"
  290. with pytest.raises(ValueError, match=msg):
  291. pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols="D:E1")
  292. def test_index_col_label_error(self, read_ext):
  293. msg = "list indices must be integers.*, not str"
  294. with pytest.raises(TypeError, match=msg):
  295. pd.read_excel(
  296. "test1" + read_ext,
  297. sheet_name="Sheet1",
  298. index_col=["A"],
  299. usecols=["A", "C"],
  300. )
  301. def test_index_col_str(self, read_ext):
  302. # see gh-52716
  303. result = pd.read_excel("test1" + read_ext, sheet_name="Sheet3", index_col="A")
  304. expected = DataFrame(
  305. columns=["B", "C", "D", "E", "F"], index=Index([], name="A")
  306. )
  307. tm.assert_frame_equal(result, expected)
  308. def test_index_col_empty(self, read_ext):
  309. # see gh-9208
  310. result = pd.read_excel(
  311. "test1" + read_ext, sheet_name="Sheet3", index_col=["A", "B", "C"]
  312. )
  313. expected = DataFrame(
  314. columns=["D", "E", "F"],
  315. index=MultiIndex(levels=[[]] * 3, codes=[[]] * 3, names=["A", "B", "C"]),
  316. )
  317. tm.assert_frame_equal(result, expected)
  318. @pytest.mark.parametrize("index_col", [None, 2])
  319. def test_index_col_with_unnamed(self, read_ext, index_col):
  320. # see gh-18792
  321. result = pd.read_excel(
  322. "test1" + read_ext, sheet_name="Sheet4", index_col=index_col
  323. )
  324. expected = DataFrame(
  325. [["i1", "a", "x"], ["i2", "b", "y"]], columns=["Unnamed: 0", "col1", "col2"]
  326. )
  327. if index_col:
  328. expected = expected.set_index(expected.columns[index_col])
  329. tm.assert_frame_equal(result, expected)
  330. def test_usecols_pass_non_existent_column(self, read_ext):
  331. msg = (
  332. "Usecols do not match columns, "
  333. "columns expected but not found: "
  334. r"\['E'\]"
  335. )
  336. with pytest.raises(ValueError, match=msg):
  337. pd.read_excel("test1" + read_ext, usecols=["E"])
  338. def test_usecols_wrong_type(self, read_ext):
  339. msg = (
  340. "'usecols' must either be list-like of "
  341. "all strings, all unicode, all integers or a callable."
  342. )
  343. with pytest.raises(ValueError, match=msg):
  344. pd.read_excel("test1" + read_ext, usecols=["E1", 0])
  345. def test_excel_stop_iterator(self, read_ext):
  346. parsed = pd.read_excel("test2" + read_ext, sheet_name="Sheet1")
  347. expected = DataFrame([["aaaa", "bbbbb"]], columns=["Test", "Test1"])
  348. tm.assert_frame_equal(parsed, expected)
  349. def test_excel_cell_error_na(self, request, engine, read_ext):
  350. xfail_datetimes_with_pyxlsb(engine, request)
  351. # https://github.com/tafia/calamine/issues/355
  352. if engine == "calamine" and read_ext == ".ods":
  353. request.applymarker(
  354. pytest.mark.xfail(reason="Calamine can't extract error from ods files")
  355. )
  356. parsed = pd.read_excel("test3" + read_ext, sheet_name="Sheet1")
  357. expected = DataFrame([[np.nan]], columns=["Test"])
  358. tm.assert_frame_equal(parsed, expected)
  359. def test_excel_table(self, request, engine, read_ext, df_ref):
  360. xfail_datetimes_with_pyxlsb(engine, request)
  361. expected = df_ref
  362. adjust_expected(expected, read_ext, engine)
  363. df1 = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0)
  364. df2 = pd.read_excel(
  365. "test1" + read_ext, sheet_name="Sheet2", skiprows=[1], index_col=0
  366. )
  367. # TODO add index to file
  368. tm.assert_frame_equal(df1, expected)
  369. tm.assert_frame_equal(df2, expected)
  370. df3 = pd.read_excel(
  371. "test1" + read_ext, sheet_name="Sheet1", index_col=0, skipfooter=1
  372. )
  373. tm.assert_frame_equal(df3, df1.iloc[:-1])
  374. def test_reader_special_dtypes(self, request, engine, read_ext):
  375. xfail_datetimes_with_pyxlsb(engine, request)
  376. unit = get_exp_unit(read_ext, engine)
  377. expected = DataFrame.from_dict(
  378. {
  379. "IntCol": [1, 2, -3, 4, 0],
  380. "FloatCol": [1.25, 2.25, 1.83, 1.92, 0.0000000005],
  381. "BoolCol": [True, False, True, True, False],
  382. "StrCol": [1, 2, 3, 4, 5],
  383. "Str2Col": ["a", 3, "c", "d", "e"],
  384. "DateCol": Index(
  385. [
  386. datetime(2013, 10, 30),
  387. datetime(2013, 10, 31),
  388. datetime(1905, 1, 1),
  389. datetime(2013, 12, 14),
  390. datetime(2015, 3, 14),
  391. ],
  392. dtype=f"M8[{unit}]",
  393. ),
  394. },
  395. )
  396. basename = "test_types"
  397. # should read in correctly and infer types
  398. actual = pd.read_excel(basename + read_ext, sheet_name="Sheet1")
  399. tm.assert_frame_equal(actual, expected)
  400. # if not coercing number, then int comes in as float
  401. float_expected = expected.copy()
  402. float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0
  403. actual = pd.read_excel(basename + read_ext, sheet_name="Sheet1")
  404. tm.assert_frame_equal(actual, float_expected)
  405. # check setting Index (assuming xls and xlsx are the same here)
  406. for icol, name in enumerate(expected.columns):
  407. actual = pd.read_excel(
  408. basename + read_ext, sheet_name="Sheet1", index_col=icol
  409. )
  410. exp = expected.set_index(name)
  411. tm.assert_frame_equal(actual, exp)
  412. expected["StrCol"] = expected["StrCol"].apply(str)
  413. actual = pd.read_excel(
  414. basename + read_ext, sheet_name="Sheet1", converters={"StrCol": str}
  415. )
  416. tm.assert_frame_equal(actual, expected)
  417. # GH8212 - support for converters and missing values
  418. def test_reader_converters(self, read_ext):
  419. basename = "test_converters"
  420. expected = DataFrame.from_dict(
  421. {
  422. "IntCol": [1, 2, -3, -1000, 0],
  423. "FloatCol": [12.5, np.nan, 18.3, 19.2, 0.000000005],
  424. "BoolCol": ["Found", "Found", "Found", "Not found", "Found"],
  425. "StrCol": ["1", np.nan, "3", "4", "5"],
  426. }
  427. )
  428. converters = {
  429. "IntCol": lambda x: int(x) if x != "" else -1000,
  430. "FloatCol": lambda x: 10 * x if x else np.nan,
  431. 2: lambda x: "Found" if x != "" else "Not found",
  432. 3: lambda x: str(x) if x else "",
  433. }
  434. # should read in correctly and set types of single cells (not array
  435. # dtypes)
  436. actual = pd.read_excel(
  437. basename + read_ext, sheet_name="Sheet1", converters=converters
  438. )
  439. tm.assert_frame_equal(actual, expected)
  440. def test_reader_dtype(self, read_ext):
  441. # GH 8212
  442. basename = "testdtype"
  443. actual = pd.read_excel(basename + read_ext)
  444. expected = DataFrame(
  445. {
  446. "a": [1, 2, 3, 4],
  447. "b": [2.5, 3.5, 4.5, 5.5],
  448. "c": [1, 2, 3, 4],
  449. "d": [1.0, 2.0, np.nan, 4.0],
  450. }
  451. )
  452. tm.assert_frame_equal(actual, expected)
  453. actual = pd.read_excel(
  454. basename + read_ext, dtype={"a": "float64", "b": "float32", "c": str}
  455. )
  456. expected["a"] = expected["a"].astype("float64")
  457. expected["b"] = expected["b"].astype("float32")
  458. expected["c"] = Series(["001", "002", "003", "004"], dtype="str")
  459. tm.assert_frame_equal(actual, expected)
  460. msg = "Unable to convert column d to type int64"
  461. with pytest.raises(ValueError, match=msg):
  462. pd.read_excel(basename + read_ext, dtype={"d": "int64"})
  463. @pytest.mark.parametrize(
  464. "dtype,expected",
  465. [
  466. (
  467. None,
  468. DataFrame(
  469. {
  470. "a": [1, 2, 3, 4],
  471. "b": [2.5, 3.5, 4.5, 5.5],
  472. "c": [1, 2, 3, 4],
  473. "d": [1.0, 2.0, np.nan, 4.0],
  474. }
  475. ),
  476. ),
  477. (
  478. {"a": "float64", "b": "float32", "c": str, "d": str},
  479. DataFrame(
  480. {
  481. "a": Series([1, 2, 3, 4], dtype="float64"),
  482. "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"),
  483. "c": Series(["001", "002", "003", "004"], dtype="str"),
  484. "d": Series(["1", "2", np.nan, "4"], dtype="str"),
  485. },
  486. ),
  487. ),
  488. ],
  489. )
  490. def test_reader_dtype_str(self, read_ext, dtype, expected):
  491. # see gh-20377
  492. basename = "testdtype"
  493. actual = pd.read_excel(basename + read_ext, dtype=dtype)
  494. tm.assert_frame_equal(actual, expected)
  495. def test_dtype_backend(self, read_ext, dtype_backend, engine):
  496. # GH#36712
  497. if read_ext in (".xlsb", ".xls"):
  498. pytest.skip(f"No engine for filetype: '{read_ext}'")
  499. df = DataFrame(
  500. {
  501. "a": Series([1, 3], dtype="Int64"),
  502. "b": Series([2.5, 4.5], dtype="Float64"),
  503. "c": Series([True, False], dtype="boolean"),
  504. "d": Series(["a", "b"], dtype="string"),
  505. "e": Series([pd.NA, 6], dtype="Int64"),
  506. "f": Series([pd.NA, 7.5], dtype="Float64"),
  507. "g": Series([pd.NA, True], dtype="boolean"),
  508. "h": Series([pd.NA, "a"], dtype="string"),
  509. "i": Series([pd.Timestamp("2019-12-31")] * 2),
  510. "j": Series([pd.NA, pd.NA], dtype="Int64"),
  511. }
  512. )
  513. with tm.ensure_clean(read_ext) as file_path:
  514. df.to_excel(file_path, sheet_name="test", index=False)
  515. result = pd.read_excel(
  516. file_path, sheet_name="test", dtype_backend=dtype_backend
  517. )
  518. if dtype_backend == "pyarrow":
  519. import pyarrow as pa
  520. from pandas.arrays import ArrowExtensionArray
  521. expected = DataFrame(
  522. {
  523. col: ArrowExtensionArray(pa.array(df[col], from_pandas=True))
  524. for col in df.columns
  525. }
  526. )
  527. # pyarrow by default infers timestamp resolution as us, not ns
  528. expected["i"] = ArrowExtensionArray(
  529. expected["i"].array._pa_array.cast(pa.timestamp(unit="us"))
  530. )
  531. # pyarrow supports a null type, so don't have to default to Int64
  532. expected["j"] = ArrowExtensionArray(pa.array([None, None]))
  533. else:
  534. expected = df
  535. unit = get_exp_unit(read_ext, engine)
  536. expected["i"] = expected["i"].astype(f"M8[{unit}]")
  537. tm.assert_frame_equal(result, expected)
  538. def test_dtype_backend_and_dtype(self, read_ext):
  539. # GH#36712
  540. if read_ext in (".xlsb", ".xls"):
  541. pytest.skip(f"No engine for filetype: '{read_ext}'")
  542. df = DataFrame({"a": [np.nan, 1.0], "b": [2.5, np.nan]})
  543. with tm.ensure_clean(read_ext) as file_path:
  544. df.to_excel(file_path, sheet_name="test", index=False)
  545. result = pd.read_excel(
  546. file_path,
  547. sheet_name="test",
  548. dtype_backend="numpy_nullable",
  549. dtype="float64",
  550. )
  551. tm.assert_frame_equal(result, df)
  552. def test_dtype_backend_string(self, read_ext, string_storage):
  553. # GH#36712
  554. if read_ext in (".xlsb", ".xls"):
  555. pytest.skip(f"No engine for filetype: '{read_ext}'")
  556. with pd.option_context("mode.string_storage", string_storage):
  557. df = DataFrame(
  558. {
  559. "a": np.array(["a", "b"], dtype=np.object_),
  560. "b": np.array(["x", pd.NA], dtype=np.object_),
  561. }
  562. )
  563. with tm.ensure_clean(read_ext) as file_path:
  564. df.to_excel(file_path, sheet_name="test", index=False)
  565. result = pd.read_excel(
  566. file_path, sheet_name="test", dtype_backend="numpy_nullable"
  567. )
  568. expected = DataFrame(
  569. {
  570. "a": Series(["a", "b"], dtype=pd.StringDtype(string_storage)),
  571. "b": Series(["x", None], dtype=pd.StringDtype(string_storage)),
  572. }
  573. )
  574. # the storage of the str columns' Index is also affected by the
  575. # string_storage setting -> ignore that for checking the result
  576. tm.assert_frame_equal(result, expected, check_column_type=False)
  577. @pytest.mark.parametrize("dtypes, exp_value", [({}, 1), ({"a.1": "int64"}, 1)])
  578. def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value):
  579. # GH#35211
  580. basename = "df_mangle_dup_col_dtypes"
  581. dtype_dict = {"a": object, **dtypes}
  582. dtype_dict_copy = dtype_dict.copy()
  583. # GH#42462
  584. result = pd.read_excel(basename + read_ext, dtype=dtype_dict)
  585. expected = DataFrame(
  586. {
  587. "a": Series([1], dtype=object),
  588. "a.1": Series([exp_value], dtype=object if not dtypes else None),
  589. }
  590. )
  591. assert dtype_dict == dtype_dict_copy, "dtype dict changed"
  592. tm.assert_frame_equal(result, expected)
  593. def test_reader_spaces(self, read_ext):
  594. # see gh-32207
  595. basename = "test_spaces"
  596. actual = pd.read_excel(basename + read_ext)
  597. expected = DataFrame(
  598. {
  599. "testcol": [
  600. "this is great",
  601. "4 spaces",
  602. "1 trailing ",
  603. " 1 leading",
  604. "2 spaces multiple times",
  605. ]
  606. }
  607. )
  608. tm.assert_frame_equal(actual, expected)
  609. # gh-36122, gh-35802
  610. @pytest.mark.parametrize(
  611. "basename,expected",
  612. [
  613. ("gh-35802", DataFrame({"COLUMN": ["Test (1)"]})),
  614. ("gh-36122", DataFrame(columns=["got 2nd sa"])),
  615. ],
  616. )
  617. def test_read_excel_ods_nested_xml(self, engine, read_ext, basename, expected):
  618. # see gh-35802
  619. if engine != "odf":
  620. pytest.skip(f"Skipped for engine: {engine}")
  621. actual = pd.read_excel(basename + read_ext)
  622. tm.assert_frame_equal(actual, expected)
  623. def test_reading_all_sheets(self, read_ext):
  624. # Test reading all sheet names by setting sheet_name to None,
  625. # Ensure a dict is returned.
  626. # See PR #9450
  627. basename = "test_multisheet"
  628. dfs = pd.read_excel(basename + read_ext, sheet_name=None)
  629. # ensure this is not alphabetical to test order preservation
  630. expected_keys = ["Charlie", "Alpha", "Beta"]
  631. tm.assert_contains_all(expected_keys, dfs.keys())
  632. # Issue 9930
  633. # Ensure sheet order is preserved
  634. assert expected_keys == list(dfs.keys())
  635. def test_reading_multiple_specific_sheets(self, read_ext):
  636. # Test reading specific sheet names by specifying a mixed list
  637. # of integers and strings, and confirm that duplicated sheet
  638. # references (positions/names) are removed properly.
  639. # Ensure a dict is returned
  640. # See PR #9450
  641. basename = "test_multisheet"
  642. # Explicitly request duplicates. Only the set should be returned.
  643. expected_keys = [2, "Charlie", "Charlie"]
  644. dfs = pd.read_excel(basename + read_ext, sheet_name=expected_keys)
  645. expected_keys = list(set(expected_keys))
  646. tm.assert_contains_all(expected_keys, dfs.keys())
  647. assert len(expected_keys) == len(dfs.keys())
  648. def test_reading_all_sheets_with_blank(self, read_ext):
  649. # Test reading all sheet names by setting sheet_name to None,
  650. # In the case where some sheets are blank.
  651. # Issue #11711
  652. basename = "blank_with_header"
  653. dfs = pd.read_excel(basename + read_ext, sheet_name=None)
  654. expected_keys = ["Sheet1", "Sheet2", "Sheet3"]
  655. tm.assert_contains_all(expected_keys, dfs.keys())
  656. # GH6403
  657. def test_read_excel_blank(self, read_ext):
  658. actual = pd.read_excel("blank" + read_ext, sheet_name="Sheet1")
  659. tm.assert_frame_equal(actual, DataFrame())
  660. def test_read_excel_blank_with_header(self, read_ext):
  661. expected = DataFrame(columns=["col_1", "col_2"])
  662. actual = pd.read_excel("blank_with_header" + read_ext, sheet_name="Sheet1")
  663. tm.assert_frame_equal(actual, expected)
  664. def test_exception_message_includes_sheet_name(self, read_ext):
  665. # GH 48706
  666. with pytest.raises(ValueError, match=r" \(sheet: Sheet1\)$"):
  667. pd.read_excel("blank_with_header" + read_ext, header=[1], sheet_name=None)
  668. with pytest.raises(ZeroDivisionError, match=r" \(sheet: Sheet1\)$"):
  669. pd.read_excel("test1" + read_ext, usecols=lambda x: 1 / 0, sheet_name=None)
  670. @pytest.mark.filterwarnings("ignore:Cell A4 is marked:UserWarning:openpyxl")
  671. def test_date_conversion_overflow(self, request, engine, read_ext):
  672. # GH 10001 : pandas.ExcelFile ignore parse_dates=False
  673. xfail_datetimes_with_pyxlsb(engine, request)
  674. expected = DataFrame(
  675. [
  676. [pd.Timestamp("2016-03-12"), "Marc Johnson"],
  677. [pd.Timestamp("2016-03-16"), "Jack Black"],
  678. [1e20, "Timothy Brown"],
  679. ],
  680. columns=["DateColWithBigInt", "StringCol"],
  681. )
  682. if engine == "openpyxl":
  683. request.applymarker(
  684. pytest.mark.xfail(reason="Maybe not supported by openpyxl")
  685. )
  686. if engine is None and read_ext in (".xlsx", ".xlsm"):
  687. # GH 35029
  688. request.applymarker(
  689. pytest.mark.xfail(reason="Defaults to openpyxl, maybe not supported")
  690. )
  691. result = pd.read_excel("testdateoverflow" + read_ext)
  692. tm.assert_frame_equal(result, expected)
  693. def test_sheet_name(self, request, read_ext, engine, df_ref):
  694. xfail_datetimes_with_pyxlsb(engine, request)
  695. filename = "test1"
  696. sheet_name = "Sheet1"
  697. expected = df_ref
  698. adjust_expected(expected, read_ext, engine)
  699. df1 = pd.read_excel(
  700. filename + read_ext, sheet_name=sheet_name, index_col=0
  701. ) # doc
  702. df2 = pd.read_excel(filename + read_ext, index_col=0, sheet_name=sheet_name)
  703. tm.assert_frame_equal(df1, expected)
  704. tm.assert_frame_equal(df2, expected)
  705. def test_excel_read_buffer(self, read_ext):
  706. pth = "test1" + read_ext
  707. expected = pd.read_excel(pth, sheet_name="Sheet1", index_col=0)
  708. with open(pth, "rb") as f:
  709. actual = pd.read_excel(f, sheet_name="Sheet1", index_col=0)
  710. tm.assert_frame_equal(expected, actual)
  711. def test_bad_engine_raises(self):
  712. bad_engine = "foo"
  713. with pytest.raises(ValueError, match="Unknown engine: foo"):
  714. pd.read_excel("", engine=bad_engine)
  715. @pytest.mark.parametrize(
  716. "sheet_name",
  717. [3, [0, 3], [3, 0], "Sheet4", ["Sheet1", "Sheet4"], ["Sheet4", "Sheet1"]],
  718. )
  719. def test_bad_sheetname_raises(self, read_ext, sheet_name):
  720. # GH 39250
  721. msg = "Worksheet index 3 is invalid|Worksheet named 'Sheet4' not found"
  722. with pytest.raises(ValueError, match=msg):
  723. pd.read_excel("blank" + read_ext, sheet_name=sheet_name)
  724. def test_missing_file_raises(self, read_ext):
  725. bad_file = f"foo{read_ext}"
  726. # CI tests with other languages, translates to "No such file or directory"
  727. match = "|".join(
  728. [
  729. "(No such file or directory",
  730. "没有那个文件或目录",
  731. "File o directory non esistente)",
  732. ]
  733. )
  734. with pytest.raises(FileNotFoundError, match=match):
  735. pd.read_excel(bad_file)
  736. def test_corrupt_bytes_raises(self, engine):
  737. bad_stream = b"foo"
  738. if engine is None:
  739. error = ValueError
  740. msg = (
  741. "Excel file format cannot be determined, you must "
  742. "specify an engine manually."
  743. )
  744. elif engine == "xlrd":
  745. from xlrd import XLRDError
  746. error = XLRDError
  747. msg = (
  748. "Unsupported format, or corrupt file: Expected BOF "
  749. "record; found b'foo'"
  750. )
  751. elif engine == "calamine":
  752. from python_calamine import CalamineError
  753. error = CalamineError
  754. msg = "Cannot detect file format"
  755. else:
  756. error = BadZipFile
  757. msg = "File is not a zip file"
  758. with pytest.raises(error, match=msg):
  759. pd.read_excel(BytesIO(bad_stream))
  760. @pytest.mark.network
  761. @pytest.mark.single_cpu
  762. def test_read_from_http_url(self, httpserver, read_ext):
  763. with open("test1" + read_ext, "rb") as f:
  764. httpserver.serve_content(content=f.read())
  765. url_table = pd.read_excel(httpserver.url)
  766. local_table = pd.read_excel("test1" + read_ext)
  767. tm.assert_frame_equal(url_table, local_table)
  768. @td.skip_if_not_us_locale
  769. @pytest.mark.single_cpu
  770. def test_read_from_s3_url(self, read_ext, s3_public_bucket, s3so):
  771. # Bucket created in tests/io/conftest.py
  772. with open("test1" + read_ext, "rb") as f:
  773. s3_public_bucket.put_object(Key="test1" + read_ext, Body=f)
  774. url = f"s3://{s3_public_bucket.name}/test1" + read_ext
  775. url_table = pd.read_excel(url, storage_options=s3so)
  776. local_table = pd.read_excel("test1" + read_ext)
  777. tm.assert_frame_equal(url_table, local_table)
  778. @pytest.mark.single_cpu
  779. def test_read_from_s3_object(self, read_ext, s3_public_bucket, s3so):
  780. # GH 38788
  781. # Bucket created in tests/io/conftest.py
  782. with open("test1" + read_ext, "rb") as f:
  783. s3_public_bucket.put_object(Key="test1" + read_ext, Body=f)
  784. import s3fs
  785. s3 = s3fs.S3FileSystem(**s3so)
  786. with s3.open(f"s3://{s3_public_bucket.name}/test1" + read_ext) as f:
  787. url_table = pd.read_excel(f)
  788. local_table = pd.read_excel("test1" + read_ext)
  789. tm.assert_frame_equal(url_table, local_table)
  790. @pytest.mark.slow
  791. def test_read_from_file_url(self, read_ext, datapath):
  792. # FILE
  793. localtable = os.path.join(datapath("io", "data", "excel"), "test1" + read_ext)
  794. local_table = pd.read_excel(localtable)
  795. try:
  796. url_table = pd.read_excel("file://localhost/" + localtable)
  797. except URLError:
  798. # fails on some systems
  799. platform_info = " ".join(platform.uname()).strip()
  800. pytest.skip(f"failing on {platform_info}")
  801. tm.assert_frame_equal(url_table, local_table)
  802. def test_read_from_pathlib_path(self, read_ext):
  803. # GH12655
  804. str_path = "test1" + read_ext
  805. expected = pd.read_excel(str_path, sheet_name="Sheet1", index_col=0)
  806. path_obj = Path("test1" + read_ext)
  807. actual = pd.read_excel(path_obj, sheet_name="Sheet1", index_col=0)
  808. tm.assert_frame_equal(expected, actual)
  809. @td.skip_if_no("py.path")
  810. def test_read_from_py_localpath(self, read_ext):
  811. # GH12655
  812. from py.path import local as LocalPath
  813. str_path = os.path.join("test1" + read_ext)
  814. expected = pd.read_excel(str_path, sheet_name="Sheet1", index_col=0)
  815. path_obj = LocalPath().join("test1" + read_ext)
  816. actual = pd.read_excel(path_obj, sheet_name="Sheet1", index_col=0)
  817. tm.assert_frame_equal(expected, actual)
  818. def test_close_from_py_localpath(self, read_ext):
  819. # GH31467
  820. str_path = os.path.join("test1" + read_ext)
  821. with open(str_path, "rb") as f:
  822. x = pd.read_excel(f, sheet_name="Sheet1", index_col=0)
  823. del x
  824. # should not throw an exception because the passed file was closed
  825. f.read()
  826. def test_reader_seconds(self, request, engine, read_ext):
  827. xfail_datetimes_with_pyxlsb(engine, request)
  828. # GH 55045
  829. if engine == "calamine" and read_ext == ".ods":
  830. request.applymarker(
  831. pytest.mark.xfail(
  832. reason="ODS file contains bad datetime (seconds as text)"
  833. )
  834. )
  835. # Test reading times with and without milliseconds. GH5945.
  836. expected = DataFrame.from_dict(
  837. {
  838. "Time": [
  839. time(1, 2, 3),
  840. time(2, 45, 56, 100000),
  841. time(4, 29, 49, 200000),
  842. time(6, 13, 42, 300000),
  843. time(7, 57, 35, 400000),
  844. time(9, 41, 28, 500000),
  845. time(11, 25, 21, 600000),
  846. time(13, 9, 14, 700000),
  847. time(14, 53, 7, 800000),
  848. time(16, 37, 0, 900000),
  849. time(18, 20, 54),
  850. ]
  851. }
  852. )
  853. actual = pd.read_excel("times_1900" + read_ext, sheet_name="Sheet1")
  854. tm.assert_frame_equal(actual, expected)
  855. actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1")
  856. tm.assert_frame_equal(actual, expected)
  857. def test_read_excel_multiindex(self, request, engine, read_ext):
  858. # see gh-4679
  859. xfail_datetimes_with_pyxlsb(engine, request)
  860. unit = get_exp_unit(read_ext, engine)
  861. mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]])
  862. mi_file = "testmultiindex" + read_ext
  863. # "mi_column" sheet
  864. expected = DataFrame(
  865. [
  866. [1, 2.5, pd.Timestamp("2015-01-01"), True],
  867. [2, 3.5, pd.Timestamp("2015-01-02"), False],
  868. [3, 4.5, pd.Timestamp("2015-01-03"), False],
  869. [4, 5.5, pd.Timestamp("2015-01-04"), True],
  870. ],
  871. columns=mi,
  872. )
  873. expected[mi[2]] = expected[mi[2]].astype(f"M8[{unit}]")
  874. actual = pd.read_excel(
  875. mi_file, sheet_name="mi_column", header=[0, 1], index_col=0
  876. )
  877. tm.assert_frame_equal(actual, expected)
  878. # "mi_index" sheet
  879. expected.index = mi
  880. expected.columns = ["a", "b", "c", "d"]
  881. actual = pd.read_excel(mi_file, sheet_name="mi_index", index_col=[0, 1])
  882. tm.assert_frame_equal(actual, expected)
  883. # "both" sheet
  884. expected.columns = mi
  885. actual = pd.read_excel(
  886. mi_file, sheet_name="both", index_col=[0, 1], header=[0, 1]
  887. )
  888. tm.assert_frame_equal(actual, expected)
  889. # "mi_index_name" sheet
  890. expected.columns = ["a", "b", "c", "d"]
  891. expected.index = mi.set_names(["ilvl1", "ilvl2"])
  892. actual = pd.read_excel(mi_file, sheet_name="mi_index_name", index_col=[0, 1])
  893. tm.assert_frame_equal(actual, expected)
  894. # "mi_column_name" sheet
  895. expected.index = list(range(4))
  896. expected.columns = mi.set_names(["c1", "c2"])
  897. actual = pd.read_excel(
  898. mi_file, sheet_name="mi_column_name", header=[0, 1], index_col=0
  899. )
  900. tm.assert_frame_equal(actual, expected)
  901. # see gh-11317
  902. # "name_with_int" sheet
  903. expected.columns = mi.set_levels([1, 2], level=1).set_names(["c1", "c2"])
  904. actual = pd.read_excel(
  905. mi_file, sheet_name="name_with_int", index_col=0, header=[0, 1]
  906. )
  907. tm.assert_frame_equal(actual, expected)
  908. # "both_name" sheet
  909. expected.columns = mi.set_names(["c1", "c2"])
  910. expected.index = mi.set_names(["ilvl1", "ilvl2"])
  911. actual = pd.read_excel(
  912. mi_file, sheet_name="both_name", index_col=[0, 1], header=[0, 1]
  913. )
  914. tm.assert_frame_equal(actual, expected)
  915. # "both_skiprows" sheet
  916. actual = pd.read_excel(
  917. mi_file,
  918. sheet_name="both_name_skiprows",
  919. index_col=[0, 1],
  920. header=[0, 1],
  921. skiprows=2,
  922. )
  923. tm.assert_frame_equal(actual, expected)
  924. @pytest.mark.parametrize(
  925. "sheet_name,idx_lvl2",
  926. [
  927. ("both_name_blank_after_mi_name", [np.nan, "b", "a", "b"]),
  928. ("both_name_multiple_blanks", [np.nan] * 4),
  929. ],
  930. )
  931. def test_read_excel_multiindex_blank_after_name(
  932. self, request, engine, read_ext, sheet_name, idx_lvl2
  933. ):
  934. # GH34673
  935. xfail_datetimes_with_pyxlsb(engine, request)
  936. mi_file = "testmultiindex" + read_ext
  937. mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]], names=["c1", "c2"])
  938. unit = get_exp_unit(read_ext, engine)
  939. expected = DataFrame(
  940. [
  941. [1, 2.5, pd.Timestamp("2015-01-01"), True],
  942. [2, 3.5, pd.Timestamp("2015-01-02"), False],
  943. [3, 4.5, pd.Timestamp("2015-01-03"), False],
  944. [4, 5.5, pd.Timestamp("2015-01-04"), True],
  945. ],
  946. columns=mi,
  947. index=MultiIndex.from_arrays(
  948. (["foo", "foo", "bar", "bar"], idx_lvl2),
  949. names=["ilvl1", "ilvl2"],
  950. ),
  951. )
  952. expected[mi[2]] = expected[mi[2]].astype(f"M8[{unit}]")
  953. result = pd.read_excel(
  954. mi_file,
  955. sheet_name=sheet_name,
  956. index_col=[0, 1],
  957. header=[0, 1],
  958. )
  959. tm.assert_frame_equal(result, expected)
  960. def test_read_excel_multiindex_header_only(self, read_ext):
  961. # see gh-11733.
  962. #
  963. # Don't try to parse a header name if there isn't one.
  964. mi_file = "testmultiindex" + read_ext
  965. result = pd.read_excel(mi_file, sheet_name="index_col_none", header=[0, 1])
  966. exp_columns = MultiIndex.from_product([("A", "B"), ("key", "val")])
  967. expected = DataFrame([[1, 2, 3, 4]] * 2, columns=exp_columns)
  968. tm.assert_frame_equal(result, expected)
  969. def test_excel_old_index_format(self, read_ext):
  970. # see gh-4679
  971. filename = "test_index_name_pre17" + read_ext
  972. # We detect headers to determine if index names exist, so
  973. # that "index" name in the "names" version of the data will
  974. # now be interpreted as rows that include null data.
  975. data = np.array(
  976. [
  977. [np.nan, np.nan, np.nan, np.nan, np.nan],
  978. ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"],
  979. ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"],
  980. ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"],
  981. ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"],
  982. ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"],
  983. ],
  984. dtype=object,
  985. )
  986. columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"]
  987. mi = MultiIndex(
  988. levels=[
  989. ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"],
  990. ["R1", "R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"],
  991. ],
  992. codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]],
  993. names=[None, None],
  994. )
  995. si = Index(
  996. ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None
  997. )
  998. expected = DataFrame(data, index=si, columns=columns)
  999. actual = pd.read_excel(filename, sheet_name="single_names", index_col=0)
  1000. tm.assert_frame_equal(actual, expected)
  1001. expected.index = mi
  1002. actual = pd.read_excel(filename, sheet_name="multi_names", index_col=[0, 1])
  1003. tm.assert_frame_equal(actual, expected)
  1004. # The analogous versions of the "names" version data
  1005. # where there are explicitly no names for the indices.
  1006. data = np.array(
  1007. [
  1008. ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"],
  1009. ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"],
  1010. ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"],
  1011. ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"],
  1012. ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"],
  1013. ]
  1014. )
  1015. columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"]
  1016. mi = MultiIndex(
  1017. levels=[
  1018. ["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"],
  1019. ["R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"],
  1020. ],
  1021. codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]],
  1022. names=[None, None],
  1023. )
  1024. si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None)
  1025. expected = DataFrame(data, index=si, columns=columns)
  1026. actual = pd.read_excel(filename, sheet_name="single_no_names", index_col=0)
  1027. tm.assert_frame_equal(actual, expected)
  1028. expected.index = mi
  1029. actual = pd.read_excel(filename, sheet_name="multi_no_names", index_col=[0, 1])
  1030. tm.assert_frame_equal(actual, expected)
  1031. def test_read_excel_bool_header_arg(self, read_ext):
  1032. # GH 6114
  1033. msg = "Passing a bool to header is invalid"
  1034. for arg in [True, False]:
  1035. with pytest.raises(TypeError, match=msg):
  1036. pd.read_excel("test1" + read_ext, header=arg)
  1037. def test_read_excel_skiprows(self, request, engine, read_ext):
  1038. # GH 4903
  1039. xfail_datetimes_with_pyxlsb(engine, request)
  1040. unit = get_exp_unit(read_ext, engine)
  1041. actual = pd.read_excel(
  1042. "testskiprows" + read_ext, sheet_name="skiprows_list", skiprows=[0, 2]
  1043. )
  1044. expected = DataFrame(
  1045. [
  1046. [1, 2.5, pd.Timestamp("2015-01-01"), True],
  1047. [2, 3.5, pd.Timestamp("2015-01-02"), False],
  1048. [3, 4.5, pd.Timestamp("2015-01-03"), False],
  1049. [4, 5.5, pd.Timestamp("2015-01-04"), True],
  1050. ],
  1051. columns=["a", "b", "c", "d"],
  1052. )
  1053. expected["c"] = expected["c"].astype(f"M8[{unit}]")
  1054. tm.assert_frame_equal(actual, expected)
  1055. actual = pd.read_excel(
  1056. "testskiprows" + read_ext,
  1057. sheet_name="skiprows_list",
  1058. skiprows=np.array([0, 2]),
  1059. )
  1060. tm.assert_frame_equal(actual, expected)
  1061. # GH36435
  1062. actual = pd.read_excel(
  1063. "testskiprows" + read_ext,
  1064. sheet_name="skiprows_list",
  1065. skiprows=lambda x: x in [0, 2],
  1066. )
  1067. tm.assert_frame_equal(actual, expected)
  1068. actual = pd.read_excel(
  1069. "testskiprows" + read_ext,
  1070. sheet_name="skiprows_list",
  1071. skiprows=3,
  1072. names=["a", "b", "c", "d"],
  1073. )
  1074. expected = DataFrame(
  1075. [
  1076. # [1, 2.5, pd.Timestamp("2015-01-01"), True],
  1077. [2, 3.5, pd.Timestamp("2015-01-02"), False],
  1078. [3, 4.5, pd.Timestamp("2015-01-03"), False],
  1079. [4, 5.5, pd.Timestamp("2015-01-04"), True],
  1080. ],
  1081. columns=["a", "b", "c", "d"],
  1082. )
  1083. expected["c"] = expected["c"].astype(f"M8[{unit}]")
  1084. tm.assert_frame_equal(actual, expected)
  1085. def test_read_excel_skiprows_callable_not_in(self, request, engine, read_ext):
  1086. # GH 4903
  1087. xfail_datetimes_with_pyxlsb(engine, request)
  1088. unit = get_exp_unit(read_ext, engine)
  1089. actual = pd.read_excel(
  1090. "testskiprows" + read_ext,
  1091. sheet_name="skiprows_list",
  1092. skiprows=lambda x: x not in [1, 3, 5],
  1093. )
  1094. expected = DataFrame(
  1095. [
  1096. [1, 2.5, pd.Timestamp("2015-01-01"), True],
  1097. # [2, 3.5, pd.Timestamp("2015-01-02"), False],
  1098. [3, 4.5, pd.Timestamp("2015-01-03"), False],
  1099. # [4, 5.5, pd.Timestamp("2015-01-04"), True],
  1100. ],
  1101. columns=["a", "b", "c", "d"],
  1102. )
  1103. expected["c"] = expected["c"].astype(f"M8[{unit}]")
  1104. tm.assert_frame_equal(actual, expected)
  1105. def test_read_excel_nrows(self, read_ext):
  1106. # GH 16645
  1107. num_rows_to_pull = 5
  1108. actual = pd.read_excel("test1" + read_ext, nrows=num_rows_to_pull)
  1109. expected = pd.read_excel("test1" + read_ext)
  1110. expected = expected[:num_rows_to_pull]
  1111. tm.assert_frame_equal(actual, expected)
  1112. def test_read_excel_nrows_greater_than_nrows_in_file(self, read_ext):
  1113. # GH 16645
  1114. expected = pd.read_excel("test1" + read_ext)
  1115. num_records_in_file = len(expected)
  1116. num_rows_to_pull = num_records_in_file + 10
  1117. actual = pd.read_excel("test1" + read_ext, nrows=num_rows_to_pull)
  1118. tm.assert_frame_equal(actual, expected)
  1119. def test_read_excel_nrows_non_integer_parameter(self, read_ext):
  1120. # GH 16645
  1121. msg = "'nrows' must be an integer >=0"
  1122. with pytest.raises(ValueError, match=msg):
  1123. pd.read_excel("test1" + read_ext, nrows="5")
  1124. @pytest.mark.parametrize(
  1125. "filename,sheet_name,header,index_col,skiprows",
  1126. [
  1127. ("testmultiindex", "mi_column", [0, 1], 0, None),
  1128. ("testmultiindex", "mi_index", None, [0, 1], None),
  1129. ("testmultiindex", "both", [0, 1], [0, 1], None),
  1130. ("testmultiindex", "mi_column_name", [0, 1], 0, None),
  1131. ("testskiprows", "skiprows_list", None, None, [0, 2]),
  1132. ("testskiprows", "skiprows_list", None, None, lambda x: x in (0, 2)),
  1133. ],
  1134. )
  1135. def test_read_excel_nrows_params(
  1136. self, read_ext, filename, sheet_name, header, index_col, skiprows
  1137. ):
  1138. """
  1139. For various parameters, we should get the same result whether we
  1140. limit the rows during load (nrows=3) or after (df.iloc[:3]).
  1141. """
  1142. # GH 46894
  1143. expected = pd.read_excel(
  1144. filename + read_ext,
  1145. sheet_name=sheet_name,
  1146. header=header,
  1147. index_col=index_col,
  1148. skiprows=skiprows,
  1149. ).iloc[:3]
  1150. actual = pd.read_excel(
  1151. filename + read_ext,
  1152. sheet_name=sheet_name,
  1153. header=header,
  1154. index_col=index_col,
  1155. skiprows=skiprows,
  1156. nrows=3,
  1157. )
  1158. tm.assert_frame_equal(actual, expected)
  1159. def test_deprecated_kwargs(self, read_ext):
  1160. with pytest.raises(TypeError, match="but 3 positional arguments"):
  1161. pd.read_excel("test1" + read_ext, "Sheet1", 0)
  1162. def test_no_header_with_list_index_col(self, read_ext):
  1163. # GH 31783
  1164. file_name = "testmultiindex" + read_ext
  1165. data = [("B", "B"), ("key", "val"), (3, 4), (3, 4)]
  1166. idx = MultiIndex.from_tuples(
  1167. [("A", "A"), ("key", "val"), (1, 2), (1, 2)], names=(0, 1)
  1168. )
  1169. expected = DataFrame(data, index=idx, columns=(2, 3))
  1170. result = pd.read_excel(
  1171. file_name, sheet_name="index_col_none", index_col=[0, 1], header=None
  1172. )
  1173. tm.assert_frame_equal(expected, result)
  1174. def test_one_col_noskip_blank_line(self, read_ext):
  1175. # GH 39808
  1176. file_name = "one_col_blank_line" + read_ext
  1177. data = [0.5, np.nan, 1, 2]
  1178. expected = DataFrame(data, columns=["numbers"])
  1179. result = pd.read_excel(file_name)
  1180. tm.assert_frame_equal(result, expected)
  1181. def test_multiheader_two_blank_lines(self, read_ext):
  1182. # GH 40442
  1183. file_name = "testmultiindex" + read_ext
  1184. columns = MultiIndex.from_tuples([("a", "A"), ("b", "B")])
  1185. data = [[np.nan, np.nan], [np.nan, np.nan], [1, 3], [2, 4]]
  1186. expected = DataFrame(data, columns=columns)
  1187. result = pd.read_excel(
  1188. file_name, sheet_name="mi_column_empty_rows", header=[0, 1]
  1189. )
  1190. tm.assert_frame_equal(result, expected)
  1191. def test_trailing_blanks(self, read_ext):
  1192. """
  1193. Sheets can contain blank cells with no data. Some of our readers
  1194. were including those cells, creating many empty rows and columns
  1195. """
  1196. file_name = "trailing_blanks" + read_ext
  1197. result = pd.read_excel(file_name)
  1198. assert result.shape == (3, 3)
  1199. def test_ignore_chartsheets_by_str(self, request, engine, read_ext):
  1200. # GH 41448
  1201. if read_ext == ".ods":
  1202. pytest.skip("chartsheets do not exist in the ODF format")
  1203. if engine == "pyxlsb":
  1204. request.applymarker(
  1205. pytest.mark.xfail(
  1206. reason="pyxlsb can't distinguish chartsheets from worksheets"
  1207. )
  1208. )
  1209. with pytest.raises(ValueError, match="Worksheet named 'Chart1' not found"):
  1210. pd.read_excel("chartsheet" + read_ext, sheet_name="Chart1")
  1211. def test_ignore_chartsheets_by_int(self, request, engine, read_ext):
  1212. # GH 41448
  1213. if read_ext == ".ods":
  1214. pytest.skip("chartsheets do not exist in the ODF format")
  1215. if engine == "pyxlsb":
  1216. request.applymarker(
  1217. pytest.mark.xfail(
  1218. reason="pyxlsb can't distinguish chartsheets from worksheets"
  1219. )
  1220. )
  1221. with pytest.raises(
  1222. ValueError, match="Worksheet index 1 is invalid, 1 worksheets found"
  1223. ):
  1224. pd.read_excel("chartsheet" + read_ext, sheet_name=1)
  1225. def test_euro_decimal_format(self, read_ext):
  1226. # copied from read_csv
  1227. result = pd.read_excel("test_decimal" + read_ext, decimal=",", skiprows=1)
  1228. expected = DataFrame(
  1229. [
  1230. [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819],
  1231. [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872],
  1232. [3, 878.158, 108013.434, "GHI", "rez", 2.735694704],
  1233. ],
  1234. columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
  1235. )
  1236. tm.assert_frame_equal(result, expected)
  1237. class TestExcelFileRead:
  1238. def test_deprecate_bytes_input(self, engine, read_ext):
  1239. # GH 53830
  1240. msg = (
  1241. "Passing bytes to 'read_excel' is deprecated and "
  1242. "will be removed in a future version. To read from a "
  1243. "byte string, wrap it in a `BytesIO` object."
  1244. )
  1245. with tm.assert_produces_warning(
  1246. FutureWarning, match=msg, raise_on_extra_warnings=False
  1247. ):
  1248. with open("test1" + read_ext, "rb") as f:
  1249. pd.read_excel(f.read(), engine=engine)
  1250. @pytest.fixture(autouse=True)
  1251. def cd_and_set_engine(self, engine, datapath, monkeypatch):
  1252. """
  1253. Change directory and set engine for ExcelFile objects.
  1254. """
  1255. func = partial(pd.ExcelFile, engine=engine)
  1256. monkeypatch.chdir(datapath("io", "data", "excel"))
  1257. monkeypatch.setattr(pd, "ExcelFile", func)
  1258. def test_engine_used(self, read_ext, engine):
  1259. expected_defaults = {
  1260. "xlsx": "openpyxl",
  1261. "xlsm": "openpyxl",
  1262. "xlsb": "pyxlsb",
  1263. "xls": "xlrd",
  1264. "ods": "odf",
  1265. }
  1266. with pd.ExcelFile("test1" + read_ext) as excel:
  1267. result = excel.engine
  1268. if engine is not None:
  1269. expected = engine
  1270. else:
  1271. expected = expected_defaults[read_ext[1:]]
  1272. assert result == expected
  1273. def test_excel_passes_na(self, read_ext):
  1274. with pd.ExcelFile("test4" + read_ext) as excel:
  1275. parsed = pd.read_excel(
  1276. excel, sheet_name="Sheet1", keep_default_na=False, na_values=["apple"]
  1277. )
  1278. expected = DataFrame(
  1279. [["NA"], [1], ["NA"], [np.nan], ["rabbit"]], columns=["Test"]
  1280. )
  1281. tm.assert_frame_equal(parsed, expected)
  1282. with pd.ExcelFile("test4" + read_ext) as excel:
  1283. parsed = pd.read_excel(
  1284. excel, sheet_name="Sheet1", keep_default_na=True, na_values=["apple"]
  1285. )
  1286. expected = DataFrame(
  1287. [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"]
  1288. )
  1289. tm.assert_frame_equal(parsed, expected)
  1290. # 13967
  1291. with pd.ExcelFile("test5" + read_ext) as excel:
  1292. parsed = pd.read_excel(
  1293. excel, sheet_name="Sheet1", keep_default_na=False, na_values=["apple"]
  1294. )
  1295. expected = DataFrame(
  1296. [["1.#QNAN"], [1], ["nan"], [np.nan], ["rabbit"]], columns=["Test"]
  1297. )
  1298. tm.assert_frame_equal(parsed, expected)
  1299. with pd.ExcelFile("test5" + read_ext) as excel:
  1300. parsed = pd.read_excel(
  1301. excel, sheet_name="Sheet1", keep_default_na=True, na_values=["apple"]
  1302. )
  1303. expected = DataFrame(
  1304. [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"]
  1305. )
  1306. tm.assert_frame_equal(parsed, expected)
  1307. @pytest.mark.parametrize("na_filter", [None, True, False])
  1308. def test_excel_passes_na_filter(self, read_ext, na_filter):
  1309. # gh-25453
  1310. kwargs = {}
  1311. if na_filter is not None:
  1312. kwargs["na_filter"] = na_filter
  1313. with pd.ExcelFile("test5" + read_ext) as excel:
  1314. parsed = pd.read_excel(
  1315. excel,
  1316. sheet_name="Sheet1",
  1317. keep_default_na=True,
  1318. na_values=["apple"],
  1319. **kwargs,
  1320. )
  1321. if na_filter is False:
  1322. expected = [["1.#QNAN"], [1], ["nan"], ["apple"], ["rabbit"]]
  1323. else:
  1324. expected = [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]]
  1325. expected = DataFrame(expected, columns=["Test"])
  1326. tm.assert_frame_equal(parsed, expected)
  1327. def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref):
  1328. xfail_datetimes_with_pyxlsb(engine, request)
  1329. expected = df_ref
  1330. adjust_expected(expected, read_ext, engine)
  1331. with pd.ExcelFile("test1" + read_ext) as excel:
  1332. df1 = pd.read_excel(excel, sheet_name=0, index_col=0)
  1333. df2 = pd.read_excel(excel, sheet_name=1, skiprows=[1], index_col=0)
  1334. tm.assert_frame_equal(df1, expected)
  1335. tm.assert_frame_equal(df2, expected)
  1336. with pd.ExcelFile("test1" + read_ext) as excel:
  1337. df1 = excel.parse(0, index_col=0)
  1338. df2 = excel.parse(1, skiprows=[1], index_col=0)
  1339. tm.assert_frame_equal(df1, expected)
  1340. tm.assert_frame_equal(df2, expected)
  1341. with pd.ExcelFile("test1" + read_ext) as excel:
  1342. df3 = pd.read_excel(excel, sheet_name=0, index_col=0, skipfooter=1)
  1343. tm.assert_frame_equal(df3, df1.iloc[:-1])
  1344. with pd.ExcelFile("test1" + read_ext) as excel:
  1345. df3 = excel.parse(0, index_col=0, skipfooter=1)
  1346. tm.assert_frame_equal(df3, df1.iloc[:-1])
  1347. def test_sheet_name(self, request, engine, read_ext, df_ref):
  1348. xfail_datetimes_with_pyxlsb(engine, request)
  1349. expected = df_ref
  1350. adjust_expected(expected, read_ext, engine)
  1351. filename = "test1"
  1352. sheet_name = "Sheet1"
  1353. with pd.ExcelFile(filename + read_ext) as excel:
  1354. df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc
  1355. with pd.ExcelFile(filename + read_ext) as excel:
  1356. df2_parse = excel.parse(index_col=0, sheet_name=sheet_name)
  1357. tm.assert_frame_equal(df1_parse, expected)
  1358. tm.assert_frame_equal(df2_parse, expected)
  1359. @pytest.mark.parametrize(
  1360. "sheet_name",
  1361. [3, [0, 3], [3, 0], "Sheet4", ["Sheet1", "Sheet4"], ["Sheet4", "Sheet1"]],
  1362. )
  1363. def test_bad_sheetname_raises(self, read_ext, sheet_name):
  1364. # GH 39250
  1365. msg = "Worksheet index 3 is invalid|Worksheet named 'Sheet4' not found"
  1366. with pytest.raises(ValueError, match=msg):
  1367. with pd.ExcelFile("blank" + read_ext) as excel:
  1368. excel.parse(sheet_name=sheet_name)
  1369. def test_excel_read_buffer(self, engine, read_ext):
  1370. pth = "test1" + read_ext
  1371. expected = pd.read_excel(pth, sheet_name="Sheet1", index_col=0, engine=engine)
  1372. with open(pth, "rb") as f:
  1373. with pd.ExcelFile(f) as xls:
  1374. actual = pd.read_excel(xls, sheet_name="Sheet1", index_col=0)
  1375. tm.assert_frame_equal(expected, actual)
  1376. def test_reader_closes_file(self, engine, read_ext):
  1377. with open("test1" + read_ext, "rb") as f:
  1378. with pd.ExcelFile(f) as xlsx:
  1379. # parses okay
  1380. pd.read_excel(xlsx, sheet_name="Sheet1", index_col=0, engine=engine)
  1381. assert f.closed
  1382. def test_conflicting_excel_engines(self, read_ext):
  1383. # GH 26566
  1384. msg = "Engine should not be specified when passing an ExcelFile"
  1385. with pd.ExcelFile("test1" + read_ext) as xl:
  1386. with pytest.raises(ValueError, match=msg):
  1387. pd.read_excel(xl, engine="foo")
  1388. def test_excel_read_binary(self, engine, read_ext):
  1389. # GH 15914
  1390. expected = pd.read_excel("test1" + read_ext, engine=engine)
  1391. with open("test1" + read_ext, "rb") as f:
  1392. data = f.read()
  1393. actual = pd.read_excel(BytesIO(data), engine=engine)
  1394. tm.assert_frame_equal(expected, actual)
  1395. def test_excel_read_binary_via_read_excel(self, read_ext, engine):
  1396. # GH 38424
  1397. with open("test1" + read_ext, "rb") as f:
  1398. result = pd.read_excel(f, engine=engine)
  1399. expected = pd.read_excel("test1" + read_ext, engine=engine)
  1400. tm.assert_frame_equal(result, expected)
  1401. def test_read_excel_header_index_out_of_range(self, engine):
  1402. # GH#43143
  1403. with open("df_header_oob.xlsx", "rb") as f:
  1404. with pytest.raises(ValueError, match="exceeds maximum"):
  1405. pd.read_excel(f, header=[0, 1])
  1406. @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"])
  1407. def test_header_with_index_col(self, filename):
  1408. # GH 33476
  1409. idx = Index(["Z"], name="I2")
  1410. cols = MultiIndex.from_tuples([("A", "B"), ("A", "B.1")], names=["I11", "I12"])
  1411. expected = DataFrame([[1, 3]], index=idx, columns=cols, dtype="int64")
  1412. result = pd.read_excel(
  1413. filename, sheet_name="Sheet1", index_col=0, header=[0, 1]
  1414. )
  1415. tm.assert_frame_equal(expected, result)
  1416. def test_read_datetime_multiindex(self, request, engine, read_ext):
  1417. # GH 34748
  1418. xfail_datetimes_with_pyxlsb(engine, request)
  1419. f = "test_datetime_mi" + read_ext
  1420. with pd.ExcelFile(f) as excel:
  1421. actual = pd.read_excel(excel, header=[0, 1], index_col=0, engine=engine)
  1422. unit = get_exp_unit(read_ext, engine)
  1423. dti = pd.DatetimeIndex(["2020-02-29", "2020-03-01"], dtype=f"M8[{unit}]")
  1424. expected_column_index = MultiIndex.from_arrays(
  1425. [dti[:1], dti[1:]],
  1426. names=[
  1427. dti[0].to_pydatetime(),
  1428. dti[1].to_pydatetime(),
  1429. ],
  1430. )
  1431. expected = DataFrame([], index=[], columns=expected_column_index)
  1432. tm.assert_frame_equal(expected, actual)
  1433. def test_engine_invalid_option(self, read_ext):
  1434. # read_ext includes the '.' hence the weird formatting
  1435. with pytest.raises(ValueError, match="Value must be one of *"):
  1436. with pd.option_context(f"io.excel{read_ext}.reader", "abc"):
  1437. pass
  1438. def test_ignore_chartsheets(self, request, engine, read_ext):
  1439. # GH 41448
  1440. if read_ext == ".ods":
  1441. pytest.skip("chartsheets do not exist in the ODF format")
  1442. if engine == "pyxlsb":
  1443. request.applymarker(
  1444. pytest.mark.xfail(
  1445. reason="pyxlsb can't distinguish chartsheets from worksheets"
  1446. )
  1447. )
  1448. with pd.ExcelFile("chartsheet" + read_ext) as excel:
  1449. assert excel.sheet_names == ["Sheet1"]
  1450. def test_corrupt_files_closed(self, engine, read_ext):
  1451. # GH41778
  1452. errors = (BadZipFile,)
  1453. if engine is None:
  1454. pytest.skip(f"Invalid test for engine={engine}")
  1455. elif engine == "xlrd":
  1456. import xlrd
  1457. errors = (BadZipFile, xlrd.biffh.XLRDError)
  1458. elif engine == "calamine":
  1459. from python_calamine import CalamineError
  1460. errors = (CalamineError,)
  1461. with tm.ensure_clean(f"corrupt{read_ext}") as file:
  1462. Path(file).write_text("corrupt", encoding="utf-8")
  1463. with tm.assert_produces_warning(False):
  1464. try:
  1465. pd.ExcelFile(file, engine=engine)
  1466. except errors:
  1467. pass