test_read.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417
  1. from contextlib import closing
  2. from pathlib import Path
  3. import re
  4. import numpy as np
  5. import pytest
  6. from pandas._libs.tslibs import Timestamp
  7. from pandas.compat import is_platform_windows
  8. import pandas as pd
  9. from pandas import (
  10. DataFrame,
  11. HDFStore,
  12. Index,
  13. Series,
  14. _testing as tm,
  15. date_range,
  16. read_hdf,
  17. )
  18. from pandas.tests.io.pytables.common import (
  19. _maybe_remove,
  20. ensure_clean_store,
  21. )
  22. from pandas.util import _test_decorators as td
  23. from pandas.io.pytables import TableIterator
# Module-level pytest marks: every test in this file gets ``single_cpu``.
pytestmark = [pytest.mark.single_cpu]
  25. def test_read_missing_key_close_store(tmp_path, setup_path):
  26. # GH 25766
  27. path = tmp_path / setup_path
  28. df = DataFrame({"a": range(2), "b": range(2)})
  29. df.to_hdf(path, key="k1")
  30. with pytest.raises(KeyError, match="'No object named k2 in the file'"):
  31. read_hdf(path, "k2")
  32. # smoke test to test that file is properly closed after
  33. # read with KeyError before another write
  34. df.to_hdf(path, key="k2")
  35. def test_read_index_error_close_store(tmp_path, setup_path):
  36. # GH 25766
  37. path = tmp_path / setup_path
  38. df = DataFrame({"A": [], "B": []}, index=[])
  39. df.to_hdf(path, key="k1")
  40. with pytest.raises(IndexError, match=r"list index out of range"):
  41. read_hdf(path, "k1", stop=0)
  42. # smoke test to test that file is properly closed after
  43. # read with IndexError before another write
  44. df.to_hdf(path, key="k1")
  45. def test_read_missing_key_opened_store(tmp_path, setup_path):
  46. # GH 28699
  47. path = tmp_path / setup_path
  48. df = DataFrame({"a": range(2), "b": range(2)})
  49. df.to_hdf(path, key="k1")
  50. with HDFStore(path, "r") as store:
  51. with pytest.raises(KeyError, match="'No object named k2 in the file'"):
  52. read_hdf(store, "k2")
  53. # Test that the file is still open after a KeyError and that we can
  54. # still read from it.
  55. read_hdf(store, "k1")
  56. def test_read_column(setup_path):
  57. df = DataFrame(
  58. np.random.default_rng(2).standard_normal((10, 4)),
  59. columns=Index(list("ABCD")),
  60. index=date_range("2000-01-01", periods=10, freq="B"),
  61. )
  62. with ensure_clean_store(setup_path) as store:
  63. _maybe_remove(store, "df")
  64. # GH 17912
  65. # HDFStore.select_column should raise a KeyError
  66. # exception if the key is not a valid store
  67. with pytest.raises(KeyError, match="No object named df in the file"):
  68. store.select_column("df", "index")
  69. store.append("df", df)
  70. # error
  71. with pytest.raises(
  72. KeyError, match=re.escape("'column [foo] not found in the table'")
  73. ):
  74. store.select_column("df", "foo")
  75. msg = re.escape("select_column() got an unexpected keyword argument 'where'")
  76. with pytest.raises(TypeError, match=msg):
  77. store.select_column("df", "index", where=["index>5"])
  78. # valid
  79. result = store.select_column("df", "index")
  80. tm.assert_almost_equal(result.values, Series(df.index).values)
  81. assert isinstance(result, Series)
  82. # not a data indexable column
  83. msg = re.escape(
  84. "column [values_block_0] can not be extracted individually; "
  85. "it is not data indexable"
  86. )
  87. with pytest.raises(ValueError, match=msg):
  88. store.select_column("df", "values_block_0")
  89. # a data column
  90. df2 = df.copy()
  91. df2["string"] = "foo"
  92. store.append("df2", df2, data_columns=["string"])
  93. result = store.select_column("df2", "string")
  94. tm.assert_almost_equal(result.values, df2["string"].values)
  95. # a data column with NaNs, result excludes the NaNs
  96. df3 = df.copy()
  97. df3["string"] = "foo"
  98. df3.loc[df3.index[4:6], "string"] = np.nan
  99. store.append("df3", df3, data_columns=["string"])
  100. result = store.select_column("df3", "string")
  101. tm.assert_almost_equal(result.values, df3["string"].values)
  102. # start/stop
  103. result = store.select_column("df3", "string", start=2)
  104. tm.assert_almost_equal(result.values, df3["string"].values[2:])
  105. result = store.select_column("df3", "string", start=-2)
  106. tm.assert_almost_equal(result.values, df3["string"].values[-2:])
  107. result = store.select_column("df3", "string", stop=2)
  108. tm.assert_almost_equal(result.values, df3["string"].values[:2])
  109. result = store.select_column("df3", "string", stop=-2)
  110. tm.assert_almost_equal(result.values, df3["string"].values[:-2])
  111. result = store.select_column("df3", "string", start=2, stop=-2)
  112. tm.assert_almost_equal(result.values, df3["string"].values[2:-2])
  113. result = store.select_column("df3", "string", start=-2, stop=2)
  114. tm.assert_almost_equal(result.values, df3["string"].values[-2:2])
  115. # GH 10392 - make sure column name is preserved
  116. df4 = DataFrame({"A": np.random.default_rng(2).standard_normal(10), "B": "foo"})
  117. store.append("df4", df4, data_columns=True)
  118. expected = df4["B"]
  119. result = store.select_column("df4", "B")
  120. tm.assert_series_equal(result, expected)
  121. def test_pytables_native_read(datapath):
  122. with ensure_clean_store(
  123. datapath("io", "data", "legacy_hdf/pytables_native.h5"), mode="r"
  124. ) as store:
  125. d2 = store["detector/readout"]
  126. assert isinstance(d2, DataFrame)
  127. @pytest.mark.skipif(is_platform_windows(), reason="native2 read fails oddly on windows")
  128. def test_pytables_native2_read(datapath):
  129. with ensure_clean_store(
  130. datapath("io", "data", "legacy_hdf", "pytables_native2.h5"), mode="r"
  131. ) as store:
  132. str(store)
  133. d1 = store["detector"]
  134. assert isinstance(d1, DataFrame)
  135. def test_legacy_table_fixed_format_read_py2(datapath):
  136. # GH 24510
  137. # legacy table with fixed format written in Python 2
  138. with ensure_clean_store(
  139. datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5"), mode="r"
  140. ) as store:
  141. result = store.select("df")
  142. expected = DataFrame(
  143. [[1, 2, 3, "D"]],
  144. columns=["A", "B", "C", "D"],
  145. index=Index(["ABC"], name="INDEX_NAME"),
  146. )
  147. tm.assert_frame_equal(expected, result)
  148. def test_legacy_table_fixed_format_read_datetime_py2(datapath):
  149. # GH 31750
  150. # legacy table with fixed format and datetime64 column written in Python 2
  151. expected = DataFrame(
  152. [[Timestamp("2020-02-06T18:00")]],
  153. columns=["A"],
  154. index=Index(["date"]),
  155. dtype="M8[ns]",
  156. )
  157. with ensure_clean_store(
  158. datapath("io", "data", "legacy_hdf", "legacy_table_fixed_datetime_py2.h5"),
  159. mode="r",
  160. ) as store:
  161. result = store.select("df")
  162. tm.assert_frame_equal(expected, result)
  163. def test_legacy_table_read_py2(datapath):
  164. # issue: 24925
  165. # legacy table written in Python 2
  166. with ensure_clean_store(
  167. datapath("io", "data", "legacy_hdf", "legacy_table_py2.h5"), mode="r"
  168. ) as store:
  169. result = store.select("table")
  170. expected = DataFrame({"a": ["a", "b"], "b": [2, 3]})
  171. tm.assert_frame_equal(expected, result)
  172. def test_read_hdf_open_store(tmp_path, setup_path, using_infer_string):
  173. # GH10330
  174. # No check for non-string path_or-buf, and no test of open store
  175. df = DataFrame(
  176. np.random.default_rng(2).random((4, 5)),
  177. index=list("abcd"),
  178. columns=list("ABCDE"),
  179. )
  180. df.index.name = "letters"
  181. df = df.set_index(keys="E", append=True)
  182. path = tmp_path / setup_path
  183. if using_infer_string:
  184. # TODO(infer_string) make this work for string dtype
  185. msg = "Saving a MultiIndex with an extension dtype is not supported."
  186. with pytest.raises(NotImplementedError, match=msg):
  187. df.to_hdf(path, key="df", mode="w")
  188. return
  189. df.to_hdf(path, key="df", mode="w")
  190. direct = read_hdf(path, "df")
  191. with HDFStore(path, mode="r") as store:
  192. indirect = read_hdf(store, "df")
  193. tm.assert_frame_equal(direct, indirect)
  194. assert store.is_open
  195. def test_read_hdf_index_not_view(tmp_path, setup_path):
  196. # GH 37441
  197. # Ensure that the index of the DataFrame is not a view
  198. # into the original recarray that pytables reads in
  199. df = DataFrame(
  200. np.random.default_rng(2).random((4, 5)),
  201. index=[0, 1, 2, 3],
  202. columns=list("ABCDE"),
  203. )
  204. path = tmp_path / setup_path
  205. df.to_hdf(path, key="df", mode="w", format="table")
  206. df2 = read_hdf(path, "df")
  207. assert df2.index._data.base is None
  208. tm.assert_frame_equal(df, df2)
  209. def test_read_hdf_iterator(tmp_path, setup_path):
  210. df = DataFrame(
  211. np.random.default_rng(2).random((4, 5)),
  212. index=list("abcd"),
  213. columns=list("ABCDE"),
  214. )
  215. df.index.name = "letters"
  216. df = df.set_index(keys="E", append=True)
  217. path = tmp_path / setup_path
  218. df.to_hdf(path, key="df", mode="w", format="t")
  219. direct = read_hdf(path, "df")
  220. iterator = read_hdf(path, "df", iterator=True)
  221. with closing(iterator.store):
  222. assert isinstance(iterator, TableIterator)
  223. indirect = next(iterator.__iter__())
  224. tm.assert_frame_equal(direct, indirect)
  225. def test_read_nokey(tmp_path, setup_path):
  226. # GH10443
  227. df = DataFrame(
  228. np.random.default_rng(2).random((4, 5)),
  229. index=list("abcd"),
  230. columns=list("ABCDE"),
  231. )
  232. # Categorical dtype not supported for "fixed" format. So no need
  233. # to test with that dtype in the dataframe here.
  234. path = tmp_path / setup_path
  235. df.to_hdf(path, key="df", mode="a")
  236. reread = read_hdf(path)
  237. tm.assert_frame_equal(df, reread)
  238. df.to_hdf(path, key="df2", mode="a")
  239. msg = "key must be provided when HDF5 file contains multiple datasets."
  240. with pytest.raises(ValueError, match=msg):
  241. read_hdf(path)
  242. def test_read_nokey_table(tmp_path, setup_path):
  243. # GH13231
  244. df = DataFrame({"i": range(5), "c": Series(list("abacd"), dtype="category")})
  245. path = tmp_path / setup_path
  246. df.to_hdf(path, key="df", mode="a", format="table")
  247. reread = read_hdf(path)
  248. tm.assert_frame_equal(df, reread)
  249. df.to_hdf(path, key="df2", mode="a", format="table")
  250. msg = "key must be provided when HDF5 file contains multiple datasets."
  251. with pytest.raises(ValueError, match=msg):
  252. read_hdf(path)
  253. def test_read_nokey_empty(tmp_path, setup_path):
  254. path = tmp_path / setup_path
  255. store = HDFStore(path)
  256. store.close()
  257. msg = re.escape(
  258. "Dataset(s) incompatible with Pandas data types, not table, or no "
  259. "datasets found in HDF5 file."
  260. )
  261. with pytest.raises(ValueError, match=msg):
  262. read_hdf(path)
  263. def test_read_from_pathlib_path(tmp_path, setup_path):
  264. # GH11773
  265. expected = DataFrame(
  266. np.random.default_rng(2).random((4, 5)),
  267. index=list("abcd"),
  268. columns=list("ABCDE"),
  269. )
  270. filename = tmp_path / setup_path
  271. path_obj = Path(filename)
  272. expected.to_hdf(path_obj, key="df", mode="a")
  273. actual = read_hdf(path_obj, key="df")
  274. tm.assert_frame_equal(expected, actual)
  275. @td.skip_if_no("py.path")
  276. def test_read_from_py_localpath(tmp_path, setup_path):
  277. # GH11773
  278. from py.path import local as LocalPath
  279. expected = DataFrame(
  280. np.random.default_rng(2).random((4, 5)),
  281. index=list("abcd"),
  282. columns=list("ABCDE"),
  283. )
  284. filename = tmp_path / setup_path
  285. path_obj = LocalPath(filename)
  286. expected.to_hdf(path_obj, key="df", mode="a")
  287. actual = read_hdf(path_obj, key="df")
  288. tm.assert_frame_equal(expected, actual)
  289. @pytest.mark.parametrize("format", ["fixed", "table"])
  290. def test_read_hdf_series_mode_r(tmp_path, format, setup_path):
  291. # GH 16583
  292. # Tests that reading a Series saved to an HDF file
  293. # still works if a mode='r' argument is supplied
  294. series = Series(range(10), dtype=np.float64)
  295. path = tmp_path / setup_path
  296. series.to_hdf(path, key="data", format=format)
  297. result = read_hdf(path, key="data", mode="r")
  298. tm.assert_series_equal(result, series)
  299. @pytest.mark.filterwarnings(r"ignore:Period with BDay freq is deprecated:FutureWarning")
  300. @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
  301. def test_read_py2_hdf_file_in_py3(datapath):
  302. # GH 16781
  303. # tests reading a PeriodIndex DataFrame written in Python2 in Python3
  304. # the file was generated in Python 2.7 like so:
  305. #
  306. # df = DataFrame([1.,2,3], index=pd.PeriodIndex(
  307. # ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B'))
  308. # df.to_hdf('periodindex_0.20.1_x86_64_darwin_2.7.13.h5', 'p')
  309. expected = DataFrame(
  310. [1.0, 2, 3],
  311. index=pd.PeriodIndex(["2015-01-01", "2015-01-02", "2015-01-05"], freq="B"),
  312. )
  313. with ensure_clean_store(
  314. datapath(
  315. "io", "data", "legacy_hdf", "periodindex_0.20.1_x86_64_darwin_2.7.13.h5"
  316. ),
  317. mode="r",
  318. ) as store:
  319. result = store["p"]
  320. tm.assert_frame_equal(result, expected)
  321. def test_read_infer_string(tmp_path, setup_path):
  322. # GH#54431
  323. df = DataFrame({"a": ["a", "b", None]})
  324. path = tmp_path / setup_path
  325. df.to_hdf(path, key="data", format="table")
  326. with pd.option_context("future.infer_string", True):
  327. result = read_hdf(path, key="data", mode="r")
  328. expected = DataFrame(
  329. {"a": ["a", "b", None]},
  330. dtype=pd.StringDtype(na_value=np.nan),
  331. columns=Index(["a"], dtype=pd.StringDtype(na_value=np.nan)),
  332. )
  333. tm.assert_frame_equal(result, expected)