test_put.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419
  1. import re
  2. import numpy as np
  3. import pytest
  4. from pandas._libs.tslibs import Timestamp
  5. import pandas as pd
  6. from pandas import (
  7. DataFrame,
  8. HDFStore,
  9. Index,
  10. MultiIndex,
  11. Series,
  12. _testing as tm,
  13. concat,
  14. date_range,
  15. )
  16. from pandas.tests.io.pytables.common import (
  17. _maybe_remove,
  18. ensure_clean_store,
  19. )
  20. from pandas.util import _test_decorators as td
  # Module-level pytest marker list: applies the `single_cpu` mark to every test here.
  pytestmark = [pytest.mark.single_cpu]
  def test_format_type(tmp_path, setup_path):
      """Each storer reports the format its node was written with."""
      frame = DataFrame({"A": [1, 2]})
      key_formats = (("a", "fixed"), ("b", "table"))

      with HDFStore(tmp_path / setup_path) as store:
          # Write one node per format first, then inspect the storers.
          for key, fmt in key_formats:
              store.put(key, frame, format=fmt)

          for key, fmt in key_formats:
              assert store.get_storer(key).format_type == fmt
  def test_format_kwarg_in_constructor(tmp_path, setup_path):
      """Passing ``format`` to the ``HDFStore`` constructor raises ``ValueError``."""
      # GH 13291
      hdf_path = tmp_path / setup_path
      with pytest.raises(
          ValueError, match="format is not a defined argument for HDFStore"
      ):
          HDFStore(hdf_path, format="table")
  def test_api_default_format(tmp_path, setup_path):
      """Verify that ``io.hdf.default_format`` controls the storage format.

      The option is exercised twice: through an open store (``put`` /
      ``append``) and through ``DataFrame.to_hdf``.  Under ``"fixed"`` the
      written node must not be a table and appending must raise; under
      ``"table"`` both writing and appending must yield table nodes.
      """
      append_msg = "Can only append to Tables"

      def make_frame():
          # 30x4 float frame with string row labels; built fresh for each half.
          return DataFrame(
              1.1 * np.arange(120).reshape((30, 4)),
              columns=Index(list("ABCD")),
              index=Index([f"i-{i}" for i in range(30)]),
          )

      # default_format option, via the store API
      with ensure_clean_store(setup_path) as store:
          df = make_frame()

          with pd.option_context("io.hdf.default_format", "fixed"):
              _maybe_remove(store, "df")
              store.put("df", df)
              assert not store.get_storer("df").is_table

              with pytest.raises(ValueError, match=append_msg):
                  store.append("df2", df)

          with pd.option_context("io.hdf.default_format", "table"):
              _maybe_remove(store, "df")
              store.put("df", df)
              assert store.get_storer("df").is_table

              _maybe_remove(store, "df2")
              store.append("df2", df)
              # Inspect the node that was just appended ("df" is covered above).
              assert store.get_storer("df2").is_table

      # default_format option, via DataFrame.to_hdf
      path = tmp_path / setup_path
      df = make_frame()

      with pd.option_context("io.hdf.default_format", "fixed"):
          df.to_hdf(path, key="df")
          with HDFStore(path) as store:
              assert not store.get_storer("df").is_table
          with pytest.raises(ValueError, match=append_msg):
              df.to_hdf(path, key="df2", append=True)

      with pd.option_context("io.hdf.default_format", "table"):
          df.to_hdf(path, key="df3")
          with HDFStore(path) as store:
              assert store.get_storer("df3").is_table
          df.to_hdf(path, key="df4", append=True)
          with HDFStore(path) as store:
              assert store.get_storer("df4").is_table
  def test_put(setup_path):
      """Exercise ``put``/``__setitem__`` and the append restrictions of ``put``."""
      not_table_msg = "Can only append to Tables"

      with ensure_clean_store(setup_path) as store:
          series = Series(
              np.arange(10, dtype=np.float64),
              index=date_range("2020-01-01", periods=10),
          )
          frame = DataFrame(
              np.random.default_rng(2).standard_normal((20, 4)),
              columns=Index(list("ABCD")),
              index=date_range("2000-01-01", periods=20, freq="B"),
          )
          head = frame[:10]
          tail = frame[10:]

          store["a"] = series
          for key in ("b", "foo/bar/bah", "foo", "/foo"):
              store[key] = head
          store.put("c", head, format="table")

          # not OK, not a table
          with pytest.raises(ValueError, match=not_table_msg):
              store.put("b", tail, append=True)

          # node does not currently exist, test _is_table_type returns False
          # in this case
          _maybe_remove(store, "f")
          with pytest.raises(ValueError, match=not_table_msg):
              store.put("f", tail, append=True)

          # can't put to a table (use append instead)
          with pytest.raises(ValueError, match=not_table_msg):
              store.put("c", tail, append=True)

          # overwrite table
          store.put("c", head, format="table", append=False)
          tm.assert_frame_equal(head, store["c"])
  def test_put_string_index(setup_path):
      """Round-trip a Series and a DataFrame indexed by long string labels."""
      long_labels = [f"I am a very long string index: {i}" for i in range(20)]
      # Second case prepends a label of a different length (mixed length).
      label_sets = (
          long_labels,
          ["abcdefghijklmnopqrstuvwxyz1234567890"] + long_labels,
      )

      with ensure_clean_store(setup_path) as store:
          for labels in label_sets:
              ser = Series(np.arange(len(labels)), index=Index(labels))
              frame = DataFrame({"A": ser, "B": ser})

              store["a"] = ser
              tm.assert_series_equal(store["a"], ser)

              store["b"] = frame
              tm.assert_frame_equal(store["b"], frame)
  def test_put_compression(setup_path):
      """zlib compression works for table format and is rejected for fixed format."""
      with ensure_clean_store(setup_path) as store:
          values = np.random.default_rng(2).standard_normal((10, 4))
          frame = DataFrame(
              values,
              columns=Index(list("ABCD")),
              index=date_range("2000-01-01", periods=10, freq="B"),
          )

          store.put("c", frame, format="table", complib="zlib")
          tm.assert_frame_equal(store["c"], frame)

          # can't compress if format='fixed'
          with pytest.raises(
              ValueError, match="Compression not supported on Fixed format stores"
          ):
              store.put("b", frame, format="fixed", complib="zlib")
  @td.skip_if_windows
  def test_put_compression_blosc(setup_path):
      """blosc compression is rejected for fixed format and works for table format."""
      values = np.random.default_rng(2).standard_normal((10, 4))
      frame = DataFrame(
          values,
          columns=Index(list("ABCD")),
          index=date_range("2000-01-01", periods=10, freq="B"),
      )
      fixed_msg = "Compression not supported on Fixed format stores"

      with ensure_clean_store(setup_path) as store:
          # can't compress if format='fixed'
          with pytest.raises(ValueError, match=fixed_msg):
              store.put("b", frame, format="fixed", complib="blosc")

          store.put("c", frame, format="table", complib="blosc")
          tm.assert_frame_equal(store["c"], frame)
  def test_put_datetime_ser(setup_path):
      """Round-trip a Series of nanosecond-unit timestamps through ``put``/``get``."""
      # https://github.com/pandas-dev/pandas/pull/60663
      stamps = 3 * [Timestamp("20010102").as_unit("ns")]
      ser = Series(stamps)

      with ensure_clean_store(setup_path) as store:
          store.put("ser", ser)
          expected = ser.copy()
          tm.assert_series_equal(store.get("ser"), expected)
  def test_put_mixed_type(setup_path, using_infer_string):
      """Round-trip a frame mixing float, object, bool, int and timestamp columns.

      A ``PerformanceWarning`` is expected from ``put`` unless
      ``using_infer_string`` is set, in which case no warning is expected.
      """
      frame = DataFrame(
          np.random.default_rng(2).standard_normal((10, 4)),
          columns=Index(list("ABCD")),
          index=date_range("2000-01-01", periods=10, freq="B"),
      )

      # Columns are added in a fixed order so the stored layout is unchanged.
      for column, value in (("obj1", "foo"), ("obj2", "bar")):
          frame[column] = value
      frame["bool1"] = frame["A"] > 0
      frame["bool2"] = frame["B"] > 0
      for column, value in (
          ("bool3", True),
          ("int1", 1),
          ("int2", 2),
          ("timestamp1", Timestamp("20010102").as_unit("ns")),
          ("timestamp2", Timestamp("20010103").as_unit("ns")),
          ("datetime1", Timestamp("20010102").as_unit("ns")),
          ("datetime2", Timestamp("20010103").as_unit("ns")),
      ):
          frame[column] = value

      # Punch missing values into one object column, then consolidate blocks.
      missing_rows = frame.index[3:6]
      frame.loc[missing_rows, ["obj1"]] = np.nan
      frame = frame._consolidate()

      if using_infer_string:
          expected_warning = None
      else:
          expected_warning = pd.errors.PerformanceWarning

      with ensure_clean_store(setup_path) as store:
          _maybe_remove(store, "df")

          with tm.assert_produces_warning(expected_warning):
              store.put("df", frame)

          stored = store.get("df")
          tm.assert_frame_equal(stored, frame)
  def test_put_str_frame(setup_path, string_dtype_arguments):
      """Round-trip a single-column string-dtype frame containing a missing value."""
      # https://github.com/pandas-dev/pandas/pull/60663
      dtype = pd.StringDtype(*string_dtype_arguments)
      values = pd.array(["x", pd.NA, "y"], dtype=dtype)
      frame = DataFrame({"a": values})

      with ensure_clean_store(setup_path) as store:
          _maybe_remove(store, "df")
          store.put("df", frame)

          # The dtype expected on read depends on the dtype's NA sentinel.
          if dtype.na_value is np.nan:
              expected_dtype = "str"
          else:
              expected_dtype = "string"
          expected = frame.astype(expected_dtype)

          tm.assert_frame_equal(store.get("df"), expected)
  def test_put_str_series(setup_path, string_dtype_arguments):
      """Round-trip a string-dtype Series containing a missing value.

      The Series is written under the key ``"ser"`` and read back; the dtype
      expected on read is ``"str"`` when the dtype's NA sentinel is ``np.nan``
      and ``"string"`` otherwise.
      """
      # https://github.com/pandas-dev/pandas/pull/60663
      dtype = pd.StringDtype(*string_dtype_arguments)
      ser = Series(["x", pd.NA, "y"], dtype=dtype)
      with ensure_clean_store(setup_path) as store:
          # Clear the key this test actually writes to.
          _maybe_remove(store, "ser")
          store.put("ser", ser)
          expected_dtype = "str" if dtype.na_value is np.nan else "string"
          expected = ser.astype(expected_dtype)
          result = store.get("ser")
          tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize("format", ["table", "fixed"])
  @pytest.mark.parametrize(
      "index",
      [
          Index([str(i) for i in range(10)]),
          Index(np.arange(10, dtype=float)),
          Index(np.arange(10)),
          date_range("2020-01-01", periods=10),
          pd.period_range("2020-01-01", periods=10),
      ],
  )
  def test_store_index_types(setup_path, format, index):
      """Round-trip a two-column frame for each index type and storage format."""
      # GH5386
      # test storing various index types
      with ensure_clean_store(setup_path) as store:
          values = np.random.default_rng(2).standard_normal((10, 2))
          frame = DataFrame(values, columns=list("AB"), index=index)

          _maybe_remove(store, "df")
          store.put("df", frame, format=format)

          stored = store["df"]
          tm.assert_frame_equal(frame, stored)
  def test_column_multiindex(setup_path, using_infer_string):
      """MultiIndex (and named) columns are recreated properly on read.

      When ``using_infer_string`` is set, only the ``NotImplementedError`` raised
      by ``put`` is checked and the rest of the test is skipped.
      """
      # GH 4710
      # recreate multi-indexes properly
      strict = {"check_index_type": True, "check_column_type": True}

      columns = MultiIndex.from_tuples(
          [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")], names=["first", "second"]
      )
      frame = DataFrame(np.arange(12).reshape(3, 4), columns=columns)
      expected = frame.set_axis(frame.index.to_numpy())

      with ensure_clean_store(setup_path) as store:
          if using_infer_string:
              # TODO(infer_string) make this work for string dtype
              with pytest.raises(
                  NotImplementedError,
                  match="Saving a MultiIndex with an extension dtype is not supported.",
              ):
                  store.put("df", frame)
              return

          store.put("df", frame)
          tm.assert_frame_equal(store["df"], expected, **strict)

          store.put("df1", frame, format="table")
          tm.assert_frame_equal(store["df1"], expected, **strict)

          # data_columns cannot be combined with MultiIndex columns
          rejected = (
              (
                  "df2",
                  ["A"],
                  "cannot use a multi-index on axis [1] with data_columns ['A']",
              ),
              (
                  "df3",
                  True,
                  "cannot use a multi-index on axis [1] with data_columns True",
              ),
          )
          for key, data_columns, text in rejected:
              with pytest.raises(ValueError, match=re.escape(text)):
                  store.put(key, frame, format="table", data_columns=data_columns)

      # appending multi-column on existing table (see GH 6167)
      with ensure_clean_store(setup_path) as store:
          for _ in range(2):
              store.append("df2", frame)
          tm.assert_frame_equal(store["df2"], concat((frame, frame)))

      # non_index_axes name
      named_columns = Index(list("ABCD"), name="foo")
      frame = DataFrame(np.arange(12).reshape(3, 4), columns=named_columns)
      expected = frame.set_axis(frame.index.to_numpy())

      with ensure_clean_store(setup_path) as store:
          store.put("df1", frame, format="table")
          tm.assert_frame_equal(store["df1"], expected, **strict)
  def test_store_multiindex(setup_path):
      """Validate how MultiIndex level names are handled when appending."""
      # validate multi-index names
      # GH 5527

      def make_index(names=None):
          # 2 dates x 2 x 3 -> 12 entries, with the requested level names.
          dates = date_range("2013-12-01", "2013-12-02")
          return MultiIndex.from_product([dates, range(2), range(3)], names=names)

      def zeros_frame(names=None):
          return DataFrame(
              np.zeros((12, 2)), columns=["a", "b"], index=make_index(names)
          )

      dup_msg = "duplicate names/columns in the multi-index when storing as a table"

      with ensure_clean_store(setup_path) as store:
          # no names
          _maybe_remove(store, "df")
          unnamed = zeros_frame()
          store.append("df", unnamed)
          tm.assert_frame_equal(store.select("df"), unnamed)

          # partial names
          _maybe_remove(store, "df")
          partial = zeros_frame(["date", None, None])
          store.append("df", partial)
          tm.assert_frame_equal(store.select("df"), partial)

          # series
          _maybe_remove(store, "ser")
          ser = Series(np.zeros(12), index=make_index(["date", None, None]))
          store.append("ser", ser)
          expected_ser = Series(
              np.zeros(12), index=make_index(["date", "level_1", "level_2"])
          )
          tm.assert_series_equal(store.select("ser"), expected_ser)

          # dup with column
          _maybe_remove(store, "df")
          clashes_with_column = zeros_frame(["date", "a", "t"])
          with pytest.raises(ValueError, match=dup_msg):
              store.append("df", clashes_with_column)

          # dup within level
          _maybe_remove(store, "df")
          repeated_level = zeros_frame(["date", "date", "date"])
          with pytest.raises(ValueError, match=dup_msg):
              store.append("df", repeated_level)

          # fully names
          _maybe_remove(store, "df")
          named = zeros_frame(["date", "s", "t"])
          store.append("df", named)
          tm.assert_frame_equal(store.select("df"), named)
  @pytest.mark.parametrize("format", ["fixed", "table"])
  def test_store_periodindex(tmp_path, setup_path, format):
      """Round-trip a monthly-PeriodIndex frame through ``to_hdf``/``read_hdf``."""
      # GH 7796
      # test of PeriodIndex in HDFStore
      values = np.random.default_rng(2).standard_normal((5, 1))
      periods = pd.period_range("20220101", freq="M", periods=5)
      frame = DataFrame(values, index=periods)

      hdf_path = tmp_path / setup_path
      frame.to_hdf(hdf_path, key="df", mode="w", format=format)

      stored = pd.read_hdf(hdf_path, "df")
      tm.assert_frame_equal(frame, stored)