test_append.py 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015
  1. import datetime
  2. from datetime import timedelta
  3. import re
  4. import numpy as np
  5. import pytest
  6. from pandas._libs.tslibs import Timestamp
  7. import pandas.util._test_decorators as td
  8. import pandas as pd
  9. from pandas import (
  10. DataFrame,
  11. Index,
  12. Series,
  13. _testing as tm,
  14. concat,
  15. date_range,
  16. read_hdf,
  17. )
  18. from pandas.tests.io.pytables.common import (
  19. _maybe_remove,
  20. ensure_clean_store,
  21. )
# Module-level pytest marks: every test in this file gets the ``single_cpu`` mark.
pytestmark = [pytest.mark.single_cpu]
# Skip the whole module if PyTables ("tables") cannot be imported.
tables = pytest.importorskip("tables")
  24. @pytest.mark.filterwarnings("ignore::tables.NaturalNameWarning")
  25. def test_append(setup_path):
  26. with ensure_clean_store(setup_path) as store:
  27. # this is allowed by almost always don't want to do it
  28. # tables.NaturalNameWarning):
  29. df = DataFrame(
  30. np.random.default_rng(2).standard_normal((20, 4)),
  31. columns=Index(list("ABCD")),
  32. index=date_range("2000-01-01", periods=20, freq="B"),
  33. )
  34. _maybe_remove(store, "df1")
  35. store.append("df1", df[:10])
  36. store.append("df1", df[10:])
  37. tm.assert_frame_equal(store["df1"], df)
  38. _maybe_remove(store, "df2")
  39. store.put("df2", df[:10], format="table")
  40. store.append("df2", df[10:])
  41. tm.assert_frame_equal(store["df2"], df)
  42. _maybe_remove(store, "df3")
  43. store.append("/df3", df[:10])
  44. store.append("/df3", df[10:])
  45. tm.assert_frame_equal(store["df3"], df)
  46. # this is allowed by almost always don't want to do it
  47. # tables.NaturalNameWarning
  48. _maybe_remove(store, "/df3 foo")
  49. store.append("/df3 foo", df[:10])
  50. store.append("/df3 foo", df[10:])
  51. tm.assert_frame_equal(store["df3 foo"], df)
  52. # dtype issues - mizxed type in a single object column
  53. df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]])
  54. df["mixed_column"] = "testing"
  55. df.loc[2, "mixed_column"] = np.nan
  56. _maybe_remove(store, "df")
  57. store.append("df", df)
  58. tm.assert_frame_equal(store["df"], df)
  59. # uints - test storage of uints
  60. uint_data = DataFrame(
  61. {
  62. "u08": Series(
  63. np.random.default_rng(2).integers(0, high=255, size=5),
  64. dtype=np.uint8,
  65. ),
  66. "u16": Series(
  67. np.random.default_rng(2).integers(0, high=65535, size=5),
  68. dtype=np.uint16,
  69. ),
  70. "u32": Series(
  71. np.random.default_rng(2).integers(0, high=2**30, size=5),
  72. dtype=np.uint32,
  73. ),
  74. "u64": Series(
  75. [2**58, 2**59, 2**60, 2**61, 2**62],
  76. dtype=np.uint64,
  77. ),
  78. },
  79. index=np.arange(5),
  80. )
  81. _maybe_remove(store, "uints")
  82. store.append("uints", uint_data)
  83. tm.assert_frame_equal(store["uints"], uint_data, check_index_type=True)
  84. # uints - test storage of uints in indexable columns
  85. _maybe_remove(store, "uints")
  86. # 64-bit indices not yet supported
  87. store.append("uints", uint_data, data_columns=["u08", "u16", "u32"])
  88. tm.assert_frame_equal(store["uints"], uint_data, check_index_type=True)
  89. def test_append_series(setup_path):
  90. with ensure_clean_store(setup_path) as store:
  91. # basic
  92. ss = Series(range(20), dtype=np.float64, index=[f"i_{i}" for i in range(20)])
  93. ts = Series(
  94. np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
  95. )
  96. ns = Series(np.arange(100))
  97. store.append("ss", ss)
  98. result = store["ss"]
  99. tm.assert_series_equal(result, ss)
  100. assert result.name is None
  101. store.append("ts", ts)
  102. result = store["ts"]
  103. tm.assert_series_equal(result, ts)
  104. assert result.name is None
  105. ns.name = "foo"
  106. store.append("ns", ns)
  107. result = store["ns"]
  108. tm.assert_series_equal(result, ns)
  109. assert result.name == ns.name
  110. # select on the values
  111. expected = ns[ns > 60]
  112. result = store.select("ns", "foo>60")
  113. tm.assert_series_equal(result, expected)
  114. # select on the index and values
  115. expected = ns[(ns > 70) & (ns.index < 90)]
  116. result = store.select("ns", "foo>70 and index<90")
  117. tm.assert_series_equal(result, expected, check_index_type=True)
  118. # multi-index
  119. mi = DataFrame(np.random.default_rng(2).standard_normal((5, 1)), columns=["A"])
  120. mi["B"] = np.arange(len(mi))
  121. mi["C"] = "foo"
  122. mi.loc[3:5, "C"] = "bar"
  123. mi.set_index(["C", "B"], inplace=True)
  124. s = mi.stack(future_stack=True)
  125. s.index = s.index.droplevel(2)
  126. store.append("mi", s)
  127. tm.assert_series_equal(store["mi"], s, check_index_type=True)
  128. def test_append_some_nans(setup_path):
  129. with ensure_clean_store(setup_path) as store:
  130. df = DataFrame(
  131. {
  132. "A": Series(np.random.default_rng(2).standard_normal(20)).astype(
  133. "int32"
  134. ),
  135. "A1": np.random.default_rng(2).standard_normal(20),
  136. "A2": np.random.default_rng(2).standard_normal(20),
  137. "B": "foo",
  138. "C": "bar",
  139. "D": Timestamp("2001-01-01").as_unit("ns"),
  140. "E": Timestamp("2001-01-02").as_unit("ns"),
  141. },
  142. index=np.arange(20),
  143. )
  144. # some nans
  145. _maybe_remove(store, "df1")
  146. df.loc[0:15, ["A1", "B", "D", "E"]] = np.nan
  147. store.append("df1", df[:10])
  148. store.append("df1", df[10:])
  149. tm.assert_frame_equal(store["df1"], df, check_index_type=True)
  150. # first column
  151. df1 = df.copy()
  152. df1["A1"] = np.nan
  153. _maybe_remove(store, "df1")
  154. store.append("df1", df1[:10])
  155. store.append("df1", df1[10:])
  156. tm.assert_frame_equal(store["df1"], df1, check_index_type=True)
  157. # 2nd column
  158. df2 = df.copy()
  159. df2["A2"] = np.nan
  160. _maybe_remove(store, "df2")
  161. store.append("df2", df2[:10])
  162. store.append("df2", df2[10:])
  163. tm.assert_frame_equal(store["df2"], df2, check_index_type=True)
  164. # datetimes
  165. df3 = df.copy()
  166. df3["E"] = np.nan
  167. _maybe_remove(store, "df3")
  168. store.append("df3", df3[:10])
  169. store.append("df3", df3[10:])
  170. tm.assert_frame_equal(store["df3"], df3, check_index_type=True)
  171. def test_append_all_nans(setup_path, using_infer_string):
  172. with ensure_clean_store(setup_path) as store:
  173. df = DataFrame(
  174. {
  175. "A1": np.random.default_rng(2).standard_normal(20),
  176. "A2": np.random.default_rng(2).standard_normal(20),
  177. },
  178. index=np.arange(20),
  179. )
  180. df.loc[0:15, :] = np.nan
  181. # nan some entire rows (dropna=True)
  182. _maybe_remove(store, "df")
  183. store.append("df", df[:10], dropna=True)
  184. store.append("df", df[10:], dropna=True)
  185. tm.assert_frame_equal(store["df"], df[-4:], check_index_type=True)
  186. # nan some entire rows (dropna=False)
  187. _maybe_remove(store, "df2")
  188. store.append("df2", df[:10], dropna=False)
  189. store.append("df2", df[10:], dropna=False)
  190. tm.assert_frame_equal(store["df2"], df, check_index_type=True)
  191. # tests the option io.hdf.dropna_table
  192. with pd.option_context("io.hdf.dropna_table", False):
  193. _maybe_remove(store, "df3")
  194. store.append("df3", df[:10])
  195. store.append("df3", df[10:])
  196. tm.assert_frame_equal(store["df3"], df)
  197. with pd.option_context("io.hdf.dropna_table", True):
  198. _maybe_remove(store, "df4")
  199. store.append("df4", df[:10])
  200. store.append("df4", df[10:])
  201. tm.assert_frame_equal(store["df4"], df[-4:])
  202. # nan some entire rows (string are still written!)
  203. df = DataFrame(
  204. {
  205. "A1": np.random.default_rng(2).standard_normal(20),
  206. "A2": np.random.default_rng(2).standard_normal(20),
  207. "B": "foo",
  208. "C": "bar",
  209. },
  210. index=np.arange(20),
  211. )
  212. df.loc[0:15, :] = np.nan
  213. _maybe_remove(store, "df")
  214. store.append("df", df[:10], dropna=True)
  215. store.append("df", df[10:], dropna=True)
  216. result = store["df"]
  217. expected = df
  218. if using_infer_string:
  219. # TODO: Test is incorrect when not using_infer_string.
  220. # Should take the last 4 rows uncondiationally.
  221. expected = expected[-4:]
  222. tm.assert_frame_equal(result, expected, check_index_type=True)
  223. _maybe_remove(store, "df2")
  224. store.append("df2", df[:10], dropna=False)
  225. store.append("df2", df[10:], dropna=False)
  226. tm.assert_frame_equal(store["df2"], df, check_index_type=True)
  227. # nan some entire rows (but since we have dates they are still
  228. # written!)
  229. df = DataFrame(
  230. {
  231. "A1": np.random.default_rng(2).standard_normal(20),
  232. "A2": np.random.default_rng(2).standard_normal(20),
  233. "B": "foo",
  234. "C": "bar",
  235. "D": Timestamp("2001-01-01").as_unit("ns"),
  236. "E": Timestamp("2001-01-02").as_unit("ns"),
  237. },
  238. index=np.arange(20),
  239. )
  240. df.loc[0:15, :] = np.nan
  241. _maybe_remove(store, "df")
  242. store.append("df", df[:10], dropna=True)
  243. store.append("df", df[10:], dropna=True)
  244. tm.assert_frame_equal(store["df"], df, check_index_type=True)
  245. _maybe_remove(store, "df2")
  246. store.append("df2", df[:10], dropna=False)
  247. store.append("df2", df[10:], dropna=False)
  248. tm.assert_frame_equal(store["df2"], df, check_index_type=True)
  249. def test_append_frame_column_oriented(setup_path):
  250. with ensure_clean_store(setup_path) as store:
  251. # column oriented
  252. df = DataFrame(
  253. np.random.default_rng(2).standard_normal((10, 4)),
  254. columns=Index(list("ABCD")),
  255. index=date_range("2000-01-01", periods=10, freq="B"),
  256. )
  257. df.index = df.index._with_freq(None) # freq doesn't round-trip
  258. _maybe_remove(store, "df1")
  259. store.append("df1", df.iloc[:, :2], axes=["columns"])
  260. store.append("df1", df.iloc[:, 2:])
  261. tm.assert_frame_equal(store["df1"], df)
  262. result = store.select("df1", "columns=A")
  263. expected = df.reindex(columns=["A"])
  264. tm.assert_frame_equal(expected, result)
  265. # selection on the non-indexable
  266. result = store.select("df1", ("columns=A", "index=df.index[0:4]"))
  267. expected = df.reindex(columns=["A"], index=df.index[0:4])
  268. tm.assert_frame_equal(expected, result)
  269. # this isn't supported
  270. msg = re.escape(
  271. "passing a filterable condition to a non-table indexer "
  272. "[Filter: Not Initialized]"
  273. )
  274. with pytest.raises(TypeError, match=msg):
  275. store.select("df1", "columns=A and index>df.index[4]")
  276. def test_append_with_different_block_ordering(setup_path):
  277. # GH 4096; using same frames, but different block orderings
  278. with ensure_clean_store(setup_path) as store:
  279. for i in range(10):
  280. df = DataFrame(
  281. np.random.default_rng(2).standard_normal((10, 2)), columns=list("AB")
  282. )
  283. df["index"] = range(10)
  284. df["index"] += i * 10
  285. df["int64"] = Series([1] * len(df), dtype="int64")
  286. df["int16"] = Series([1] * len(df), dtype="int16")
  287. if i % 2 == 0:
  288. del df["int64"]
  289. df["int64"] = Series([1] * len(df), dtype="int64")
  290. if i % 3 == 0:
  291. a = df.pop("A")
  292. df["A"] = a
  293. df.set_index("index", inplace=True)
  294. store.append("df", df)
  295. # test a different ordering but with more fields (like invalid
  296. # combinations)
  297. with ensure_clean_store(setup_path) as store:
  298. df = DataFrame(
  299. np.random.default_rng(2).standard_normal((10, 2)),
  300. columns=list("AB"),
  301. dtype="float64",
  302. )
  303. df["int64"] = Series([1] * len(df), dtype="int64")
  304. df["int16"] = Series([1] * len(df), dtype="int16")
  305. store.append("df", df)
  306. # store additional fields in different blocks
  307. df["int16_2"] = Series([1] * len(df), dtype="int16")
  308. msg = re.escape(
  309. "cannot match existing table structure for [int16] on appending data"
  310. )
  311. with pytest.raises(ValueError, match=msg):
  312. store.append("df", df)
  313. # store multiple additional fields in different blocks
  314. df["float_3"] = Series([1.0] * len(df), dtype="float64")
  315. msg = re.escape(
  316. "cannot match existing table structure for [A,B] on appending data"
  317. )
  318. with pytest.raises(ValueError, match=msg):
  319. store.append("df", df)
  320. def test_append_with_strings(setup_path):
  321. with ensure_clean_store(setup_path) as store:
  322. def check_col(key, name, size):
  323. assert (
  324. getattr(store.get_storer(key).table.description, name).itemsize == size
  325. )
  326. # avoid truncation on elements
  327. df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
  328. store.append("df_big", df)
  329. tm.assert_frame_equal(store.select("df_big"), df)
  330. check_col("df_big", "values_block_1", 15)
  331. # appending smaller string ok
  332. df2 = DataFrame([[124, "asdqy"], [346, "dggnhefbdfb"]])
  333. store.append("df_big", df2)
  334. expected = concat([df, df2])
  335. tm.assert_frame_equal(store.select("df_big"), expected)
  336. check_col("df_big", "values_block_1", 15)
  337. # avoid truncation on elements
  338. df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
  339. store.append("df_big2", df, min_itemsize={"values": 50})
  340. tm.assert_frame_equal(store.select("df_big2"), df)
  341. check_col("df_big2", "values_block_1", 50)
  342. # bigger string on next append
  343. store.append("df_new", df)
  344. df_new = DataFrame([[124, "abcdefqhij"], [346, "abcdefghijklmnopqrtsuvwxyz"]])
  345. msg = (
  346. r"Trying to store a string with len \[26\] in "
  347. r"\[values_block_1\] column but\n"
  348. r"this column has a limit of \[15\]!\n"
  349. "Consider using min_itemsize to preset the sizes on these "
  350. "columns"
  351. )
  352. with pytest.raises(ValueError, match=msg):
  353. store.append("df_new", df_new)
  354. # min_itemsize on Series index (GH 11412)
  355. df = DataFrame(
  356. {
  357. "A": [0.0, 1.0, 2.0, 3.0, 4.0],
  358. "B": [0.0, 1.0, 0.0, 1.0, 0.0],
  359. "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]),
  360. "D": date_range("20130101", periods=5),
  361. }
  362. ).set_index("C")
  363. store.append("ss", df["B"], min_itemsize={"index": 4})
  364. tm.assert_series_equal(store.select("ss"), df["B"])
  365. # same as above, with data_columns=True
  366. store.append("ss2", df["B"], data_columns=True, min_itemsize={"index": 4})
  367. tm.assert_series_equal(store.select("ss2"), df["B"])
  368. # min_itemsize in index without appending (GH 10381)
  369. store.put("ss3", df, format="table", min_itemsize={"index": 6})
  370. # just make sure there is a longer string:
  371. df2 = df.copy().reset_index().assign(C="longer").set_index("C")
  372. store.append("ss3", df2)
  373. tm.assert_frame_equal(store.select("ss3"), concat([df, df2]))
  374. # same as above, with a Series
  375. store.put("ss4", df["B"], format="table", min_itemsize={"index": 6})
  376. store.append("ss4", df2["B"])
  377. tm.assert_series_equal(store.select("ss4"), concat([df["B"], df2["B"]]))
  378. # with nans
  379. _maybe_remove(store, "df")
  380. df = DataFrame(
  381. np.random.default_rng(2).standard_normal((10, 4)),
  382. columns=Index(list("ABCD")),
  383. index=date_range("2000-01-01", periods=10, freq="B"),
  384. )
  385. df["string"] = "foo"
  386. df.loc[df.index[1:4], "string"] = np.nan
  387. df["string2"] = "bar"
  388. df.loc[df.index[4:8], "string2"] = np.nan
  389. df["string3"] = "bah"
  390. df.loc[df.index[1:], "string3"] = np.nan
  391. store.append("df", df)
  392. result = store.select("df")
  393. tm.assert_frame_equal(result, df)
  394. with ensure_clean_store(setup_path) as store:
  395. df = DataFrame({"A": "foo", "B": "bar"}, index=range(10))
  396. # a min_itemsize that creates a data_column
  397. _maybe_remove(store, "df")
  398. store.append("df", df, min_itemsize={"A": 200})
  399. check_col("df", "A", 200)
  400. assert store.get_storer("df").data_columns == ["A"]
  401. # a min_itemsize that creates a data_column2
  402. _maybe_remove(store, "df")
  403. store.append("df", df, data_columns=["B"], min_itemsize={"A": 200})
  404. check_col("df", "A", 200)
  405. assert store.get_storer("df").data_columns == ["B", "A"]
  406. # a min_itemsize that creates a data_column2
  407. _maybe_remove(store, "df")
  408. store.append("df", df, data_columns=["B"], min_itemsize={"values": 200})
  409. check_col("df", "B", 200)
  410. check_col("df", "values_block_0", 200)
  411. assert store.get_storer("df").data_columns == ["B"]
  412. # infer the .typ on subsequent appends
  413. _maybe_remove(store, "df")
  414. store.append("df", df[:5], min_itemsize=200)
  415. store.append("df", df[5:], min_itemsize=200)
  416. tm.assert_frame_equal(store["df"], df)
  417. # invalid min_itemsize keys
  418. df = DataFrame(["foo", "foo", "foo", "barh", "barh", "barh"], columns=["A"])
  419. _maybe_remove(store, "df")
  420. msg = re.escape(
  421. "min_itemsize has the key [foo] which is not an axis or data_column"
  422. )
  423. with pytest.raises(ValueError, match=msg):
  424. store.append("df", df, min_itemsize={"foo": 20, "foobar": 20})
  425. def test_append_with_empty_string(setup_path):
  426. with ensure_clean_store(setup_path) as store:
  427. # with all empty strings (GH 12242)
  428. df = DataFrame({"x": ["a", "b", "c", "d", "e", "f", ""]})
  429. store.append("df", df[:-1], min_itemsize={"x": 1})
  430. store.append("df", df[-1:], min_itemsize={"x": 1})
  431. tm.assert_frame_equal(store.select("df"), df)
  432. def test_append_with_data_columns(setup_path):
  433. with ensure_clean_store(setup_path) as store:
  434. df = DataFrame(
  435. np.random.default_rng(2).standard_normal((10, 4)),
  436. columns=Index(list("ABCD")),
  437. index=date_range("2000-01-01", periods=10, freq="B"),
  438. )
  439. df.iloc[0, df.columns.get_loc("B")] = 1.0
  440. _maybe_remove(store, "df")
  441. store.append("df", df[:2], data_columns=["B"])
  442. store.append("df", df[2:])
  443. tm.assert_frame_equal(store["df"], df)
  444. # check that we have indices created
  445. assert store._handle.root.df.table.cols.index.is_indexed is True
  446. assert store._handle.root.df.table.cols.B.is_indexed is True
  447. # data column searching
  448. result = store.select("df", "B>0")
  449. expected = df[df.B > 0]
  450. tm.assert_frame_equal(result, expected)
  451. # data column searching (with an indexable and a data_columns)
  452. result = store.select("df", "B>0 and index>df.index[3]")
  453. df_new = df.reindex(index=df.index[4:])
  454. expected = df_new[df_new.B > 0]
  455. tm.assert_frame_equal(result, expected)
  456. # data column selection with a string data_column
  457. df_new = df.copy()
  458. df_new["string"] = "foo"
  459. df_new.loc[df_new.index[1:4], "string"] = np.nan
  460. df_new.loc[df_new.index[5:6], "string"] = "bar"
  461. _maybe_remove(store, "df")
  462. store.append("df", df_new, data_columns=["string"])
  463. result = store.select("df", "string='foo'")
  464. expected = df_new[df_new.string == "foo"]
  465. tm.assert_frame_equal(result, expected)
  466. # using min_itemsize and a data column
  467. def check_col(key, name, size):
  468. assert (
  469. getattr(store.get_storer(key).table.description, name).itemsize == size
  470. )
  471. with ensure_clean_store(setup_path) as store:
  472. _maybe_remove(store, "df")
  473. store.append("df", df_new, data_columns=["string"], min_itemsize={"string": 30})
  474. check_col("df", "string", 30)
  475. _maybe_remove(store, "df")
  476. store.append("df", df_new, data_columns=["string"], min_itemsize=30)
  477. check_col("df", "string", 30)
  478. _maybe_remove(store, "df")
  479. store.append("df", df_new, data_columns=["string"], min_itemsize={"values": 30})
  480. check_col("df", "string", 30)
  481. with ensure_clean_store(setup_path) as store:
  482. df_new["string2"] = "foobarbah"
  483. df_new["string_block1"] = "foobarbah1"
  484. df_new["string_block2"] = "foobarbah2"
  485. _maybe_remove(store, "df")
  486. store.append(
  487. "df",
  488. df_new,
  489. data_columns=["string", "string2"],
  490. min_itemsize={"string": 30, "string2": 40, "values": 50},
  491. )
  492. check_col("df", "string", 30)
  493. check_col("df", "string2", 40)
  494. check_col("df", "values_block_1", 50)
  495. with ensure_clean_store(setup_path) as store:
  496. # multiple data columns
  497. df_new = df.copy()
  498. df_new.iloc[0, df_new.columns.get_loc("A")] = 1.0
  499. df_new.iloc[0, df_new.columns.get_loc("B")] = -1.0
  500. df_new["string"] = "foo"
  501. sl = df_new.columns.get_loc("string")
  502. df_new.iloc[1:4, sl] = np.nan
  503. df_new.iloc[5:6, sl] = "bar"
  504. df_new["string2"] = "foo"
  505. sl = df_new.columns.get_loc("string2")
  506. df_new.iloc[2:5, sl] = np.nan
  507. df_new.iloc[7:8, sl] = "bar"
  508. _maybe_remove(store, "df")
  509. store.append("df", df_new, data_columns=["A", "B", "string", "string2"])
  510. result = store.select("df", "string='foo' and string2='foo' and A>0 and B<0")
  511. expected = df_new[
  512. (df_new.string == "foo")
  513. & (df_new.string2 == "foo")
  514. & (df_new.A > 0)
  515. & (df_new.B < 0)
  516. ]
  517. tm.assert_frame_equal(result, expected, check_freq=False)
  518. # FIXME: 2020-05-07 freq check randomly fails in the CI
  519. # yield an empty frame
  520. result = store.select("df", "string='foo' and string2='cool'")
  521. expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")]
  522. tm.assert_frame_equal(result, expected)
  523. with ensure_clean_store(setup_path) as store:
  524. # doc example
  525. df_dc = df.copy()
  526. df_dc["string"] = "foo"
  527. df_dc.loc[df_dc.index[4:6], "string"] = np.nan
  528. df_dc.loc[df_dc.index[7:9], "string"] = "bar"
  529. df_dc["string2"] = "cool"
  530. df_dc["datetime"] = Timestamp("20010102").as_unit("ns")
  531. df_dc.loc[df_dc.index[3:5], ["A", "B", "datetime"]] = np.nan
  532. _maybe_remove(store, "df_dc")
  533. store.append(
  534. "df_dc", df_dc, data_columns=["B", "C", "string", "string2", "datetime"]
  535. )
  536. result = store.select("df_dc", "B>0")
  537. expected = df_dc[df_dc.B > 0]
  538. tm.assert_frame_equal(result, expected)
  539. result = store.select("df_dc", ["B > 0", "C > 0", "string == foo"])
  540. expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")]
  541. tm.assert_frame_equal(result, expected, check_freq=False)
  542. # FIXME: 2020-12-07 intermittent build failures here with freq of
  543. # None instead of BDay(4)
  544. with ensure_clean_store(setup_path) as store:
  545. # doc example part 2
  546. index = date_range("1/1/2000", periods=8)
  547. df_dc = DataFrame(
  548. np.random.default_rng(2).standard_normal((8, 3)),
  549. index=index,
  550. columns=["A", "B", "C"],
  551. )
  552. df_dc["string"] = "foo"
  553. df_dc.loc[df_dc.index[4:6], "string"] = np.nan
  554. df_dc.loc[df_dc.index[7:9], "string"] = "bar"
  555. df_dc[["B", "C"]] = df_dc[["B", "C"]].abs()
  556. df_dc["string2"] = "cool"
  557. # on-disk operations
  558. store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"])
  559. result = store.select("df_dc", "B>0")
  560. expected = df_dc[df_dc.B > 0]
  561. tm.assert_frame_equal(result, expected)
  562. result = store.select("df_dc", ["B > 0", "C > 0", 'string == "foo"'])
  563. expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")]
  564. tm.assert_frame_equal(result, expected)
  565. def test_append_hierarchical(tmp_path, setup_path, multiindex_dataframe_random_data):
  566. df = multiindex_dataframe_random_data
  567. df.columns.name = None
  568. with ensure_clean_store(setup_path) as store:
  569. store.append("mi", df)
  570. result = store.select("mi")
  571. tm.assert_frame_equal(result, df)
  572. # GH 3748
  573. result = store.select("mi", columns=["A", "B"])
  574. expected = df.reindex(columns=["A", "B"])
  575. tm.assert_frame_equal(result, expected)
  576. path = tmp_path / "test.hdf"
  577. df.to_hdf(path, key="df", format="table")
  578. result = read_hdf(path, "df", columns=["A", "B"])
  579. expected = df.reindex(columns=["A", "B"])
  580. tm.assert_frame_equal(result, expected)
  581. def test_append_misc(setup_path):
  582. with ensure_clean_store(setup_path) as store:
  583. df = DataFrame(
  584. 1.1 * np.arange(120).reshape((30, 4)),
  585. columns=Index(list("ABCD")),
  586. index=Index([f"i-{i}" for i in range(30)]),
  587. )
  588. store.append("df", df, chunksize=1)
  589. result = store.select("df")
  590. tm.assert_frame_equal(result, df)
  591. store.append("df1", df, expectedrows=10)
  592. result = store.select("df1")
  593. tm.assert_frame_equal(result, df)
  594. @pytest.mark.parametrize("chunksize", [10, 200, 1000])
  595. def test_append_misc_chunksize(setup_path, chunksize):
  596. # more chunksize in append tests
  597. df = DataFrame(
  598. 1.1 * np.arange(120).reshape((30, 4)),
  599. columns=Index(list("ABCD")),
  600. index=Index([f"i-{i}" for i in range(30)]),
  601. )
  602. df["string"] = "foo"
  603. df["float322"] = 1.0
  604. df["float322"] = df["float322"].astype("float32")
  605. df["bool"] = df["float322"] > 0
  606. df["time1"] = Timestamp("20130101").as_unit("ns")
  607. df["time2"] = Timestamp("20130102").as_unit("ns")
  608. with ensure_clean_store(setup_path, mode="w") as store:
  609. store.append("obj", df, chunksize=chunksize)
  610. result = store.select("obj")
  611. tm.assert_frame_equal(result, df)
  612. def test_append_misc_empty_frame(setup_path):
  613. # empty frame, GH4273
  614. with ensure_clean_store(setup_path) as store:
  615. # 0 len
  616. df_empty = DataFrame(columns=list("ABC"))
  617. store.append("df", df_empty)
  618. with pytest.raises(KeyError, match="'No object named df in the file'"):
  619. store.select("df")
  620. # repeated append of 0/non-zero frames
  621. df = DataFrame(np.random.default_rng(2).random((10, 3)), columns=list("ABC"))
  622. store.append("df", df)
  623. tm.assert_frame_equal(store.select("df"), df)
  624. store.append("df", df_empty)
  625. tm.assert_frame_equal(store.select("df"), df)
  626. # store
  627. df = DataFrame(columns=list("ABC"))
  628. store.put("df2", df)
  629. tm.assert_frame_equal(store.select("df2"), df)
  630. # TODO(ArrayManager) currently we rely on falling back to BlockManager, but
  631. # the conversion from AM->BM converts the invalid object dtype column into
  632. # a datetime64 column no longer raising an error
  633. @td.skip_array_manager_not_yet_implemented
  634. def test_append_raise(setup_path, using_infer_string):
  635. with ensure_clean_store(setup_path) as store:
  636. # test append with invalid input to get good error messages
  637. # list in column
  638. df = DataFrame(
  639. 1.1 * np.arange(120).reshape((30, 4)),
  640. columns=Index(list("ABCD")),
  641. index=Index([f"i-{i}" for i in range(30)]),
  642. )
  643. df["invalid"] = [["a"]] * len(df)
  644. assert df.dtypes["invalid"] == np.object_
  645. msg = re.escape(
  646. """Cannot serialize the column [invalid]
  647. because its data contents are not [string] but [mixed] object dtype"""
  648. )
  649. with pytest.raises(TypeError, match=msg):
  650. store.append("df", df)
  651. # multiple invalid columns
  652. df["invalid2"] = [["a"]] * len(df)
  653. df["invalid3"] = [["a"]] * len(df)
  654. with pytest.raises(TypeError, match=msg):
  655. store.append("df", df)
  656. # datetime with embedded nans as object
  657. df = DataFrame(
  658. 1.1 * np.arange(120).reshape((30, 4)),
  659. columns=Index(list("ABCD")),
  660. index=Index([f"i-{i}" for i in range(30)]),
  661. )
  662. s = Series(datetime.datetime(2001, 1, 2), index=df.index)
  663. s = s.astype(object)
  664. s[0:5] = np.nan
  665. df["invalid"] = s
  666. assert df.dtypes["invalid"] == np.object_
  667. msg = "too many timezones in this block, create separate data columns"
  668. with pytest.raises(TypeError, match=msg):
  669. store.append("df", df)
  670. # directly ndarray
  671. msg = "value must be None, Series, or DataFrame"
  672. with pytest.raises(TypeError, match=msg):
  673. store.append("df", np.arange(10))
  674. # series directly
  675. msg = re.escape(
  676. "cannot properly create the storer for: "
  677. "[group->df,value-><class 'pandas.core.series.Series'>]"
  678. )
  679. with pytest.raises(TypeError, match=msg):
  680. store.append("df", Series(np.arange(10)))
  681. # appending an incompatible table
  682. df = DataFrame(
  683. 1.1 * np.arange(120).reshape((30, 4)),
  684. columns=Index(list("ABCD")),
  685. index=Index([f"i-{i}" for i in range(30)]),
  686. )
  687. store.append("df", df)
  688. df["foo"] = "foo"
  689. msg = re.escape(
  690. "invalid combination of [non_index_axes] on appending data "
  691. "[(1, ['A', 'B', 'C', 'D', 'foo'])] vs current table "
  692. "[(1, ['A', 'B', 'C', 'D'])]"
  693. )
  694. with pytest.raises(ValueError, match=msg):
  695. store.append("df", df)
  696. # incompatible type (GH 41897)
  697. _maybe_remove(store, "df")
  698. df["foo"] = Timestamp("20130101")
  699. store.append("df", df)
  700. df["foo"] = "bar"
  701. msg = re.escape(
  702. "Cannot serialize the column [foo] "
  703. "because its data contents are not [string] "
  704. "but [datetime64[s]] object dtype"
  705. )
  706. with pytest.raises(ValueError, match=msg):
  707. store.append("df", df)
  708. def test_append_with_timedelta(setup_path):
  709. # GH 3577
  710. # append timedelta
  711. ts = Timestamp("20130101").as_unit("ns")
  712. df = DataFrame(
  713. {
  714. "A": ts,
  715. "B": [ts + timedelta(days=i, seconds=10) for i in range(10)],
  716. }
  717. )
  718. df["C"] = df["A"] - df["B"]
  719. df.loc[3:5, "C"] = np.nan
  720. with ensure_clean_store(setup_path) as store:
  721. # table
  722. _maybe_remove(store, "df")
  723. store.append("df", df, data_columns=True)
  724. result = store.select("df")
  725. tm.assert_frame_equal(result, df)
  726. result = store.select("df", where="C<100000")
  727. tm.assert_frame_equal(result, df)
  728. result = store.select("df", where="C<pd.Timedelta('-3D')")
  729. tm.assert_frame_equal(result, df.iloc[3:])
  730. result = store.select("df", "C<'-3D'")
  731. tm.assert_frame_equal(result, df.iloc[3:])
  732. # a bit hacky here as we don't really deal with the NaT properly
  733. result = store.select("df", "C<'-500000s'")
  734. result = result.dropna(subset=["C"])
  735. tm.assert_frame_equal(result, df.iloc[6:])
  736. result = store.select("df", "C<'-3.5D'")
  737. result = result.iloc[1:]
  738. tm.assert_frame_equal(result, df.iloc[4:])
  739. # fixed
  740. _maybe_remove(store, "df2")
  741. store.put("df2", df)
  742. result = store.select("df2")
  743. tm.assert_frame_equal(result, df)
  744. def test_append_to_multiple(setup_path):
  745. df1 = DataFrame(
  746. np.random.default_rng(2).standard_normal((10, 4)),
  747. columns=Index(list("ABCD")),
  748. index=date_range("2000-01-01", periods=10, freq="B"),
  749. )
  750. df2 = df1.copy().rename(columns="{}_2".format)
  751. df2["foo"] = "bar"
  752. df = concat([df1, df2], axis=1)
  753. with ensure_clean_store(setup_path) as store:
  754. # exceptions
  755. msg = "append_to_multiple requires a selector that is in passed dict"
  756. with pytest.raises(ValueError, match=msg):
  757. store.append_to_multiple(
  758. {"df1": ["A", "B"], "df2": None}, df, selector="df3"
  759. )
  760. with pytest.raises(ValueError, match=msg):
  761. store.append_to_multiple({"df1": None, "df2": None}, df, selector="df3")
  762. msg = (
  763. "append_to_multiple must have a dictionary specified as the way to "
  764. "split the value"
  765. )
  766. with pytest.raises(ValueError, match=msg):
  767. store.append_to_multiple("df1", df, "df1")
  768. # regular operation
  769. store.append_to_multiple({"df1": ["A", "B"], "df2": None}, df, selector="df1")
  770. result = store.select_as_multiple(
  771. ["df1", "df2"], where=["A>0", "B>0"], selector="df1"
  772. )
  773. expected = df[(df.A > 0) & (df.B > 0)]
  774. tm.assert_frame_equal(result, expected)
  775. def test_append_to_multiple_dropna(setup_path):
  776. df1 = DataFrame(
  777. np.random.default_rng(2).standard_normal((10, 4)),
  778. columns=Index(list("ABCD")),
  779. index=date_range("2000-01-01", periods=10, freq="B"),
  780. )
  781. df2 = DataFrame(
  782. np.random.default_rng(2).standard_normal((10, 4)),
  783. columns=Index(list("ABCD")),
  784. index=date_range("2000-01-01", periods=10, freq="B"),
  785. ).rename(columns="{}_2".format)
  786. df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
  787. df = concat([df1, df2], axis=1)
  788. with ensure_clean_store(setup_path) as store:
  789. # dropna=True should guarantee rows are synchronized
  790. store.append_to_multiple(
  791. {"df1": ["A", "B"], "df2": None}, df, selector="df1", dropna=True
  792. )
  793. result = store.select_as_multiple(["df1", "df2"])
  794. expected = df.dropna()
  795. tm.assert_frame_equal(result, expected, check_index_type=True)
  796. tm.assert_index_equal(store.select("df1").index, store.select("df2").index)
  797. def test_append_to_multiple_dropna_false(setup_path):
  798. df1 = DataFrame(
  799. np.random.default_rng(2).standard_normal((10, 4)),
  800. columns=Index(list("ABCD")),
  801. index=date_range("2000-01-01", periods=10, freq="B"),
  802. )
  803. df2 = df1.copy().rename(columns="{}_2".format)
  804. df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
  805. df = concat([df1, df2], axis=1)
  806. with ensure_clean_store(setup_path) as store, pd.option_context(
  807. "io.hdf.dropna_table", True
  808. ):
  809. # dropna=False shouldn't synchronize row indexes
  810. store.append_to_multiple(
  811. {"df1a": ["A", "B"], "df2a": None}, df, selector="df1a", dropna=False
  812. )
  813. msg = "all tables must have exactly the same nrows!"
  814. with pytest.raises(ValueError, match=msg):
  815. store.select_as_multiple(["df1a", "df2a"])
  816. assert not store.select("df1a").index.equals(store.select("df2a").index)
  817. def test_append_to_multiple_min_itemsize(setup_path):
  818. # GH 11238
  819. df = DataFrame(
  820. {
  821. "IX": np.arange(1, 21),
  822. "Num": np.arange(1, 21),
  823. "BigNum": np.arange(1, 21) * 88,
  824. "Str": ["a" for _ in range(20)],
  825. "LongStr": ["abcde" for _ in range(20)],
  826. }
  827. )
  828. expected = df.iloc[[0]]
  829. with ensure_clean_store(setup_path) as store:
  830. store.append_to_multiple(
  831. {
  832. "index": ["IX"],
  833. "nums": ["Num", "BigNum"],
  834. "strs": ["Str", "LongStr"],
  835. },
  836. df.iloc[[0]],
  837. "index",
  838. min_itemsize={"Str": 10, "LongStr": 100, "Num": 2},
  839. )
  840. result = store.select_as_multiple(["index", "nums", "strs"])
  841. tm.assert_frame_equal(result, expected, check_index_type=True)
  842. def test_append_string_nan_rep(setup_path):
  843. # GH 16300
  844. df = DataFrame({"A": "a", "B": "foo"}, index=np.arange(10))
  845. df_nan = df.copy()
  846. df_nan.loc[0:4, :] = np.nan
  847. msg = "NaN representation is too large for existing column size"
  848. with ensure_clean_store(setup_path) as store:
  849. # string column too small
  850. store.append("sa", df["A"])
  851. with pytest.raises(ValueError, match=msg):
  852. store.append("sa", df_nan["A"])
  853. # nan_rep too big
  854. store.append("sb", df["B"], nan_rep="bars")
  855. with pytest.raises(ValueError, match=msg):
  856. store.append("sb", df_nan["B"])
  857. # smaller modified nan_rep
  858. store.append("sc", df["A"], nan_rep="n")
  859. store.append("sc", df_nan["A"])
  860. result = store["sc"]
  861. expected = concat([df["A"], df_nan["A"]])
  862. tm.assert_series_equal(result, expected)