test_select.py 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046
  1. import numpy as np
  2. import pytest
  3. from pandas._libs.tslibs import Timestamp
  4. import pandas as pd
  5. from pandas import (
  6. DataFrame,
  7. HDFStore,
  8. Index,
  9. MultiIndex,
  10. Series,
  11. _testing as tm,
  12. bdate_range,
  13. concat,
  14. date_range,
  15. isna,
  16. read_hdf,
  17. )
  18. from pandas.tests.io.pytables.common import (
  19. _maybe_remove,
  20. ensure_clean_store,
  21. )
  22. from pandas.io.pytables import Term
  23. pytestmark = [pytest.mark.single_cpu]
  24. def test_select_columns_in_where(setup_path):
  25. # GH 6169
  26. # recreate multi-indexes when columns is passed
  27. # in the `where` argument
  28. index = MultiIndex(
  29. levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
  30. codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
  31. names=["foo_name", "bar_name"],
  32. )
  33. # With a DataFrame
  34. df = DataFrame(
  35. np.random.default_rng(2).standard_normal((10, 3)),
  36. index=index,
  37. columns=["A", "B", "C"],
  38. )
  39. with ensure_clean_store(setup_path) as store:
  40. store.put("df", df, format="table")
  41. expected = df[["A"]]
  42. tm.assert_frame_equal(store.select("df", columns=["A"]), expected)
  43. tm.assert_frame_equal(store.select("df", where="columns=['A']"), expected)
  44. # With a Series
  45. s = Series(np.random.default_rng(2).standard_normal(10), index=index, name="A")
  46. with ensure_clean_store(setup_path) as store:
  47. store.put("s", s, format="table")
  48. tm.assert_series_equal(store.select("s", where="columns=['A']"), s)
  49. def test_select_with_dups(setup_path):
  50. # single dtypes
  51. df = DataFrame(
  52. np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "A", "B", "B"]
  53. )
  54. df.index = date_range("20130101 9:30", periods=10, freq="min")
  55. with ensure_clean_store(setup_path) as store:
  56. store.append("df", df)
  57. result = store.select("df")
  58. expected = df
  59. tm.assert_frame_equal(result, expected, by_blocks=True)
  60. result = store.select("df", columns=df.columns)
  61. expected = df
  62. tm.assert_frame_equal(result, expected, by_blocks=True)
  63. result = store.select("df", columns=["A"])
  64. expected = df.loc[:, ["A"]]
  65. tm.assert_frame_equal(result, expected)
  66. # dups across dtypes
  67. df = concat(
  68. [
  69. DataFrame(
  70. np.random.default_rng(2).standard_normal((10, 4)),
  71. columns=["A", "A", "B", "B"],
  72. ),
  73. DataFrame(
  74. np.random.default_rng(2).integers(0, 10, size=20).reshape(10, 2),
  75. columns=["A", "C"],
  76. ),
  77. ],
  78. axis=1,
  79. )
  80. df.index = date_range("20130101 9:30", periods=10, freq="min")
  81. with ensure_clean_store(setup_path) as store:
  82. store.append("df", df)
  83. result = store.select("df")
  84. expected = df
  85. tm.assert_frame_equal(result, expected, by_blocks=True)
  86. result = store.select("df", columns=df.columns)
  87. expected = df
  88. tm.assert_frame_equal(result, expected, by_blocks=True)
  89. expected = df.loc[:, ["A"]]
  90. result = store.select("df", columns=["A"])
  91. tm.assert_frame_equal(result, expected, by_blocks=True)
  92. expected = df.loc[:, ["B", "A"]]
  93. result = store.select("df", columns=["B", "A"])
  94. tm.assert_frame_equal(result, expected, by_blocks=True)
  95. # duplicates on both index and columns
  96. with ensure_clean_store(setup_path) as store:
  97. store.append("df", df)
  98. store.append("df", df)
  99. expected = df.loc[:, ["B", "A"]]
  100. expected = concat([expected, expected])
  101. result = store.select("df", columns=["B", "A"])
  102. tm.assert_frame_equal(result, expected, by_blocks=True)
  103. def test_select(setup_path):
  104. with ensure_clean_store(setup_path) as store:
  105. # select with columns=
  106. df = DataFrame(
  107. np.random.default_rng(2).standard_normal((10, 4)),
  108. columns=Index(list("ABCD")),
  109. index=date_range("2000-01-01", periods=10, freq="B"),
  110. )
  111. _maybe_remove(store, "df")
  112. store.append("df", df)
  113. result = store.select("df", columns=["A", "B"])
  114. expected = df.reindex(columns=["A", "B"])
  115. tm.assert_frame_equal(expected, result)
  116. # equivalently
  117. result = store.select("df", [("columns=['A', 'B']")])
  118. expected = df.reindex(columns=["A", "B"])
  119. tm.assert_frame_equal(expected, result)
  120. # with a data column
  121. _maybe_remove(store, "df")
  122. store.append("df", df, data_columns=["A"])
  123. result = store.select("df", ["A > 0"], columns=["A", "B"])
  124. expected = df[df.A > 0].reindex(columns=["A", "B"])
  125. tm.assert_frame_equal(expected, result)
  126. # all a data columns
  127. _maybe_remove(store, "df")
  128. store.append("df", df, data_columns=True)
  129. result = store.select("df", ["A > 0"], columns=["A", "B"])
  130. expected = df[df.A > 0].reindex(columns=["A", "B"])
  131. tm.assert_frame_equal(expected, result)
  132. # with a data column, but different columns
  133. _maybe_remove(store, "df")
  134. store.append("df", df, data_columns=["A"])
  135. result = store.select("df", ["A > 0"], columns=["C", "D"])
  136. expected = df[df.A > 0].reindex(columns=["C", "D"])
  137. tm.assert_frame_equal(expected, result)
  138. def test_select_dtypes(setup_path):
  139. with ensure_clean_store(setup_path) as store:
  140. # with a Timestamp data column (GH #2637)
  141. df = DataFrame(
  142. {
  143. "ts": bdate_range("2012-01-01", periods=300),
  144. "A": np.random.default_rng(2).standard_normal(300),
  145. }
  146. )
  147. _maybe_remove(store, "df")
  148. store.append("df", df, data_columns=["ts", "A"])
  149. result = store.select("df", "ts>=Timestamp('2012-02-01')")
  150. expected = df[df.ts >= Timestamp("2012-02-01")]
  151. tm.assert_frame_equal(expected, result)
  152. # bool columns (GH #2849)
  153. df = DataFrame(
  154. np.random.default_rng(2).standard_normal((5, 2)), columns=["A", "B"]
  155. )
  156. df["object"] = "foo"
  157. df.loc[4:5, "object"] = "bar"
  158. df["boolv"] = df["A"] > 0
  159. _maybe_remove(store, "df")
  160. store.append("df", df, data_columns=True)
  161. expected = df[df.boolv == True].reindex(columns=["A", "boolv"]) # noqa: E712
  162. for v in [True, "true", 1]:
  163. result = store.select("df", f"boolv == {v}", columns=["A", "boolv"])
  164. tm.assert_frame_equal(expected, result)
  165. expected = df[df.boolv == False].reindex(columns=["A", "boolv"]) # noqa: E712
  166. for v in [False, "false", 0]:
  167. result = store.select("df", f"boolv == {v}", columns=["A", "boolv"])
  168. tm.assert_frame_equal(expected, result)
  169. # integer index
  170. df = DataFrame(
  171. {
  172. "A": np.random.default_rng(2).random(20),
  173. "B": np.random.default_rng(2).random(20),
  174. }
  175. )
  176. _maybe_remove(store, "df_int")
  177. store.append("df_int", df)
  178. result = store.select("df_int", "index<10 and columns=['A']")
  179. expected = df.reindex(index=list(df.index)[0:10], columns=["A"])
  180. tm.assert_frame_equal(expected, result)
  181. # float index
  182. df = DataFrame(
  183. {
  184. "A": np.random.default_rng(2).random(20),
  185. "B": np.random.default_rng(2).random(20),
  186. "index": np.arange(20, dtype="f8"),
  187. }
  188. )
  189. _maybe_remove(store, "df_float")
  190. store.append("df_float", df)
  191. result = store.select("df_float", "index<10.0 and columns=['A']")
  192. expected = df.reindex(index=list(df.index)[0:10], columns=["A"])
  193. tm.assert_frame_equal(expected, result)
  194. with ensure_clean_store(setup_path) as store:
  195. # floats w/o NaN
  196. df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64")
  197. df["cols"] = (df["cols"] + 10).apply(str)
  198. store.append("df1", df, data_columns=True)
  199. result = store.select("df1", where="values>2.0")
  200. expected = df[df["values"] > 2.0]
  201. tm.assert_frame_equal(expected, result)
  202. # floats with NaN
  203. df.iloc[0] = np.nan
  204. expected = df[df["values"] > 2.0]
  205. store.append("df2", df, data_columns=True, index=False)
  206. result = store.select("df2", where="values>2.0")
  207. tm.assert_frame_equal(expected, result)
  208. # https://github.com/PyTables/PyTables/issues/282
  209. # bug in selection when 0th row has a np.nan and an index
  210. # store.append('df3',df,data_columns=True)
  211. # result = store.select(
  212. # 'df3', where='values>2.0')
  213. # tm.assert_frame_equal(expected, result)
  214. # not in first position float with NaN ok too
  215. df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64")
  216. df["cols"] = (df["cols"] + 10).apply(str)
  217. df.iloc[1] = np.nan
  218. expected = df[df["values"] > 2.0]
  219. store.append("df4", df, data_columns=True)
  220. result = store.select("df4", where="values>2.0")
  221. tm.assert_frame_equal(expected, result)
  222. # test selection with comparison against numpy scalar
  223. # GH 11283
  224. with ensure_clean_store(setup_path) as store:
  225. df = DataFrame(
  226. 1.1 * np.arange(120).reshape((30, 4)),
  227. columns=Index(list("ABCD")),
  228. index=Index([f"i-{i}" for i in range(30)]),
  229. )
  230. expected = df[df["A"] > 0]
  231. store.append("df", df, data_columns=True)
  232. np_zero = np.float64(0) # noqa: F841
  233. result = store.select("df", where=["A>np_zero"])
  234. tm.assert_frame_equal(expected, result)
  235. def test_select_with_many_inputs(setup_path):
  236. with ensure_clean_store(setup_path) as store:
  237. df = DataFrame(
  238. {
  239. "ts": bdate_range("2012-01-01", periods=300),
  240. "A": np.random.default_rng(2).standard_normal(300),
  241. "B": range(300),
  242. "users": ["a"] * 50
  243. + ["b"] * 50
  244. + ["c"] * 100
  245. + [f"a{i:03d}" for i in range(100)],
  246. }
  247. )
  248. _maybe_remove(store, "df")
  249. store.append("df", df, data_columns=["ts", "A", "B", "users"])
  250. # regular select
  251. result = store.select("df", "ts>=Timestamp('2012-02-01')")
  252. expected = df[df.ts >= Timestamp("2012-02-01")]
  253. tm.assert_frame_equal(expected, result)
  254. # small selector
  255. result = store.select("df", "ts>=Timestamp('2012-02-01') & users=['a','b','c']")
  256. expected = df[
  257. (df.ts >= Timestamp("2012-02-01")) & df.users.isin(["a", "b", "c"])
  258. ]
  259. tm.assert_frame_equal(expected, result)
  260. # big selector along the columns
  261. selector = ["a", "b", "c"] + [f"a{i:03d}" for i in range(60)]
  262. result = store.select("df", "ts>=Timestamp('2012-02-01') and users=selector")
  263. expected = df[(df.ts >= Timestamp("2012-02-01")) & df.users.isin(selector)]
  264. tm.assert_frame_equal(expected, result)
  265. selector = range(100, 200)
  266. result = store.select("df", "B=selector")
  267. expected = df[df.B.isin(selector)]
  268. tm.assert_frame_equal(expected, result)
  269. assert len(result) == 100
  270. # big selector along the index
  271. selector = Index(df.ts[0:100].values)
  272. result = store.select("df", "ts=selector")
  273. expected = df[df.ts.isin(selector.values)]
  274. tm.assert_frame_equal(expected, result)
  275. assert len(result) == 100
  276. def test_select_iterator(tmp_path, setup_path):
  277. # single table
  278. with ensure_clean_store(setup_path) as store:
  279. df = DataFrame(
  280. np.random.default_rng(2).standard_normal((10, 4)),
  281. columns=Index(list("ABCD")),
  282. index=date_range("2000-01-01", periods=10, freq="B"),
  283. )
  284. _maybe_remove(store, "df")
  285. store.append("df", df)
  286. expected = store.select("df")
  287. results = list(store.select("df", iterator=True))
  288. result = concat(results)
  289. tm.assert_frame_equal(expected, result)
  290. results = list(store.select("df", chunksize=2))
  291. assert len(results) == 5
  292. result = concat(results)
  293. tm.assert_frame_equal(expected, result)
  294. results = list(store.select("df", chunksize=2))
  295. result = concat(results)
  296. tm.assert_frame_equal(result, expected)
  297. path = tmp_path / setup_path
  298. df = DataFrame(
  299. np.random.default_rng(2).standard_normal((10, 4)),
  300. columns=Index(list("ABCD")),
  301. index=date_range("2000-01-01", periods=10, freq="B"),
  302. )
  303. df.to_hdf(path, key="df_non_table")
  304. msg = "can only use an iterator or chunksize on a table"
  305. with pytest.raises(TypeError, match=msg):
  306. read_hdf(path, "df_non_table", chunksize=2)
  307. with pytest.raises(TypeError, match=msg):
  308. read_hdf(path, "df_non_table", iterator=True)
  309. path = tmp_path / setup_path
  310. df = DataFrame(
  311. np.random.default_rng(2).standard_normal((10, 4)),
  312. columns=Index(list("ABCD")),
  313. index=date_range("2000-01-01", periods=10, freq="B"),
  314. )
  315. df.to_hdf(path, key="df", format="table")
  316. results = list(read_hdf(path, "df", chunksize=2))
  317. result = concat(results)
  318. assert len(results) == 5
  319. tm.assert_frame_equal(result, df)
  320. tm.assert_frame_equal(result, read_hdf(path, "df"))
  321. # multiple
  322. with ensure_clean_store(setup_path) as store:
  323. df1 = DataFrame(
  324. np.random.default_rng(2).standard_normal((10, 4)),
  325. columns=Index(list("ABCD")),
  326. index=date_range("2000-01-01", periods=10, freq="B"),
  327. )
  328. store.append("df1", df1, data_columns=True)
  329. df2 = df1.copy().rename(columns="{}_2".format)
  330. df2["foo"] = "bar"
  331. store.append("df2", df2)
  332. df = concat([df1, df2], axis=1)
  333. # full selection
  334. expected = store.select_as_multiple(["df1", "df2"], selector="df1")
  335. results = list(
  336. store.select_as_multiple(["df1", "df2"], selector="df1", chunksize=2)
  337. )
  338. result = concat(results)
  339. tm.assert_frame_equal(expected, result)
  340. def test_select_iterator_complete_8014(setup_path):
  341. # GH 8014
  342. # using iterator and where clause
  343. chunksize = 1e4
  344. # no iterator
  345. with ensure_clean_store(setup_path) as store:
  346. expected = DataFrame(
  347. np.random.default_rng(2).standard_normal((100064, 4)),
  348. columns=Index(list("ABCD")),
  349. index=date_range("2000-01-01", periods=100064, freq="s"),
  350. )
  351. _maybe_remove(store, "df")
  352. store.append("df", expected)
  353. beg_dt = expected.index[0]
  354. end_dt = expected.index[-1]
  355. # select w/o iteration and no where clause works
  356. result = store.select("df")
  357. tm.assert_frame_equal(expected, result)
  358. # select w/o iterator and where clause, single term, begin
  359. # of range, works
  360. where = f"index >= '{beg_dt}'"
  361. result = store.select("df", where=where)
  362. tm.assert_frame_equal(expected, result)
  363. # select w/o iterator and where clause, single term, end
  364. # of range, works
  365. where = f"index <= '{end_dt}'"
  366. result = store.select("df", where=where)
  367. tm.assert_frame_equal(expected, result)
  368. # select w/o iterator and where clause, inclusive range,
  369. # works
  370. where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
  371. result = store.select("df", where=where)
  372. tm.assert_frame_equal(expected, result)
  373. # with iterator, full range
  374. with ensure_clean_store(setup_path) as store:
  375. expected = DataFrame(
  376. np.random.default_rng(2).standard_normal((100064, 4)),
  377. columns=Index(list("ABCD")),
  378. index=date_range("2000-01-01", periods=100064, freq="s"),
  379. )
  380. _maybe_remove(store, "df")
  381. store.append("df", expected)
  382. beg_dt = expected.index[0]
  383. end_dt = expected.index[-1]
  384. # select w/iterator and no where clause works
  385. results = list(store.select("df", chunksize=chunksize))
  386. result = concat(results)
  387. tm.assert_frame_equal(expected, result)
  388. # select w/iterator and where clause, single term, begin of range
  389. where = f"index >= '{beg_dt}'"
  390. results = list(store.select("df", where=where, chunksize=chunksize))
  391. result = concat(results)
  392. tm.assert_frame_equal(expected, result)
  393. # select w/iterator and where clause, single term, end of range
  394. where = f"index <= '{end_dt}'"
  395. results = list(store.select("df", where=where, chunksize=chunksize))
  396. result = concat(results)
  397. tm.assert_frame_equal(expected, result)
  398. # select w/iterator and where clause, inclusive range
  399. where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
  400. results = list(store.select("df", where=where, chunksize=chunksize))
  401. result = concat(results)
  402. tm.assert_frame_equal(expected, result)
  403. def test_select_iterator_non_complete_8014(setup_path):
  404. # GH 8014
  405. # using iterator and where clause
  406. chunksize = 1e4
  407. # with iterator, non complete range
  408. with ensure_clean_store(setup_path) as store:
  409. expected = DataFrame(
  410. np.random.default_rng(2).standard_normal((100064, 4)),
  411. columns=Index(list("ABCD")),
  412. index=date_range("2000-01-01", periods=100064, freq="s"),
  413. )
  414. _maybe_remove(store, "df")
  415. store.append("df", expected)
  416. beg_dt = expected.index[1]
  417. end_dt = expected.index[-2]
  418. # select w/iterator and where clause, single term, begin of range
  419. where = f"index >= '{beg_dt}'"
  420. results = list(store.select("df", where=where, chunksize=chunksize))
  421. result = concat(results)
  422. rexpected = expected[expected.index >= beg_dt]
  423. tm.assert_frame_equal(rexpected, result)
  424. # select w/iterator and where clause, single term, end of range
  425. where = f"index <= '{end_dt}'"
  426. results = list(store.select("df", where=where, chunksize=chunksize))
  427. result = concat(results)
  428. rexpected = expected[expected.index <= end_dt]
  429. tm.assert_frame_equal(rexpected, result)
  430. # select w/iterator and where clause, inclusive range
  431. where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
  432. results = list(store.select("df", where=where, chunksize=chunksize))
  433. result = concat(results)
  434. rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)]
  435. tm.assert_frame_equal(rexpected, result)
  436. # with iterator, empty where
  437. with ensure_clean_store(setup_path) as store:
  438. expected = DataFrame(
  439. np.random.default_rng(2).standard_normal((100064, 4)),
  440. columns=Index(list("ABCD")),
  441. index=date_range("2000-01-01", periods=100064, freq="s"),
  442. )
  443. _maybe_remove(store, "df")
  444. store.append("df", expected)
  445. end_dt = expected.index[-1]
  446. # select w/iterator and where clause, single term, begin of range
  447. where = f"index > '{end_dt}'"
  448. results = list(store.select("df", where=where, chunksize=chunksize))
  449. assert 0 == len(results)
  450. def test_select_iterator_many_empty_frames(setup_path):
  451. # GH 8014
  452. # using iterator and where clause can return many empty
  453. # frames.
  454. chunksize = 10_000
  455. # with iterator, range limited to the first chunk
  456. with ensure_clean_store(setup_path) as store:
  457. expected = DataFrame(
  458. np.random.default_rng(2).standard_normal((100064, 4)),
  459. columns=Index(list("ABCD")),
  460. index=date_range("2000-01-01", periods=100064, freq="s"),
  461. )
  462. _maybe_remove(store, "df")
  463. store.append("df", expected)
  464. beg_dt = expected.index[0]
  465. end_dt = expected.index[chunksize - 1]
  466. # select w/iterator and where clause, single term, begin of range
  467. where = f"index >= '{beg_dt}'"
  468. results = list(store.select("df", where=where, chunksize=chunksize))
  469. result = concat(results)
  470. rexpected = expected[expected.index >= beg_dt]
  471. tm.assert_frame_equal(rexpected, result)
  472. # select w/iterator and where clause, single term, end of range
  473. where = f"index <= '{end_dt}'"
  474. results = list(store.select("df", where=where, chunksize=chunksize))
  475. assert len(results) == 1
  476. result = concat(results)
  477. rexpected = expected[expected.index <= end_dt]
  478. tm.assert_frame_equal(rexpected, result)
  479. # select w/iterator and where clause, inclusive range
  480. where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
  481. results = list(store.select("df", where=where, chunksize=chunksize))
  482. # should be 1, is 10
  483. assert len(results) == 1
  484. result = concat(results)
  485. rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)]
  486. tm.assert_frame_equal(rexpected, result)
  487. # select w/iterator and where clause which selects
  488. # *nothing*.
  489. #
  490. # To be consistent with Python idiom I suggest this should
  491. # return [] e.g. `for e in []: print True` never prints
  492. # True.
  493. where = f"index <= '{beg_dt}' & index >= '{end_dt}'"
  494. results = list(store.select("df", where=where, chunksize=chunksize))
  495. # should be []
  496. assert len(results) == 0
  497. def test_frame_select(setup_path):
  498. df = DataFrame(
  499. np.random.default_rng(2).standard_normal((10, 4)),
  500. columns=Index(list("ABCD")),
  501. index=date_range("2000-01-01", periods=10, freq="B"),
  502. )
  503. with ensure_clean_store(setup_path) as store:
  504. store.put("frame", df, format="table")
  505. date = df.index[len(df) // 2]
  506. crit1 = Term("index>=date")
  507. assert crit1.env.scope["date"] == date
  508. crit2 = "columns=['A', 'D']"
  509. crit3 = "columns=A"
  510. result = store.select("frame", [crit1, crit2])
  511. expected = df.loc[date:, ["A", "D"]]
  512. tm.assert_frame_equal(result, expected)
  513. result = store.select("frame", [crit3])
  514. expected = df.loc[:, ["A"]]
  515. tm.assert_frame_equal(result, expected)
  516. # invalid terms
  517. df = DataFrame(
  518. np.random.default_rng(2).standard_normal((10, 4)),
  519. columns=Index(list("ABCD")),
  520. index=date_range("2000-01-01", periods=10, freq="B"),
  521. )
  522. store.append("df_time", df)
  523. msg = "day is out of range for month: 0"
  524. with pytest.raises(ValueError, match=msg):
  525. store.select("df_time", "index>0")
  526. # can't select if not written as table
  527. # store['frame'] = df
  528. # with pytest.raises(ValueError):
  529. # store.select('frame', [crit1, crit2])
  530. def test_frame_select_complex(setup_path):
  531. # select via complex criteria
  532. df = DataFrame(
  533. np.random.default_rng(2).standard_normal((10, 4)),
  534. columns=Index(list("ABCD")),
  535. index=date_range("2000-01-01", periods=10, freq="B"),
  536. )
  537. df["string"] = "foo"
  538. df.loc[df.index[0:4], "string"] = "bar"
  539. with ensure_clean_store(setup_path) as store:
  540. store.put("df", df, format="table", data_columns=["string"])
  541. # empty
  542. result = store.select("df", 'index>df.index[3] & string="bar"')
  543. expected = df.loc[(df.index > df.index[3]) & (df.string == "bar")]
  544. tm.assert_frame_equal(result, expected)
  545. result = store.select("df", 'index>df.index[3] & string="foo"')
  546. expected = df.loc[(df.index > df.index[3]) & (df.string == "foo")]
  547. tm.assert_frame_equal(result, expected)
  548. # or
  549. result = store.select("df", 'index>df.index[3] | string="bar"')
  550. expected = df.loc[(df.index > df.index[3]) | (df.string == "bar")]
  551. tm.assert_frame_equal(result, expected)
  552. result = store.select(
  553. "df", '(index>df.index[3] & index<=df.index[6]) | string="bar"'
  554. )
  555. expected = df.loc[
  556. ((df.index > df.index[3]) & (df.index <= df.index[6]))
  557. | (df.string == "bar")
  558. ]
  559. tm.assert_frame_equal(result, expected)
  560. # invert
  561. result = store.select("df", 'string!="bar"')
  562. expected = df.loc[df.string != "bar"]
  563. tm.assert_frame_equal(result, expected)
  564. # invert not implemented in numexpr :(
  565. msg = "cannot use an invert condition when passing to numexpr"
  566. with pytest.raises(NotImplementedError, match=msg):
  567. store.select("df", '~(string="bar")')
  568. # invert ok for filters
  569. result = store.select("df", "~(columns=['A','B'])")
  570. expected = df.loc[:, df.columns.difference(["A", "B"])]
  571. tm.assert_frame_equal(result, expected)
  572. # in
  573. result = store.select("df", "index>df.index[3] & columns in ['A','B']")
  574. expected = df.loc[df.index > df.index[3]].reindex(columns=["A", "B"])
  575. tm.assert_frame_equal(result, expected)
  576. def test_frame_select_complex2(tmp_path):
  577. pp = tmp_path / "params.hdf"
  578. hh = tmp_path / "hist.hdf"
  579. # use non-trivial selection criteria
  580. params = DataFrame({"A": [1, 1, 2, 2, 3]})
  581. params.to_hdf(pp, key="df", mode="w", format="table", data_columns=["A"])
  582. selection = read_hdf(pp, "df", where="A=[2,3]")
  583. hist = DataFrame(
  584. np.random.default_rng(2).standard_normal((25, 1)),
  585. columns=["data"],
  586. index=MultiIndex.from_tuples(
  587. [(i, j) for i in range(5) for j in range(5)], names=["l1", "l2"]
  588. ),
  589. )
  590. hist.to_hdf(hh, key="df", mode="w", format="table")
  591. expected = read_hdf(hh, "df", where="l1=[2, 3, 4]")
  592. # scope with list like
  593. l0 = selection.index.tolist() # noqa: F841
  594. with HDFStore(hh) as store:
  595. result = store.select("df", where="l1=l0")
  596. tm.assert_frame_equal(result, expected)
  597. result = read_hdf(hh, "df", where="l1=l0")
  598. tm.assert_frame_equal(result, expected)
  599. # index
  600. index = selection.index # noqa: F841
  601. result = read_hdf(hh, "df", where="l1=index")
  602. tm.assert_frame_equal(result, expected)
  603. result = read_hdf(hh, "df", where="l1=selection.index")
  604. tm.assert_frame_equal(result, expected)
  605. result = read_hdf(hh, "df", where="l1=selection.index.tolist()")
  606. tm.assert_frame_equal(result, expected)
  607. result = read_hdf(hh, "df", where="l1=list(selection.index)")
  608. tm.assert_frame_equal(result, expected)
  609. # scope with index
  610. with HDFStore(hh) as store:
  611. result = store.select("df", where="l1=index")
  612. tm.assert_frame_equal(result, expected)
  613. result = store.select("df", where="l1=selection.index")
  614. tm.assert_frame_equal(result, expected)
  615. result = store.select("df", where="l1=selection.index.tolist()")
  616. tm.assert_frame_equal(result, expected)
  617. result = store.select("df", where="l1=list(selection.index)")
  618. tm.assert_frame_equal(result, expected)
  619. def test_invalid_filtering(setup_path):
  620. # can't use more than one filter (atm)
  621. df = DataFrame(
  622. np.random.default_rng(2).standard_normal((10, 4)),
  623. columns=Index(list("ABCD")),
  624. index=date_range("2000-01-01", periods=10, freq="B"),
  625. )
  626. with ensure_clean_store(setup_path) as store:
  627. store.put("df", df, format="table")
  628. msg = "unable to collapse Joint Filters"
  629. # not implemented
  630. with pytest.raises(NotImplementedError, match=msg):
  631. store.select("df", "columns=['A'] | columns=['B']")
  632. # in theory we could deal with this
  633. with pytest.raises(NotImplementedError, match=msg):
  634. store.select("df", "columns=['A','B'] & columns=['C']")
  635. def test_string_select(setup_path):
  636. # GH 2973
  637. with ensure_clean_store(setup_path) as store:
  638. df = DataFrame(
  639. np.random.default_rng(2).standard_normal((10, 4)),
  640. columns=Index(list("ABCD")),
  641. index=date_range("2000-01-01", periods=10, freq="B"),
  642. )
  643. # test string ==/!=
  644. df["x"] = "none"
  645. df.loc[df.index[2:7], "x"] = ""
  646. store.append("df", df, data_columns=["x"])
  647. result = store.select("df", "x=none")
  648. expected = df[df.x == "none"]
  649. tm.assert_frame_equal(result, expected)
  650. result = store.select("df", "x!=none")
  651. expected = df[df.x != "none"]
  652. tm.assert_frame_equal(result, expected)
  653. df2 = df.copy()
  654. df2.loc[df2.x == "", "x"] = np.nan
  655. store.append("df2", df2, data_columns=["x"])
  656. result = store.select("df2", "x!=none")
  657. expected = df2[isna(df2.x)]
  658. tm.assert_frame_equal(result, expected)
  659. # int ==/!=
  660. df["int"] = 1
  661. df.loc[df.index[2:7], "int"] = 2
  662. store.append("df3", df, data_columns=["int"])
  663. result = store.select("df3", "int=2")
  664. expected = df[df.int == 2]
  665. tm.assert_frame_equal(result, expected)
  666. result = store.select("df3", "int!=2")
  667. expected = df[df.int != 2]
  668. tm.assert_frame_equal(result, expected)
  669. def test_select_as_multiple(setup_path):
  670. df1 = DataFrame(
  671. np.random.default_rng(2).standard_normal((10, 4)),
  672. columns=Index(list("ABCD")),
  673. index=date_range("2000-01-01", periods=10, freq="B"),
  674. )
  675. df2 = df1.copy().rename(columns="{}_2".format)
  676. df2["foo"] = "bar"
  677. with ensure_clean_store(setup_path) as store:
  678. msg = "keys must be a list/tuple"
  679. # no tables stored
  680. with pytest.raises(TypeError, match=msg):
  681. store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1")
  682. store.append("df1", df1, data_columns=["A", "B"])
  683. store.append("df2", df2)
  684. # exceptions
  685. with pytest.raises(TypeError, match=msg):
  686. store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1")
  687. with pytest.raises(TypeError, match=msg):
  688. store.select_as_multiple([None], where=["A>0", "B>0"], selector="df1")
  689. msg = "'No object named df3 in the file'"
  690. with pytest.raises(KeyError, match=msg):
  691. store.select_as_multiple(
  692. ["df1", "df3"], where=["A>0", "B>0"], selector="df1"
  693. )
  694. with pytest.raises(KeyError, match=msg):
  695. store.select_as_multiple(["df3"], where=["A>0", "B>0"], selector="df1")
  696. with pytest.raises(KeyError, match="'No object named df4 in the file'"):
  697. store.select_as_multiple(
  698. ["df1", "df2"], where=["A>0", "B>0"], selector="df4"
  699. )
  700. # default select
  701. result = store.select("df1", ["A>0", "B>0"])
  702. expected = store.select_as_multiple(
  703. ["df1"], where=["A>0", "B>0"], selector="df1"
  704. )
  705. tm.assert_frame_equal(result, expected)
  706. expected = store.select_as_multiple("df1", where=["A>0", "B>0"], selector="df1")
  707. tm.assert_frame_equal(result, expected)
  708. # multiple
  709. result = store.select_as_multiple(
  710. ["df1", "df2"], where=["A>0", "B>0"], selector="df1"
  711. )
  712. expected = concat([df1, df2], axis=1)
  713. expected = expected[(expected.A > 0) & (expected.B > 0)]
  714. tm.assert_frame_equal(result, expected, check_freq=False)
  715. # FIXME: 2021-01-20 this is failing with freq None vs 4B on some builds
  716. # multiple (diff selector)
  717. result = store.select_as_multiple(
  718. ["df1", "df2"], where="index>df2.index[4]", selector="df2"
  719. )
  720. expected = concat([df1, df2], axis=1)
  721. expected = expected[5:]
  722. tm.assert_frame_equal(result, expected)
  723. # test exception for diff rows
  724. df3 = df1.copy().head(2)
  725. store.append("df3", df3)
  726. msg = "all tables must have exactly the same nrows!"
  727. with pytest.raises(ValueError, match=msg):
  728. store.select_as_multiple(
  729. ["df1", "df3"], where=["A>0", "B>0"], selector="df1"
  730. )
  731. def test_nan_selection_bug_4858(setup_path):
  732. with ensure_clean_store(setup_path) as store:
  733. df = DataFrame({"cols": range(6), "values": range(6)}, dtype="float64")
  734. df["cols"] = (df["cols"] + 10).apply(str)
  735. df.iloc[0] = np.nan
  736. expected = DataFrame(
  737. {"cols": ["13.0", "14.0", "15.0"], "values": [3.0, 4.0, 5.0]},
  738. index=[3, 4, 5],
  739. )
  740. # write w/o the index on that particular column
  741. store.append("df", df, data_columns=True, index=["cols"])
  742. result = store.select("df", where="values>2.0")
  743. tm.assert_frame_equal(result, expected)
  744. def test_query_with_nested_special_character(setup_path):
  745. df = DataFrame(
  746. {
  747. "a": ["a", "a", "c", "b", "test & test", "c", "b", "e"],
  748. "b": [1, 2, 3, 4, 5, 6, 7, 8],
  749. }
  750. )
  751. expected = df[df.a == "test & test"]
  752. with ensure_clean_store(setup_path) as store:
  753. store.append("test", df, format="table", data_columns=True)
  754. result = store.select("test", 'a = "test & test"')
  755. tm.assert_frame_equal(expected, result)
  756. def test_query_long_float_literal(setup_path):
  757. # GH 14241
  758. df = DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]})
  759. with ensure_clean_store(setup_path) as store:
  760. store.append("test", df, format="table", data_columns=True)
  761. cutoff = 1000000000.0006
  762. result = store.select("test", f"A < {cutoff:.4f}")
  763. assert result.empty
  764. cutoff = 1000000000.0010
  765. result = store.select("test", f"A > {cutoff:.4f}")
  766. expected = df.loc[[1, 2], :]
  767. tm.assert_frame_equal(expected, result)
  768. exact = 1000000000.0011
  769. result = store.select("test", f"A == {exact:.4f}")
  770. expected = df.loc[[1], :]
  771. tm.assert_frame_equal(expected, result)
  772. def test_query_compare_column_type(setup_path):
  773. # GH 15492
  774. df = DataFrame(
  775. {
  776. "date": ["2014-01-01", "2014-01-02"],
  777. "real_date": date_range("2014-01-01", periods=2),
  778. "float": [1.1, 1.2],
  779. "int": [1, 2],
  780. },
  781. columns=["date", "real_date", "float", "int"],
  782. )
  783. with ensure_clean_store(setup_path) as store:
  784. store.append("test", df, format="table", data_columns=True)
  785. ts = Timestamp("2014-01-01") # noqa: F841
  786. result = store.select("test", where="real_date > ts")
  787. expected = df.loc[[1], :]
  788. tm.assert_frame_equal(expected, result)
  789. for op in ["<", ">", "=="]:
  790. # non strings to string column always fail
  791. for v in [2.1, True, Timestamp("2014-01-01"), pd.Timedelta(1, "s")]:
  792. query = f"date {op} v"
  793. msg = f"Cannot compare {v} of type {type(v)} to string column"
  794. with pytest.raises(TypeError, match=msg):
  795. store.select("test", where=query)
  796. # strings to other columns must be convertible to type
  797. v = "a"
  798. for col in ["int", "float", "real_date"]:
  799. query = f"{col} {op} v"
  800. if col == "real_date":
  801. msg = 'Given date string "a" not likely a datetime'
  802. else:
  803. msg = "could not convert string to"
  804. with pytest.raises(ValueError, match=msg):
  805. store.select("test", where=query)
  806. for v, col in zip(
  807. ["1", "1.1", "2014-01-01"], ["int", "float", "real_date"]
  808. ):
  809. query = f"{col} {op} v"
  810. result = store.select("test", where=query)
  811. if op == "==":
  812. expected = df.loc[[0], :]
  813. elif op == ">":
  814. expected = df.loc[[1], :]
  815. else:
  816. expected = df.loc[[], :]
  817. tm.assert_frame_equal(expected, result)
  818. @pytest.mark.parametrize("where", ["", (), (None,), [], [None]])
  819. def test_select_empty_where(tmp_path, where):
  820. # GH26610
  821. df = DataFrame([1, 2, 3])
  822. path = tmp_path / "empty_where.h5"
  823. with HDFStore(path) as store:
  824. store.put("df", df, "t")
  825. result = read_hdf(store, "df", where=where)
  826. tm.assert_frame_equal(result, df)
  827. def test_select_large_integer(tmp_path):
  828. path = tmp_path / "large_int.h5"
  829. df = DataFrame(
  830. zip(
  831. ["a", "b", "c", "d"],
  832. [-9223372036854775801, -9223372036854775802, -9223372036854775803, 123],
  833. ),
  834. columns=["x", "y"],
  835. )
  836. with HDFStore(path) as s:
  837. s.append("data", df, data_columns=True, index=False)
  838. result = s.select("data", where="y==-9223372036854775801").get("y").get(0)
  839. expected = df["y"][0]
  840. assert expected == result