| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046 |
- import numpy as np
- import pytest
- from pandas._libs.tslibs import Timestamp
- import pandas as pd
- from pandas import (
- DataFrame,
- HDFStore,
- Index,
- MultiIndex,
- Series,
- _testing as tm,
- bdate_range,
- concat,
- date_range,
- isna,
- read_hdf,
- )
- from pandas.tests.io.pytables.common import (
- _maybe_remove,
- ensure_clean_store,
- )
- from pandas.io.pytables import Term
- pytestmark = [pytest.mark.single_cpu]
- def test_select_columns_in_where(setup_path):
- # GH 6169
- # recreate multi-indexes when columns is passed
- # in the `where` argument
- index = MultiIndex(
- levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
- codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
- names=["foo_name", "bar_name"],
- )
- # With a DataFrame
- df = DataFrame(
- np.random.default_rng(2).standard_normal((10, 3)),
- index=index,
- columns=["A", "B", "C"],
- )
- with ensure_clean_store(setup_path) as store:
- store.put("df", df, format="table")
- expected = df[["A"]]
- tm.assert_frame_equal(store.select("df", columns=["A"]), expected)
- tm.assert_frame_equal(store.select("df", where="columns=['A']"), expected)
- # With a Series
- s = Series(np.random.default_rng(2).standard_normal(10), index=index, name="A")
- with ensure_clean_store(setup_path) as store:
- store.put("s", s, format="table")
- tm.assert_series_equal(store.select("s", where="columns=['A']"), s)
- def test_select_with_dups(setup_path):
- # single dtypes
- df = DataFrame(
- np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "A", "B", "B"]
- )
- df.index = date_range("20130101 9:30", periods=10, freq="min")
- with ensure_clean_store(setup_path) as store:
- store.append("df", df)
- result = store.select("df")
- expected = df
- tm.assert_frame_equal(result, expected, by_blocks=True)
- result = store.select("df", columns=df.columns)
- expected = df
- tm.assert_frame_equal(result, expected, by_blocks=True)
- result = store.select("df", columns=["A"])
- expected = df.loc[:, ["A"]]
- tm.assert_frame_equal(result, expected)
- # dups across dtypes
- df = concat(
- [
- DataFrame(
- np.random.default_rng(2).standard_normal((10, 4)),
- columns=["A", "A", "B", "B"],
- ),
- DataFrame(
- np.random.default_rng(2).integers(0, 10, size=20).reshape(10, 2),
- columns=["A", "C"],
- ),
- ],
- axis=1,
- )
- df.index = date_range("20130101 9:30", periods=10, freq="min")
- with ensure_clean_store(setup_path) as store:
- store.append("df", df)
- result = store.select("df")
- expected = df
- tm.assert_frame_equal(result, expected, by_blocks=True)
- result = store.select("df", columns=df.columns)
- expected = df
- tm.assert_frame_equal(result, expected, by_blocks=True)
- expected = df.loc[:, ["A"]]
- result = store.select("df", columns=["A"])
- tm.assert_frame_equal(result, expected, by_blocks=True)
- expected = df.loc[:, ["B", "A"]]
- result = store.select("df", columns=["B", "A"])
- tm.assert_frame_equal(result, expected, by_blocks=True)
- # duplicates on both index and columns
- with ensure_clean_store(setup_path) as store:
- store.append("df", df)
- store.append("df", df)
- expected = df.loc[:, ["B", "A"]]
- expected = concat([expected, expected])
- result = store.select("df", columns=["B", "A"])
- tm.assert_frame_equal(result, expected, by_blocks=True)
- def test_select(setup_path):
- with ensure_clean_store(setup_path) as store:
- # select with columns=
- df = DataFrame(
- np.random.default_rng(2).standard_normal((10, 4)),
- columns=Index(list("ABCD")),
- index=date_range("2000-01-01", periods=10, freq="B"),
- )
- _maybe_remove(store, "df")
- store.append("df", df)
- result = store.select("df", columns=["A", "B"])
- expected = df.reindex(columns=["A", "B"])
- tm.assert_frame_equal(expected, result)
- # equivalently
- result = store.select("df", [("columns=['A', 'B']")])
- expected = df.reindex(columns=["A", "B"])
- tm.assert_frame_equal(expected, result)
- # with a data column
- _maybe_remove(store, "df")
- store.append("df", df, data_columns=["A"])
- result = store.select("df", ["A > 0"], columns=["A", "B"])
- expected = df[df.A > 0].reindex(columns=["A", "B"])
- tm.assert_frame_equal(expected, result)
- # all a data columns
- _maybe_remove(store, "df")
- store.append("df", df, data_columns=True)
- result = store.select("df", ["A > 0"], columns=["A", "B"])
- expected = df[df.A > 0].reindex(columns=["A", "B"])
- tm.assert_frame_equal(expected, result)
- # with a data column, but different columns
- _maybe_remove(store, "df")
- store.append("df", df, data_columns=["A"])
- result = store.select("df", ["A > 0"], columns=["C", "D"])
- expected = df[df.A > 0].reindex(columns=["C", "D"])
- tm.assert_frame_equal(expected, result)
- def test_select_dtypes(setup_path):
- with ensure_clean_store(setup_path) as store:
- # with a Timestamp data column (GH #2637)
- df = DataFrame(
- {
- "ts": bdate_range("2012-01-01", periods=300),
- "A": np.random.default_rng(2).standard_normal(300),
- }
- )
- _maybe_remove(store, "df")
- store.append("df", df, data_columns=["ts", "A"])
- result = store.select("df", "ts>=Timestamp('2012-02-01')")
- expected = df[df.ts >= Timestamp("2012-02-01")]
- tm.assert_frame_equal(expected, result)
- # bool columns (GH #2849)
- df = DataFrame(
- np.random.default_rng(2).standard_normal((5, 2)), columns=["A", "B"]
- )
- df["object"] = "foo"
- df.loc[4:5, "object"] = "bar"
- df["boolv"] = df["A"] > 0
- _maybe_remove(store, "df")
- store.append("df", df, data_columns=True)
- expected = df[df.boolv == True].reindex(columns=["A", "boolv"]) # noqa: E712
- for v in [True, "true", 1]:
- result = store.select("df", f"boolv == {v}", columns=["A", "boolv"])
- tm.assert_frame_equal(expected, result)
- expected = df[df.boolv == False].reindex(columns=["A", "boolv"]) # noqa: E712
- for v in [False, "false", 0]:
- result = store.select("df", f"boolv == {v}", columns=["A", "boolv"])
- tm.assert_frame_equal(expected, result)
- # integer index
- df = DataFrame(
- {
- "A": np.random.default_rng(2).random(20),
- "B": np.random.default_rng(2).random(20),
- }
- )
- _maybe_remove(store, "df_int")
- store.append("df_int", df)
- result = store.select("df_int", "index<10 and columns=['A']")
- expected = df.reindex(index=list(df.index)[0:10], columns=["A"])
- tm.assert_frame_equal(expected, result)
- # float index
- df = DataFrame(
- {
- "A": np.random.default_rng(2).random(20),
- "B": np.random.default_rng(2).random(20),
- "index": np.arange(20, dtype="f8"),
- }
- )
- _maybe_remove(store, "df_float")
- store.append("df_float", df)
- result = store.select("df_float", "index<10.0 and columns=['A']")
- expected = df.reindex(index=list(df.index)[0:10], columns=["A"])
- tm.assert_frame_equal(expected, result)
- with ensure_clean_store(setup_path) as store:
- # floats w/o NaN
- df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64")
- df["cols"] = (df["cols"] + 10).apply(str)
- store.append("df1", df, data_columns=True)
- result = store.select("df1", where="values>2.0")
- expected = df[df["values"] > 2.0]
- tm.assert_frame_equal(expected, result)
- # floats with NaN
- df.iloc[0] = np.nan
- expected = df[df["values"] > 2.0]
- store.append("df2", df, data_columns=True, index=False)
- result = store.select("df2", where="values>2.0")
- tm.assert_frame_equal(expected, result)
- # https://github.com/PyTables/PyTables/issues/282
- # bug in selection when 0th row has a np.nan and an index
- # store.append('df3',df,data_columns=True)
- # result = store.select(
- # 'df3', where='values>2.0')
- # tm.assert_frame_equal(expected, result)
- # not in first position float with NaN ok too
- df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64")
- df["cols"] = (df["cols"] + 10).apply(str)
- df.iloc[1] = np.nan
- expected = df[df["values"] > 2.0]
- store.append("df4", df, data_columns=True)
- result = store.select("df4", where="values>2.0")
- tm.assert_frame_equal(expected, result)
- # test selection with comparison against numpy scalar
- # GH 11283
- with ensure_clean_store(setup_path) as store:
- df = DataFrame(
- 1.1 * np.arange(120).reshape((30, 4)),
- columns=Index(list("ABCD")),
- index=Index([f"i-{i}" for i in range(30)]),
- )
- expected = df[df["A"] > 0]
- store.append("df", df, data_columns=True)
- np_zero = np.float64(0) # noqa: F841
- result = store.select("df", where=["A>np_zero"])
- tm.assert_frame_equal(expected, result)
- def test_select_with_many_inputs(setup_path):
- with ensure_clean_store(setup_path) as store:
- df = DataFrame(
- {
- "ts": bdate_range("2012-01-01", periods=300),
- "A": np.random.default_rng(2).standard_normal(300),
- "B": range(300),
- "users": ["a"] * 50
- + ["b"] * 50
- + ["c"] * 100
- + [f"a{i:03d}" for i in range(100)],
- }
- )
- _maybe_remove(store, "df")
- store.append("df", df, data_columns=["ts", "A", "B", "users"])
- # regular select
- result = store.select("df", "ts>=Timestamp('2012-02-01')")
- expected = df[df.ts >= Timestamp("2012-02-01")]
- tm.assert_frame_equal(expected, result)
- # small selector
- result = store.select("df", "ts>=Timestamp('2012-02-01') & users=['a','b','c']")
- expected = df[
- (df.ts >= Timestamp("2012-02-01")) & df.users.isin(["a", "b", "c"])
- ]
- tm.assert_frame_equal(expected, result)
- # big selector along the columns
- selector = ["a", "b", "c"] + [f"a{i:03d}" for i in range(60)]
- result = store.select("df", "ts>=Timestamp('2012-02-01') and users=selector")
- expected = df[(df.ts >= Timestamp("2012-02-01")) & df.users.isin(selector)]
- tm.assert_frame_equal(expected, result)
- selector = range(100, 200)
- result = store.select("df", "B=selector")
- expected = df[df.B.isin(selector)]
- tm.assert_frame_equal(expected, result)
- assert len(result) == 100
- # big selector along the index
- selector = Index(df.ts[0:100].values)
- result = store.select("df", "ts=selector")
- expected = df[df.ts.isin(selector.values)]
- tm.assert_frame_equal(expected, result)
- assert len(result) == 100
- def test_select_iterator(tmp_path, setup_path):
- # single table
- with ensure_clean_store(setup_path) as store:
- df = DataFrame(
- np.random.default_rng(2).standard_normal((10, 4)),
- columns=Index(list("ABCD")),
- index=date_range("2000-01-01", periods=10, freq="B"),
- )
- _maybe_remove(store, "df")
- store.append("df", df)
- expected = store.select("df")
- results = list(store.select("df", iterator=True))
- result = concat(results)
- tm.assert_frame_equal(expected, result)
- results = list(store.select("df", chunksize=2))
- assert len(results) == 5
- result = concat(results)
- tm.assert_frame_equal(expected, result)
- results = list(store.select("df", chunksize=2))
- result = concat(results)
- tm.assert_frame_equal(result, expected)
- path = tmp_path / setup_path
- df = DataFrame(
- np.random.default_rng(2).standard_normal((10, 4)),
- columns=Index(list("ABCD")),
- index=date_range("2000-01-01", periods=10, freq="B"),
- )
- df.to_hdf(path, key="df_non_table")
- msg = "can only use an iterator or chunksize on a table"
- with pytest.raises(TypeError, match=msg):
- read_hdf(path, "df_non_table", chunksize=2)
- with pytest.raises(TypeError, match=msg):
- read_hdf(path, "df_non_table", iterator=True)
- path = tmp_path / setup_path
- df = DataFrame(
- np.random.default_rng(2).standard_normal((10, 4)),
- columns=Index(list("ABCD")),
- index=date_range("2000-01-01", periods=10, freq="B"),
- )
- df.to_hdf(path, key="df", format="table")
- results = list(read_hdf(path, "df", chunksize=2))
- result = concat(results)
- assert len(results) == 5
- tm.assert_frame_equal(result, df)
- tm.assert_frame_equal(result, read_hdf(path, "df"))
- # multiple
- with ensure_clean_store(setup_path) as store:
- df1 = DataFrame(
- np.random.default_rng(2).standard_normal((10, 4)),
- columns=Index(list("ABCD")),
- index=date_range("2000-01-01", periods=10, freq="B"),
- )
- store.append("df1", df1, data_columns=True)
- df2 = df1.copy().rename(columns="{}_2".format)
- df2["foo"] = "bar"
- store.append("df2", df2)
- df = concat([df1, df2], axis=1)
- # full selection
- expected = store.select_as_multiple(["df1", "df2"], selector="df1")
- results = list(
- store.select_as_multiple(["df1", "df2"], selector="df1", chunksize=2)
- )
- result = concat(results)
- tm.assert_frame_equal(expected, result)
- def test_select_iterator_complete_8014(setup_path):
- # GH 8014
- # using iterator and where clause
- chunksize = 1e4
- # no iterator
- with ensure_clean_store(setup_path) as store:
- expected = DataFrame(
- np.random.default_rng(2).standard_normal((100064, 4)),
- columns=Index(list("ABCD")),
- index=date_range("2000-01-01", periods=100064, freq="s"),
- )
- _maybe_remove(store, "df")
- store.append("df", expected)
- beg_dt = expected.index[0]
- end_dt = expected.index[-1]
- # select w/o iteration and no where clause works
- result = store.select("df")
- tm.assert_frame_equal(expected, result)
- # select w/o iterator and where clause, single term, begin
- # of range, works
- where = f"index >= '{beg_dt}'"
- result = store.select("df", where=where)
- tm.assert_frame_equal(expected, result)
- # select w/o iterator and where clause, single term, end
- # of range, works
- where = f"index <= '{end_dt}'"
- result = store.select("df", where=where)
- tm.assert_frame_equal(expected, result)
- # select w/o iterator and where clause, inclusive range,
- # works
- where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
- result = store.select("df", where=where)
- tm.assert_frame_equal(expected, result)
- # with iterator, full range
- with ensure_clean_store(setup_path) as store:
- expected = DataFrame(
- np.random.default_rng(2).standard_normal((100064, 4)),
- columns=Index(list("ABCD")),
- index=date_range("2000-01-01", periods=100064, freq="s"),
- )
- _maybe_remove(store, "df")
- store.append("df", expected)
- beg_dt = expected.index[0]
- end_dt = expected.index[-1]
- # select w/iterator and no where clause works
- results = list(store.select("df", chunksize=chunksize))
- result = concat(results)
- tm.assert_frame_equal(expected, result)
- # select w/iterator and where clause, single term, begin of range
- where = f"index >= '{beg_dt}'"
- results = list(store.select("df", where=where, chunksize=chunksize))
- result = concat(results)
- tm.assert_frame_equal(expected, result)
- # select w/iterator and where clause, single term, end of range
- where = f"index <= '{end_dt}'"
- results = list(store.select("df", where=where, chunksize=chunksize))
- result = concat(results)
- tm.assert_frame_equal(expected, result)
- # select w/iterator and where clause, inclusive range
- where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
- results = list(store.select("df", where=where, chunksize=chunksize))
- result = concat(results)
- tm.assert_frame_equal(expected, result)
- def test_select_iterator_non_complete_8014(setup_path):
- # GH 8014
- # using iterator and where clause
- chunksize = 1e4
- # with iterator, non complete range
- with ensure_clean_store(setup_path) as store:
- expected = DataFrame(
- np.random.default_rng(2).standard_normal((100064, 4)),
- columns=Index(list("ABCD")),
- index=date_range("2000-01-01", periods=100064, freq="s"),
- )
- _maybe_remove(store, "df")
- store.append("df", expected)
- beg_dt = expected.index[1]
- end_dt = expected.index[-2]
- # select w/iterator and where clause, single term, begin of range
- where = f"index >= '{beg_dt}'"
- results = list(store.select("df", where=where, chunksize=chunksize))
- result = concat(results)
- rexpected = expected[expected.index >= beg_dt]
- tm.assert_frame_equal(rexpected, result)
- # select w/iterator and where clause, single term, end of range
- where = f"index <= '{end_dt}'"
- results = list(store.select("df", where=where, chunksize=chunksize))
- result = concat(results)
- rexpected = expected[expected.index <= end_dt]
- tm.assert_frame_equal(rexpected, result)
- # select w/iterator and where clause, inclusive range
- where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
- results = list(store.select("df", where=where, chunksize=chunksize))
- result = concat(results)
- rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)]
- tm.assert_frame_equal(rexpected, result)
- # with iterator, empty where
- with ensure_clean_store(setup_path) as store:
- expected = DataFrame(
- np.random.default_rng(2).standard_normal((100064, 4)),
- columns=Index(list("ABCD")),
- index=date_range("2000-01-01", periods=100064, freq="s"),
- )
- _maybe_remove(store, "df")
- store.append("df", expected)
- end_dt = expected.index[-1]
- # select w/iterator and where clause, single term, begin of range
- where = f"index > '{end_dt}'"
- results = list(store.select("df", where=where, chunksize=chunksize))
- assert 0 == len(results)
- def test_select_iterator_many_empty_frames(setup_path):
- # GH 8014
- # using iterator and where clause can return many empty
- # frames.
- chunksize = 10_000
- # with iterator, range limited to the first chunk
- with ensure_clean_store(setup_path) as store:
- expected = DataFrame(
- np.random.default_rng(2).standard_normal((100064, 4)),
- columns=Index(list("ABCD")),
- index=date_range("2000-01-01", periods=100064, freq="s"),
- )
- _maybe_remove(store, "df")
- store.append("df", expected)
- beg_dt = expected.index[0]
- end_dt = expected.index[chunksize - 1]
- # select w/iterator and where clause, single term, begin of range
- where = f"index >= '{beg_dt}'"
- results = list(store.select("df", where=where, chunksize=chunksize))
- result = concat(results)
- rexpected = expected[expected.index >= beg_dt]
- tm.assert_frame_equal(rexpected, result)
- # select w/iterator and where clause, single term, end of range
- where = f"index <= '{end_dt}'"
- results = list(store.select("df", where=where, chunksize=chunksize))
- assert len(results) == 1
- result = concat(results)
- rexpected = expected[expected.index <= end_dt]
- tm.assert_frame_equal(rexpected, result)
- # select w/iterator and where clause, inclusive range
- where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
- results = list(store.select("df", where=where, chunksize=chunksize))
- # should be 1, is 10
- assert len(results) == 1
- result = concat(results)
- rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)]
- tm.assert_frame_equal(rexpected, result)
- # select w/iterator and where clause which selects
- # *nothing*.
- #
- # To be consistent with Python idiom I suggest this should
- # return [] e.g. `for e in []: print True` never prints
- # True.
- where = f"index <= '{beg_dt}' & index >= '{end_dt}'"
- results = list(store.select("df", where=where, chunksize=chunksize))
- # should be []
- assert len(results) == 0
- def test_frame_select(setup_path):
- df = DataFrame(
- np.random.default_rng(2).standard_normal((10, 4)),
- columns=Index(list("ABCD")),
- index=date_range("2000-01-01", periods=10, freq="B"),
- )
- with ensure_clean_store(setup_path) as store:
- store.put("frame", df, format="table")
- date = df.index[len(df) // 2]
- crit1 = Term("index>=date")
- assert crit1.env.scope["date"] == date
- crit2 = "columns=['A', 'D']"
- crit3 = "columns=A"
- result = store.select("frame", [crit1, crit2])
- expected = df.loc[date:, ["A", "D"]]
- tm.assert_frame_equal(result, expected)
- result = store.select("frame", [crit3])
- expected = df.loc[:, ["A"]]
- tm.assert_frame_equal(result, expected)
- # invalid terms
- df = DataFrame(
- np.random.default_rng(2).standard_normal((10, 4)),
- columns=Index(list("ABCD")),
- index=date_range("2000-01-01", periods=10, freq="B"),
- )
- store.append("df_time", df)
- msg = "day is out of range for month: 0"
- with pytest.raises(ValueError, match=msg):
- store.select("df_time", "index>0")
- # can't select if not written as table
- # store['frame'] = df
- # with pytest.raises(ValueError):
- # store.select('frame', [crit1, crit2])
- def test_frame_select_complex(setup_path):
- # select via complex criteria
- df = DataFrame(
- np.random.default_rng(2).standard_normal((10, 4)),
- columns=Index(list("ABCD")),
- index=date_range("2000-01-01", periods=10, freq="B"),
- )
- df["string"] = "foo"
- df.loc[df.index[0:4], "string"] = "bar"
- with ensure_clean_store(setup_path) as store:
- store.put("df", df, format="table", data_columns=["string"])
- # empty
- result = store.select("df", 'index>df.index[3] & string="bar"')
- expected = df.loc[(df.index > df.index[3]) & (df.string == "bar")]
- tm.assert_frame_equal(result, expected)
- result = store.select("df", 'index>df.index[3] & string="foo"')
- expected = df.loc[(df.index > df.index[3]) & (df.string == "foo")]
- tm.assert_frame_equal(result, expected)
- # or
- result = store.select("df", 'index>df.index[3] | string="bar"')
- expected = df.loc[(df.index > df.index[3]) | (df.string == "bar")]
- tm.assert_frame_equal(result, expected)
- result = store.select(
- "df", '(index>df.index[3] & index<=df.index[6]) | string="bar"'
- )
- expected = df.loc[
- ((df.index > df.index[3]) & (df.index <= df.index[6]))
- | (df.string == "bar")
- ]
- tm.assert_frame_equal(result, expected)
- # invert
- result = store.select("df", 'string!="bar"')
- expected = df.loc[df.string != "bar"]
- tm.assert_frame_equal(result, expected)
- # invert not implemented in numexpr :(
- msg = "cannot use an invert condition when passing to numexpr"
- with pytest.raises(NotImplementedError, match=msg):
- store.select("df", '~(string="bar")')
- # invert ok for filters
- result = store.select("df", "~(columns=['A','B'])")
- expected = df.loc[:, df.columns.difference(["A", "B"])]
- tm.assert_frame_equal(result, expected)
- # in
- result = store.select("df", "index>df.index[3] & columns in ['A','B']")
- expected = df.loc[df.index > df.index[3]].reindex(columns=["A", "B"])
- tm.assert_frame_equal(result, expected)
- def test_frame_select_complex2(tmp_path):
- pp = tmp_path / "params.hdf"
- hh = tmp_path / "hist.hdf"
- # use non-trivial selection criteria
- params = DataFrame({"A": [1, 1, 2, 2, 3]})
- params.to_hdf(pp, key="df", mode="w", format="table", data_columns=["A"])
- selection = read_hdf(pp, "df", where="A=[2,3]")
- hist = DataFrame(
- np.random.default_rng(2).standard_normal((25, 1)),
- columns=["data"],
- index=MultiIndex.from_tuples(
- [(i, j) for i in range(5) for j in range(5)], names=["l1", "l2"]
- ),
- )
- hist.to_hdf(hh, key="df", mode="w", format="table")
- expected = read_hdf(hh, "df", where="l1=[2, 3, 4]")
- # scope with list like
- l0 = selection.index.tolist() # noqa: F841
- with HDFStore(hh) as store:
- result = store.select("df", where="l1=l0")
- tm.assert_frame_equal(result, expected)
- result = read_hdf(hh, "df", where="l1=l0")
- tm.assert_frame_equal(result, expected)
- # index
- index = selection.index # noqa: F841
- result = read_hdf(hh, "df", where="l1=index")
- tm.assert_frame_equal(result, expected)
- result = read_hdf(hh, "df", where="l1=selection.index")
- tm.assert_frame_equal(result, expected)
- result = read_hdf(hh, "df", where="l1=selection.index.tolist()")
- tm.assert_frame_equal(result, expected)
- result = read_hdf(hh, "df", where="l1=list(selection.index)")
- tm.assert_frame_equal(result, expected)
- # scope with index
- with HDFStore(hh) as store:
- result = store.select("df", where="l1=index")
- tm.assert_frame_equal(result, expected)
- result = store.select("df", where="l1=selection.index")
- tm.assert_frame_equal(result, expected)
- result = store.select("df", where="l1=selection.index.tolist()")
- tm.assert_frame_equal(result, expected)
- result = store.select("df", where="l1=list(selection.index)")
- tm.assert_frame_equal(result, expected)
- def test_invalid_filtering(setup_path):
- # can't use more than one filter (atm)
- df = DataFrame(
- np.random.default_rng(2).standard_normal((10, 4)),
- columns=Index(list("ABCD")),
- index=date_range("2000-01-01", periods=10, freq="B"),
- )
- with ensure_clean_store(setup_path) as store:
- store.put("df", df, format="table")
- msg = "unable to collapse Joint Filters"
- # not implemented
- with pytest.raises(NotImplementedError, match=msg):
- store.select("df", "columns=['A'] | columns=['B']")
- # in theory we could deal with this
- with pytest.raises(NotImplementedError, match=msg):
- store.select("df", "columns=['A','B'] & columns=['C']")
- def test_string_select(setup_path):
- # GH 2973
- with ensure_clean_store(setup_path) as store:
- df = DataFrame(
- np.random.default_rng(2).standard_normal((10, 4)),
- columns=Index(list("ABCD")),
- index=date_range("2000-01-01", periods=10, freq="B"),
- )
- # test string ==/!=
- df["x"] = "none"
- df.loc[df.index[2:7], "x"] = ""
- store.append("df", df, data_columns=["x"])
- result = store.select("df", "x=none")
- expected = df[df.x == "none"]
- tm.assert_frame_equal(result, expected)
- result = store.select("df", "x!=none")
- expected = df[df.x != "none"]
- tm.assert_frame_equal(result, expected)
- df2 = df.copy()
- df2.loc[df2.x == "", "x"] = np.nan
- store.append("df2", df2, data_columns=["x"])
- result = store.select("df2", "x!=none")
- expected = df2[isna(df2.x)]
- tm.assert_frame_equal(result, expected)
- # int ==/!=
- df["int"] = 1
- df.loc[df.index[2:7], "int"] = 2
- store.append("df3", df, data_columns=["int"])
- result = store.select("df3", "int=2")
- expected = df[df.int == 2]
- tm.assert_frame_equal(result, expected)
- result = store.select("df3", "int!=2")
- expected = df[df.int != 2]
- tm.assert_frame_equal(result, expected)
- def test_select_as_multiple(setup_path):
- df1 = DataFrame(
- np.random.default_rng(2).standard_normal((10, 4)),
- columns=Index(list("ABCD")),
- index=date_range("2000-01-01", periods=10, freq="B"),
- )
- df2 = df1.copy().rename(columns="{}_2".format)
- df2["foo"] = "bar"
- with ensure_clean_store(setup_path) as store:
- msg = "keys must be a list/tuple"
- # no tables stored
- with pytest.raises(TypeError, match=msg):
- store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1")
- store.append("df1", df1, data_columns=["A", "B"])
- store.append("df2", df2)
- # exceptions
- with pytest.raises(TypeError, match=msg):
- store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1")
- with pytest.raises(TypeError, match=msg):
- store.select_as_multiple([None], where=["A>0", "B>0"], selector="df1")
- msg = "'No object named df3 in the file'"
- with pytest.raises(KeyError, match=msg):
- store.select_as_multiple(
- ["df1", "df3"], where=["A>0", "B>0"], selector="df1"
- )
- with pytest.raises(KeyError, match=msg):
- store.select_as_multiple(["df3"], where=["A>0", "B>0"], selector="df1")
- with pytest.raises(KeyError, match="'No object named df4 in the file'"):
- store.select_as_multiple(
- ["df1", "df2"], where=["A>0", "B>0"], selector="df4"
- )
- # default select
- result = store.select("df1", ["A>0", "B>0"])
- expected = store.select_as_multiple(
- ["df1"], where=["A>0", "B>0"], selector="df1"
- )
- tm.assert_frame_equal(result, expected)
- expected = store.select_as_multiple("df1", where=["A>0", "B>0"], selector="df1")
- tm.assert_frame_equal(result, expected)
- # multiple
- result = store.select_as_multiple(
- ["df1", "df2"], where=["A>0", "B>0"], selector="df1"
- )
- expected = concat([df1, df2], axis=1)
- expected = expected[(expected.A > 0) & (expected.B > 0)]
- tm.assert_frame_equal(result, expected, check_freq=False)
- # FIXME: 2021-01-20 this is failing with freq None vs 4B on some builds
- # multiple (diff selector)
- result = store.select_as_multiple(
- ["df1", "df2"], where="index>df2.index[4]", selector="df2"
- )
- expected = concat([df1, df2], axis=1)
- expected = expected[5:]
- tm.assert_frame_equal(result, expected)
- # test exception for diff rows
- df3 = df1.copy().head(2)
- store.append("df3", df3)
- msg = "all tables must have exactly the same nrows!"
- with pytest.raises(ValueError, match=msg):
- store.select_as_multiple(
- ["df1", "df3"], where=["A>0", "B>0"], selector="df1"
- )
- def test_nan_selection_bug_4858(setup_path):
- with ensure_clean_store(setup_path) as store:
- df = DataFrame({"cols": range(6), "values": range(6)}, dtype="float64")
- df["cols"] = (df["cols"] + 10).apply(str)
- df.iloc[0] = np.nan
- expected = DataFrame(
- {"cols": ["13.0", "14.0", "15.0"], "values": [3.0, 4.0, 5.0]},
- index=[3, 4, 5],
- )
- # write w/o the index on that particular column
- store.append("df", df, data_columns=True, index=["cols"])
- result = store.select("df", where="values>2.0")
- tm.assert_frame_equal(result, expected)
- def test_query_with_nested_special_character(setup_path):
- df = DataFrame(
- {
- "a": ["a", "a", "c", "b", "test & test", "c", "b", "e"],
- "b": [1, 2, 3, 4, 5, 6, 7, 8],
- }
- )
- expected = df[df.a == "test & test"]
- with ensure_clean_store(setup_path) as store:
- store.append("test", df, format="table", data_columns=True)
- result = store.select("test", 'a = "test & test"')
- tm.assert_frame_equal(expected, result)
- def test_query_long_float_literal(setup_path):
- # GH 14241
- df = DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]})
- with ensure_clean_store(setup_path) as store:
- store.append("test", df, format="table", data_columns=True)
- cutoff = 1000000000.0006
- result = store.select("test", f"A < {cutoff:.4f}")
- assert result.empty
- cutoff = 1000000000.0010
- result = store.select("test", f"A > {cutoff:.4f}")
- expected = df.loc[[1, 2], :]
- tm.assert_frame_equal(expected, result)
- exact = 1000000000.0011
- result = store.select("test", f"A == {exact:.4f}")
- expected = df.loc[[1], :]
- tm.assert_frame_equal(expected, result)
- def test_query_compare_column_type(setup_path):
- # GH 15492
- df = DataFrame(
- {
- "date": ["2014-01-01", "2014-01-02"],
- "real_date": date_range("2014-01-01", periods=2),
- "float": [1.1, 1.2],
- "int": [1, 2],
- },
- columns=["date", "real_date", "float", "int"],
- )
- with ensure_clean_store(setup_path) as store:
- store.append("test", df, format="table", data_columns=True)
- ts = Timestamp("2014-01-01") # noqa: F841
- result = store.select("test", where="real_date > ts")
- expected = df.loc[[1], :]
- tm.assert_frame_equal(expected, result)
- for op in ["<", ">", "=="]:
- # non strings to string column always fail
- for v in [2.1, True, Timestamp("2014-01-01"), pd.Timedelta(1, "s")]:
- query = f"date {op} v"
- msg = f"Cannot compare {v} of type {type(v)} to string column"
- with pytest.raises(TypeError, match=msg):
- store.select("test", where=query)
- # strings to other columns must be convertible to type
- v = "a"
- for col in ["int", "float", "real_date"]:
- query = f"{col} {op} v"
- if col == "real_date":
- msg = 'Given date string "a" not likely a datetime'
- else:
- msg = "could not convert string to"
- with pytest.raises(ValueError, match=msg):
- store.select("test", where=query)
- for v, col in zip(
- ["1", "1.1", "2014-01-01"], ["int", "float", "real_date"]
- ):
- query = f"{col} {op} v"
- result = store.select("test", where=query)
- if op == "==":
- expected = df.loc[[0], :]
- elif op == ">":
- expected = df.loc[[1], :]
- else:
- expected = df.loc[[], :]
- tm.assert_frame_equal(expected, result)
- @pytest.mark.parametrize("where", ["", (), (None,), [], [None]])
- def test_select_empty_where(tmp_path, where):
- # GH26610
- df = DataFrame([1, 2, 3])
- path = tmp_path / "empty_where.h5"
- with HDFStore(path) as store:
- store.put("df", df, "t")
- result = read_hdf(store, "df", where=where)
- tm.assert_frame_equal(result, df)
- def test_select_large_integer(tmp_path):
- path = tmp_path / "large_int.h5"
- df = DataFrame(
- zip(
- ["a", "b", "c", "d"],
- [-9223372036854775801, -9223372036854775802, -9223372036854775803, 123],
- ),
- columns=["x", "y"],
- )
- with HDFStore(path) as s:
- s.append("data", df, data_columns=True, index=False)
- result = s.select("data", where="y==-9223372036854775801").get("y").get(0)
- expected = df["y"][0]
- assert expected == result
|