test_query_eval.py 54 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437
  1. import operator
  2. import numpy as np
  3. import pytest
  4. from pandas.errors import (
  5. NumExprClobberingError,
  6. UndefinedVariableError,
  7. )
  8. import pandas.util._test_decorators as td
  9. import pandas as pd
  10. from pandas import (
  11. DataFrame,
  12. Index,
  13. MultiIndex,
  14. Series,
  15. date_range,
  16. )
  17. import pandas._testing as tm
  18. from pandas.core.computation.check import NUMEXPR_INSTALLED
  19. @pytest.fixture(params=["python", "pandas"], ids=lambda x: x)
  20. def parser(request):
  21. return request.param
  22. @pytest.fixture(
  23. params=["python", pytest.param("numexpr", marks=td.skip_if_no("numexpr"))],
  24. ids=lambda x: x,
  25. )
  26. def engine(request):
  27. return request.param
  28. def skip_if_no_pandas_parser(parser):
  29. if parser != "pandas":
  30. pytest.skip(f"cannot evaluate with parser={parser}")
  31. class TestCompat:
  32. @pytest.fixture
  33. def df(self):
  34. return DataFrame({"A": [1, 2, 3]})
  35. @pytest.fixture
  36. def expected1(self, df):
  37. return df[df.A > 0]
  38. @pytest.fixture
  39. def expected2(self, df):
  40. return df.A + 1
  41. def test_query_default(self, df, expected1, expected2):
  42. # GH 12749
  43. # this should always work, whether NUMEXPR_INSTALLED or not
  44. result = df.query("A>0")
  45. tm.assert_frame_equal(result, expected1)
  46. result = df.eval("A+1")
  47. tm.assert_series_equal(result, expected2, check_names=False)
  48. def test_query_None(self, df, expected1, expected2):
  49. result = df.query("A>0", engine=None)
  50. tm.assert_frame_equal(result, expected1)
  51. result = df.eval("A+1", engine=None)
  52. tm.assert_series_equal(result, expected2, check_names=False)
  53. def test_query_python(self, df, expected1, expected2):
  54. result = df.query("A>0", engine="python")
  55. tm.assert_frame_equal(result, expected1)
  56. result = df.eval("A+1", engine="python")
  57. tm.assert_series_equal(result, expected2, check_names=False)
  58. def test_query_numexpr(self, df, expected1, expected2):
  59. if NUMEXPR_INSTALLED:
  60. result = df.query("A>0", engine="numexpr")
  61. tm.assert_frame_equal(result, expected1)
  62. result = df.eval("A+1", engine="numexpr")
  63. tm.assert_series_equal(result, expected2, check_names=False)
  64. else:
  65. msg = (
  66. r"'numexpr' is not installed or an unsupported version. "
  67. r"Cannot use engine='numexpr' for query/eval if 'numexpr' is "
  68. r"not installed"
  69. )
  70. with pytest.raises(ImportError, match=msg):
  71. df.query("A>0", engine="numexpr")
  72. with pytest.raises(ImportError, match=msg):
  73. df.eval("A+1", engine="numexpr")
  74. class TestDataFrameEval:
  75. # smaller hits python, larger hits numexpr
  76. @pytest.mark.parametrize("n", [4, 4000])
  77. @pytest.mark.parametrize(
  78. "op_str,op,rop",
  79. [
  80. ("+", "__add__", "__radd__"),
  81. ("-", "__sub__", "__rsub__"),
  82. ("*", "__mul__", "__rmul__"),
  83. ("/", "__truediv__", "__rtruediv__"),
  84. ],
  85. )
  86. def test_ops(self, op_str, op, rop, n):
  87. # tst ops and reversed ops in evaluation
  88. # GH7198
  89. df = DataFrame(1, index=range(n), columns=list("abcd"))
  90. df.iloc[0] = 2
  91. m = df.mean()
  92. base = DataFrame( # noqa: F841
  93. np.tile(m.values, n).reshape(n, -1), columns=list("abcd")
  94. )
  95. expected = eval(f"base {op_str} df")
  96. # ops as strings
  97. result = eval(f"m {op_str} df")
  98. tm.assert_frame_equal(result, expected)
  99. # these are commutative
  100. if op in ["+", "*"]:
  101. result = getattr(df, op)(m)
  102. tm.assert_frame_equal(result, expected)
  103. # these are not
  104. elif op in ["-", "/"]:
  105. result = getattr(df, rop)(m)
  106. tm.assert_frame_equal(result, expected)
  107. def test_dataframe_sub_numexpr_path(self):
  108. # GH7192: Note we need a large number of rows to ensure this
  109. # goes through the numexpr path
  110. df = DataFrame({"A": np.random.default_rng(2).standard_normal(25000)})
  111. df.iloc[0:5] = np.nan
  112. expected = 1 - np.isnan(df.iloc[0:25])
  113. result = (1 - np.isnan(df)).iloc[0:25]
  114. tm.assert_frame_equal(result, expected)
  115. def test_query_non_str(self):
  116. # GH 11485
  117. df = DataFrame({"A": [1, 2, 3], "B": ["a", "b", "b"]})
  118. msg = "expr must be a string to be evaluated"
  119. with pytest.raises(ValueError, match=msg):
  120. df.query(lambda x: x.B == "b")
  121. with pytest.raises(ValueError, match=msg):
  122. df.query(111)
  123. def test_query_empty_string(self):
  124. # GH 13139
  125. df = DataFrame({"A": [1, 2, 3]})
  126. msg = "expr cannot be an empty string"
  127. with pytest.raises(ValueError, match=msg):
  128. df.query("")
  129. def test_eval_resolvers_as_list(self):
  130. # GH 14095
  131. df = DataFrame(
  132. np.random.default_rng(2).standard_normal((10, 2)), columns=list("ab")
  133. )
  134. dict1 = {"a": 1}
  135. dict2 = {"b": 2}
  136. assert df.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"]
  137. assert pd.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"]
  138. def test_eval_resolvers_combined(self):
  139. # GH 34966
  140. df = DataFrame(
  141. np.random.default_rng(2).standard_normal((10, 2)), columns=list("ab")
  142. )
  143. dict1 = {"c": 2}
  144. # Both input and default index/column resolvers should be usable
  145. result = df.eval("a + b * c", resolvers=[dict1])
  146. expected = df["a"] + df["b"] * dict1["c"]
  147. tm.assert_series_equal(result, expected)
  148. def test_eval_object_dtype_binop(self):
  149. # GH#24883
  150. df = DataFrame({"a1": ["Y", "N"]})
  151. res = df.eval("c = ((a1 == 'Y') & True)")
  152. expected = DataFrame({"a1": ["Y", "N"], "c": [True, False]})
  153. tm.assert_frame_equal(res, expected)
  154. def test_extension_array_eval(self, engine, parser, request):
  155. # GH#58748
  156. if engine == "numexpr":
  157. mark = pytest.mark.xfail(
  158. reason="numexpr does not support extension array dtypes"
  159. )
  160. request.applymarker(mark)
  161. df = DataFrame({"a": pd.array([1, 2, 3]), "b": pd.array([4, 5, 6])})
  162. result = df.eval("a / b", engine=engine, parser=parser)
  163. expected = Series(pd.array([0.25, 0.40, 0.50]))
  164. tm.assert_series_equal(result, expected)
  165. def test_complex_eval(self, engine, parser):
  166. # GH#21374
  167. df = DataFrame({"a": [1 + 2j], "b": [1 + 1j]})
  168. result = df.eval("a/b", engine=engine, parser=parser)
  169. expected = Series([1.5 + 0.5j])
  170. tm.assert_series_equal(result, expected)
  171. class TestDataFrameQueryWithMultiIndex:
  172. def test_query_with_named_multiindex(self, parser, engine):
  173. skip_if_no_pandas_parser(parser)
  174. a = np.random.default_rng(2).choice(["red", "green"], size=10)
  175. b = np.random.default_rng(2).choice(["eggs", "ham"], size=10)
  176. index = MultiIndex.from_arrays([a, b], names=["color", "food"])
  177. df = DataFrame(np.random.default_rng(2).standard_normal((10, 2)), index=index)
  178. ind = Series(
  179. df.index.get_level_values("color").values, index=index, name="color"
  180. )
  181. # equality
  182. res1 = df.query('color == "red"', parser=parser, engine=engine)
  183. res2 = df.query('"red" == color', parser=parser, engine=engine)
  184. exp = df[ind == "red"]
  185. tm.assert_frame_equal(res1, exp)
  186. tm.assert_frame_equal(res2, exp)
  187. # inequality
  188. res1 = df.query('color != "red"', parser=parser, engine=engine)
  189. res2 = df.query('"red" != color', parser=parser, engine=engine)
  190. exp = df[ind != "red"]
  191. tm.assert_frame_equal(res1, exp)
  192. tm.assert_frame_equal(res2, exp)
  193. # list equality (really just set membership)
  194. res1 = df.query('color == ["red"]', parser=parser, engine=engine)
  195. res2 = df.query('["red"] == color', parser=parser, engine=engine)
  196. exp = df[ind.isin(["red"])]
  197. tm.assert_frame_equal(res1, exp)
  198. tm.assert_frame_equal(res2, exp)
  199. res1 = df.query('color != ["red"]', parser=parser, engine=engine)
  200. res2 = df.query('["red"] != color', parser=parser, engine=engine)
  201. exp = df[~ind.isin(["red"])]
  202. tm.assert_frame_equal(res1, exp)
  203. tm.assert_frame_equal(res2, exp)
  204. # in/not in ops
  205. res1 = df.query('["red"] in color', parser=parser, engine=engine)
  206. res2 = df.query('"red" in color', parser=parser, engine=engine)
  207. exp = df[ind.isin(["red"])]
  208. tm.assert_frame_equal(res1, exp)
  209. tm.assert_frame_equal(res2, exp)
  210. res1 = df.query('["red"] not in color', parser=parser, engine=engine)
  211. res2 = df.query('"red" not in color', parser=parser, engine=engine)
  212. exp = df[~ind.isin(["red"])]
  213. tm.assert_frame_equal(res1, exp)
  214. tm.assert_frame_equal(res2, exp)
  215. def test_query_with_unnamed_multiindex(self, parser, engine):
  216. skip_if_no_pandas_parser(parser)
  217. a = np.random.default_rng(2).choice(["red", "green"], size=10)
  218. b = np.random.default_rng(2).choice(["eggs", "ham"], size=10)
  219. index = MultiIndex.from_arrays([a, b])
  220. df = DataFrame(np.random.default_rng(2).standard_normal((10, 2)), index=index)
  221. ind = Series(df.index.get_level_values(0).values, index=index)
  222. res1 = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
  223. res2 = df.query('"red" == ilevel_0', parser=parser, engine=engine)
  224. exp = df[ind == "red"]
  225. tm.assert_frame_equal(res1, exp)
  226. tm.assert_frame_equal(res2, exp)
  227. # inequality
  228. res1 = df.query('ilevel_0 != "red"', parser=parser, engine=engine)
  229. res2 = df.query('"red" != ilevel_0', parser=parser, engine=engine)
  230. exp = df[ind != "red"]
  231. tm.assert_frame_equal(res1, exp)
  232. tm.assert_frame_equal(res2, exp)
  233. # list equality (really just set membership)
  234. res1 = df.query('ilevel_0 == ["red"]', parser=parser, engine=engine)
  235. res2 = df.query('["red"] == ilevel_0', parser=parser, engine=engine)
  236. exp = df[ind.isin(["red"])]
  237. tm.assert_frame_equal(res1, exp)
  238. tm.assert_frame_equal(res2, exp)
  239. res1 = df.query('ilevel_0 != ["red"]', parser=parser, engine=engine)
  240. res2 = df.query('["red"] != ilevel_0', parser=parser, engine=engine)
  241. exp = df[~ind.isin(["red"])]
  242. tm.assert_frame_equal(res1, exp)
  243. tm.assert_frame_equal(res2, exp)
  244. # in/not in ops
  245. res1 = df.query('["red"] in ilevel_0', parser=parser, engine=engine)
  246. res2 = df.query('"red" in ilevel_0', parser=parser, engine=engine)
  247. exp = df[ind.isin(["red"])]
  248. tm.assert_frame_equal(res1, exp)
  249. tm.assert_frame_equal(res2, exp)
  250. res1 = df.query('["red"] not in ilevel_0', parser=parser, engine=engine)
  251. res2 = df.query('"red" not in ilevel_0', parser=parser, engine=engine)
  252. exp = df[~ind.isin(["red"])]
  253. tm.assert_frame_equal(res1, exp)
  254. tm.assert_frame_equal(res2, exp)
  255. # ## LEVEL 1
  256. ind = Series(df.index.get_level_values(1).values, index=index)
  257. res1 = df.query('ilevel_1 == "eggs"', parser=parser, engine=engine)
  258. res2 = df.query('"eggs" == ilevel_1', parser=parser, engine=engine)
  259. exp = df[ind == "eggs"]
  260. tm.assert_frame_equal(res1, exp)
  261. tm.assert_frame_equal(res2, exp)
  262. # inequality
  263. res1 = df.query('ilevel_1 != "eggs"', parser=parser, engine=engine)
  264. res2 = df.query('"eggs" != ilevel_1', parser=parser, engine=engine)
  265. exp = df[ind != "eggs"]
  266. tm.assert_frame_equal(res1, exp)
  267. tm.assert_frame_equal(res2, exp)
  268. # list equality (really just set membership)
  269. res1 = df.query('ilevel_1 == ["eggs"]', parser=parser, engine=engine)
  270. res2 = df.query('["eggs"] == ilevel_1', parser=parser, engine=engine)
  271. exp = df[ind.isin(["eggs"])]
  272. tm.assert_frame_equal(res1, exp)
  273. tm.assert_frame_equal(res2, exp)
  274. res1 = df.query('ilevel_1 != ["eggs"]', parser=parser, engine=engine)
  275. res2 = df.query('["eggs"] != ilevel_1', parser=parser, engine=engine)
  276. exp = df[~ind.isin(["eggs"])]
  277. tm.assert_frame_equal(res1, exp)
  278. tm.assert_frame_equal(res2, exp)
  279. # in/not in ops
  280. res1 = df.query('["eggs"] in ilevel_1', parser=parser, engine=engine)
  281. res2 = df.query('"eggs" in ilevel_1', parser=parser, engine=engine)
  282. exp = df[ind.isin(["eggs"])]
  283. tm.assert_frame_equal(res1, exp)
  284. tm.assert_frame_equal(res2, exp)
  285. res1 = df.query('["eggs"] not in ilevel_1', parser=parser, engine=engine)
  286. res2 = df.query('"eggs" not in ilevel_1', parser=parser, engine=engine)
  287. exp = df[~ind.isin(["eggs"])]
  288. tm.assert_frame_equal(res1, exp)
  289. tm.assert_frame_equal(res2, exp)
  290. def test_query_with_partially_named_multiindex(self, parser, engine):
  291. skip_if_no_pandas_parser(parser)
  292. a = np.random.default_rng(2).choice(["red", "green"], size=10)
  293. b = np.arange(10)
  294. index = MultiIndex.from_arrays([a, b])
  295. index.names = [None, "rating"]
  296. df = DataFrame(np.random.default_rng(2).standard_normal((10, 2)), index=index)
  297. res = df.query("rating == 1", parser=parser, engine=engine)
  298. ind = Series(
  299. df.index.get_level_values("rating").values, index=index, name="rating"
  300. )
  301. exp = df[ind == 1]
  302. tm.assert_frame_equal(res, exp)
  303. res = df.query("rating != 1", parser=parser, engine=engine)
  304. ind = Series(
  305. df.index.get_level_values("rating").values, index=index, name="rating"
  306. )
  307. exp = df[ind != 1]
  308. tm.assert_frame_equal(res, exp)
  309. res = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
  310. ind = Series(df.index.get_level_values(0).values, index=index)
  311. exp = df[ind == "red"]
  312. tm.assert_frame_equal(res, exp)
  313. res = df.query('ilevel_0 != "red"', parser=parser, engine=engine)
  314. ind = Series(df.index.get_level_values(0).values, index=index)
  315. exp = df[ind != "red"]
  316. tm.assert_frame_equal(res, exp)
  317. def test_query_multiindex_get_index_resolvers(self):
  318. df = DataFrame(
  319. np.ones((10, 3)),
  320. index=MultiIndex.from_arrays(
  321. [range(10) for _ in range(2)], names=["spam", "eggs"]
  322. ),
  323. )
  324. resolvers = df._get_index_resolvers()
  325. def to_series(mi, level):
  326. level_values = mi.get_level_values(level)
  327. s = level_values.to_series()
  328. s.index = mi
  329. return s
  330. col_series = df.columns.to_series()
  331. expected = {
  332. "index": df.index,
  333. "columns": col_series,
  334. "spam": to_series(df.index, "spam"),
  335. "eggs": to_series(df.index, "eggs"),
  336. "clevel_0": col_series,
  337. }
  338. for k, v in resolvers.items():
  339. if isinstance(v, Index):
  340. assert v.is_(expected[k])
  341. elif isinstance(v, Series):
  342. tm.assert_series_equal(v, expected[k])
  343. else:
  344. raise AssertionError("object must be a Series or Index")
  345. @td.skip_if_no("numexpr")
  346. class TestDataFrameQueryNumExprPandas:
  347. @pytest.fixture
  348. def engine(self):
  349. return "numexpr"
  350. @pytest.fixture
  351. def parser(self):
  352. return "pandas"
  353. def test_date_query_with_attribute_access(self, engine, parser):
  354. skip_if_no_pandas_parser(parser)
  355. df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
  356. df["dates1"] = date_range("1/1/2012", periods=5)
  357. df["dates2"] = date_range("1/1/2013", periods=5)
  358. df["dates3"] = date_range("1/1/2014", periods=5)
  359. res = df.query(
  360. "@df.dates1 < 20130101 < @df.dates3", engine=engine, parser=parser
  361. )
  362. expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)]
  363. tm.assert_frame_equal(res, expec)
  364. def test_date_query_no_attribute_access(self, engine, parser):
  365. df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
  366. df["dates1"] = date_range("1/1/2012", periods=5)
  367. df["dates2"] = date_range("1/1/2013", periods=5)
  368. df["dates3"] = date_range("1/1/2014", periods=5)
  369. res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser)
  370. expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)]
  371. tm.assert_frame_equal(res, expec)
  372. def test_date_query_with_NaT(self, engine, parser):
  373. n = 10
  374. df = DataFrame(np.random.default_rng(2).standard_normal((n, 3)))
  375. df["dates1"] = date_range("1/1/2012", periods=n)
  376. df["dates2"] = date_range("1/1/2013", periods=n)
  377. df["dates3"] = date_range("1/1/2014", periods=n)
  378. df.loc[np.random.default_rng(2).random(n) > 0.5, "dates1"] = pd.NaT
  379. df.loc[np.random.default_rng(2).random(n) > 0.5, "dates3"] = pd.NaT
  380. res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser)
  381. expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)]
  382. tm.assert_frame_equal(res, expec)
  383. def test_date_index_query(self, engine, parser):
  384. n = 10
  385. df = DataFrame(np.random.default_rng(2).standard_normal((n, 3)))
  386. df["dates1"] = date_range("1/1/2012", periods=n)
  387. df["dates3"] = date_range("1/1/2014", periods=n)
  388. return_value = df.set_index("dates1", inplace=True, drop=True)
  389. assert return_value is None
  390. res = df.query("index < 20130101 < dates3", engine=engine, parser=parser)
  391. expec = df[(df.index < "20130101") & ("20130101" < df.dates3)]
  392. tm.assert_frame_equal(res, expec)
  393. def test_date_index_query_with_NaT(self, engine, parser):
  394. n = 10
  395. # Cast to object to avoid implicit cast when setting entry to pd.NaT below
  396. df = DataFrame(np.random.default_rng(2).standard_normal((n, 3))).astype(
  397. {0: object}
  398. )
  399. df["dates1"] = date_range("1/1/2012", periods=n)
  400. df["dates3"] = date_range("1/1/2014", periods=n)
  401. df.iloc[0, 0] = pd.NaT
  402. return_value = df.set_index("dates1", inplace=True, drop=True)
  403. assert return_value is None
  404. res = df.query("index < 20130101 < dates3", engine=engine, parser=parser)
  405. expec = df[(df.index < "20130101") & ("20130101" < df.dates3)]
  406. tm.assert_frame_equal(res, expec)
  407. def test_date_index_query_with_NaT_duplicates(self, engine, parser):
  408. n = 10
  409. d = {}
  410. d["dates1"] = date_range("1/1/2012", periods=n)
  411. d["dates3"] = date_range("1/1/2014", periods=n)
  412. df = DataFrame(d)
  413. df.loc[np.random.default_rng(2).random(n) > 0.5, "dates1"] = pd.NaT
  414. return_value = df.set_index("dates1", inplace=True, drop=True)
  415. assert return_value is None
  416. res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser)
  417. expec = df[(df.index.to_series() < "20130101") & ("20130101" < df.dates3)]
  418. tm.assert_frame_equal(res, expec)
  419. def test_date_query_with_non_date(self, engine, parser):
  420. n = 10
  421. df = DataFrame(
  422. {"dates": date_range("1/1/2012", periods=n), "nondate": np.arange(n)}
  423. )
  424. result = df.query("dates == nondate", parser=parser, engine=engine)
  425. assert len(result) == 0
  426. result = df.query("dates != nondate", parser=parser, engine=engine)
  427. tm.assert_frame_equal(result, df)
  428. msg = r"Invalid comparison between dtype=datetime64\[ns\] and ndarray"
  429. for op in ["<", ">", "<=", ">="]:
  430. with pytest.raises(TypeError, match=msg):
  431. df.query(f"dates {op} nondate", parser=parser, engine=engine)
  432. def test_query_syntax_error(self, engine, parser):
  433. df = DataFrame({"i": range(10), "+": range(3, 13), "r": range(4, 14)})
  434. msg = "invalid syntax"
  435. with pytest.raises(SyntaxError, match=msg):
  436. df.query("i - +", engine=engine, parser=parser)
  437. def test_query_scope(self, engine, parser):
  438. skip_if_no_pandas_parser(parser)
  439. df = DataFrame(
  440. np.random.default_rng(2).standard_normal((20, 2)), columns=list("ab")
  441. )
  442. a, b = 1, 2 # noqa: F841
  443. res = df.query("a > b", engine=engine, parser=parser)
  444. expected = df[df.a > df.b]
  445. tm.assert_frame_equal(res, expected)
  446. res = df.query("@a > b", engine=engine, parser=parser)
  447. expected = df[a > df.b]
  448. tm.assert_frame_equal(res, expected)
  449. # no local variable c
  450. with pytest.raises(
  451. UndefinedVariableError, match="local variable 'c' is not defined"
  452. ):
  453. df.query("@a > b > @c", engine=engine, parser=parser)
  454. # no column named 'c'
  455. with pytest.raises(UndefinedVariableError, match="name 'c' is not defined"):
  456. df.query("@a > b > c", engine=engine, parser=parser)
  457. def test_query_doesnt_pickup_local(self, engine, parser):
  458. n = m = 10
  459. df = DataFrame(
  460. np.random.default_rng(2).integers(m, size=(n, 3)), columns=list("abc")
  461. )
  462. # we don't pick up the local 'sin'
  463. with pytest.raises(UndefinedVariableError, match="name 'sin' is not defined"):
  464. df.query("sin > 5", engine=engine, parser=parser)
  465. def test_query_builtin(self, engine, parser):
  466. n = m = 10
  467. df = DataFrame(
  468. np.random.default_rng(2).integers(m, size=(n, 3)), columns=list("abc")
  469. )
  470. df.index.name = "sin"
  471. msg = "Variables in expression.+"
  472. with pytest.raises(NumExprClobberingError, match=msg):
  473. df.query("sin > 5", engine=engine, parser=parser)
  474. def test_query(self, engine, parser):
  475. df = DataFrame(
  476. np.random.default_rng(2).standard_normal((10, 3)), columns=["a", "b", "c"]
  477. )
  478. tm.assert_frame_equal(
  479. df.query("a < b", engine=engine, parser=parser), df[df.a < df.b]
  480. )
  481. tm.assert_frame_equal(
  482. df.query("a + b > b * c", engine=engine, parser=parser),
  483. df[df.a + df.b > df.b * df.c],
  484. )
  485. def test_query_index_with_name(self, engine, parser):
  486. df = DataFrame(
  487. np.random.default_rng(2).integers(10, size=(10, 3)),
  488. index=Index(range(10), name="blob"),
  489. columns=["a", "b", "c"],
  490. )
  491. res = df.query("(blob < 5) & (a < b)", engine=engine, parser=parser)
  492. expec = df[(df.index < 5) & (df.a < df.b)]
  493. tm.assert_frame_equal(res, expec)
  494. res = df.query("blob < b", engine=engine, parser=parser)
  495. expec = df[df.index < df.b]
  496. tm.assert_frame_equal(res, expec)
  497. def test_query_index_without_name(self, engine, parser):
  498. df = DataFrame(
  499. np.random.default_rng(2).integers(10, size=(10, 3)),
  500. index=range(10),
  501. columns=["a", "b", "c"],
  502. )
  503. # "index" should refer to the index
  504. res = df.query("index < b", engine=engine, parser=parser)
  505. expec = df[df.index < df.b]
  506. tm.assert_frame_equal(res, expec)
  507. # test against a scalar
  508. res = df.query("index < 5", engine=engine, parser=parser)
  509. expec = df[df.index < 5]
  510. tm.assert_frame_equal(res, expec)
  511. def test_nested_scope(self, engine, parser):
  512. skip_if_no_pandas_parser(parser)
  513. df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
  514. df2 = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
  515. expected = df[(df > 0) & (df2 > 0)]
  516. result = df.query("(@df > 0) & (@df2 > 0)", engine=engine, parser=parser)
  517. tm.assert_frame_equal(result, expected)
  518. result = pd.eval("df[df > 0 and df2 > 0]", engine=engine, parser=parser)
  519. tm.assert_frame_equal(result, expected)
  520. result = pd.eval(
  521. "df[df > 0 and df2 > 0 and df[df > 0] > 0]", engine=engine, parser=parser
  522. )
  523. expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
  524. tm.assert_frame_equal(result, expected)
  525. result = pd.eval("df[(df>0) & (df2>0)]", engine=engine, parser=parser)
  526. expected = df.query("(@df>0) & (@df2>0)", engine=engine, parser=parser)
  527. tm.assert_frame_equal(result, expected)
  528. def test_nested_raises_on_local_self_reference(self, engine, parser):
  529. df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
  530. # can't reference ourself b/c we're a local so @ is necessary
  531. with pytest.raises(UndefinedVariableError, match="name 'df' is not defined"):
  532. df.query("df > 0", engine=engine, parser=parser)
  533. def test_local_syntax(self, engine, parser):
  534. skip_if_no_pandas_parser(parser)
  535. df = DataFrame(
  536. np.random.default_rng(2).standard_normal((100, 10)),
  537. columns=list("abcdefghij"),
  538. )
  539. b = 1
  540. expect = df[df.a < b]
  541. result = df.query("a < @b", engine=engine, parser=parser)
  542. tm.assert_frame_equal(result, expect)
  543. expect = df[df.a < df.b]
  544. result = df.query("a < b", engine=engine, parser=parser)
  545. tm.assert_frame_equal(result, expect)
  546. def test_chained_cmp_and_in(self, engine, parser):
  547. skip_if_no_pandas_parser(parser)
  548. cols = list("abc")
  549. df = DataFrame(
  550. np.random.default_rng(2).standard_normal((100, len(cols))), columns=cols
  551. )
  552. res = df.query(
  553. "a < b < c and a not in b not in c", engine=engine, parser=parser
  554. )
  555. ind = (df.a < df.b) & (df.b < df.c) & ~df.b.isin(df.a) & ~df.c.isin(df.b)
  556. expec = df[ind]
  557. tm.assert_frame_equal(res, expec)
  558. def test_local_variable_with_in(self, engine, parser):
  559. skip_if_no_pandas_parser(parser)
  560. a = Series(np.random.default_rng(2).integers(3, size=15), name="a")
  561. b = Series(np.random.default_rng(2).integers(10, size=15), name="b")
  562. df = DataFrame({"a": a, "b": b})
  563. expected = df.loc[(df.b - 1).isin(a)]
  564. result = df.query("b - 1 in a", engine=engine, parser=parser)
  565. tm.assert_frame_equal(expected, result)
  566. b = Series(np.random.default_rng(2).integers(10, size=15), name="b")
  567. expected = df.loc[(b - 1).isin(a)]
  568. result = df.query("@b - 1 in a", engine=engine, parser=parser)
  569. tm.assert_frame_equal(expected, result)
  570. def test_at_inside_string(self, engine, parser):
  571. skip_if_no_pandas_parser(parser)
  572. c = 1 # noqa: F841
  573. df = DataFrame({"a": ["a", "a", "b", "b", "@c", "@c"]})
  574. result = df.query('a == "@c"', engine=engine, parser=parser)
  575. expected = df[df.a == "@c"]
  576. tm.assert_frame_equal(result, expected)
  577. def test_query_undefined_local(self):
  578. engine, parser = self.engine, self.parser
  579. skip_if_no_pandas_parser(parser)
  580. df = DataFrame(np.random.default_rng(2).random((10, 2)), columns=list("ab"))
  581. with pytest.raises(
  582. UndefinedVariableError, match="local variable 'c' is not defined"
  583. ):
  584. df.query("a == @c", engine=engine, parser=parser)
  585. def test_index_resolvers_come_after_columns_with_the_same_name(
  586. self, engine, parser
  587. ):
  588. n = 1 # noqa: F841
  589. a = np.r_[20:101:20]
  590. df = DataFrame(
  591. {"index": a, "b": np.random.default_rng(2).standard_normal(a.size)}
  592. )
  593. df.index.name = "index"
  594. result = df.query("index > 5", engine=engine, parser=parser)
  595. expected = df[df["index"] > 5]
  596. tm.assert_frame_equal(result, expected)
  597. df = DataFrame(
  598. {"index": a, "b": np.random.default_rng(2).standard_normal(a.size)}
  599. )
  600. result = df.query("ilevel_0 > 5", engine=engine, parser=parser)
  601. expected = df.loc[df.index[df.index > 5]]
  602. tm.assert_frame_equal(result, expected)
  603. df = DataFrame({"a": a, "b": np.random.default_rng(2).standard_normal(a.size)})
  604. df.index.name = "a"
  605. result = df.query("a > 5", engine=engine, parser=parser)
  606. expected = df[df.a > 5]
  607. tm.assert_frame_equal(result, expected)
  608. result = df.query("index > 5", engine=engine, parser=parser)
  609. expected = df.loc[df.index[df.index > 5]]
  610. tm.assert_frame_equal(result, expected)
  611. @pytest.mark.parametrize("op, f", [["==", operator.eq], ["!=", operator.ne]])
  612. def test_inf(self, op, f, engine, parser):
  613. n = 10
  614. df = DataFrame(
  615. {
  616. "a": np.random.default_rng(2).random(n),
  617. "b": np.random.default_rng(2).random(n),
  618. }
  619. )
  620. df.loc[::2, 0] = np.inf
  621. q = f"a {op} inf"
  622. expected = df[f(df.a, np.inf)]
  623. result = df.query(q, engine=engine, parser=parser)
  624. tm.assert_frame_equal(result, expected)
  625. def test_check_tz_aware_index_query(self, tz_aware_fixture):
  626. # https://github.com/pandas-dev/pandas/issues/29463
  627. tz = tz_aware_fixture
  628. df_index = date_range(
  629. start="2019-01-01", freq="1d", periods=10, tz=tz, name="time"
  630. )
  631. expected = DataFrame(index=df_index)
  632. df = DataFrame(index=df_index)
  633. result = df.query('"2018-01-03 00:00:00+00" < time')
  634. tm.assert_frame_equal(result, expected)
  635. expected = DataFrame(df_index)
  636. result = df.reset_index().query('"2018-01-03 00:00:00+00" < time')
  637. tm.assert_frame_equal(result, expected)
  638. def test_method_calls_in_query(self, engine, parser):
  639. # https://github.com/pandas-dev/pandas/issues/22435
  640. n = 10
  641. df = DataFrame(
  642. {
  643. "a": 2 * np.random.default_rng(2).random(n),
  644. "b": np.random.default_rng(2).random(n),
  645. }
  646. )
  647. expected = df[df["a"].astype("int") == 0]
  648. result = df.query("a.astype('int') == 0", engine=engine, parser=parser)
  649. tm.assert_frame_equal(result, expected)
  650. df = DataFrame(
  651. {
  652. "a": np.where(
  653. np.random.default_rng(2).random(n) < 0.5,
  654. np.nan,
  655. np.random.default_rng(2).standard_normal(n),
  656. ),
  657. "b": np.random.default_rng(2).standard_normal(n),
  658. }
  659. )
  660. expected = df[df["a"].notnull()]
  661. result = df.query("a.notnull()", engine=engine, parser=parser)
  662. tm.assert_frame_equal(result, expected)
  663. @td.skip_if_no("numexpr")
  664. class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas):
  665. @pytest.fixture
  666. def engine(self):
  667. return "numexpr"
  668. @pytest.fixture
  669. def parser(self):
  670. return "python"
  671. def test_date_query_no_attribute_access(self, engine, parser):
  672. df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
  673. df["dates1"] = date_range("1/1/2012", periods=5)
  674. df["dates2"] = date_range("1/1/2013", periods=5)
  675. df["dates3"] = date_range("1/1/2014", periods=5)
  676. res = df.query(
  677. "(dates1 < 20130101) & (20130101 < dates3)", engine=engine, parser=parser
  678. )
  679. expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)]
  680. tm.assert_frame_equal(res, expec)
  681. def test_date_query_with_NaT(self, engine, parser):
  682. n = 10
  683. df = DataFrame(np.random.default_rng(2).standard_normal((n, 3)))
  684. df["dates1"] = date_range("1/1/2012", periods=n)
  685. df["dates2"] = date_range("1/1/2013", periods=n)
  686. df["dates3"] = date_range("1/1/2014", periods=n)
  687. df.loc[np.random.default_rng(2).random(n) > 0.5, "dates1"] = pd.NaT
  688. df.loc[np.random.default_rng(2).random(n) > 0.5, "dates3"] = pd.NaT
  689. res = df.query(
  690. "(dates1 < 20130101) & (20130101 < dates3)", engine=engine, parser=parser
  691. )
  692. expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)]
  693. tm.assert_frame_equal(res, expec)
  694. def test_date_index_query(self, engine, parser):
  695. n = 10
  696. df = DataFrame(np.random.default_rng(2).standard_normal((n, 3)))
  697. df["dates1"] = date_range("1/1/2012", periods=n)
  698. df["dates3"] = date_range("1/1/2014", periods=n)
  699. return_value = df.set_index("dates1", inplace=True, drop=True)
  700. assert return_value is None
  701. res = df.query(
  702. "(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser
  703. )
  704. expec = df[(df.index < "20130101") & ("20130101" < df.dates3)]
  705. tm.assert_frame_equal(res, expec)
  706. def test_date_index_query_with_NaT(self, engine, parser):
  707. n = 10
  708. # Cast to object to avoid implicit cast when setting entry to pd.NaT below
  709. df = DataFrame(np.random.default_rng(2).standard_normal((n, 3))).astype(
  710. {0: object}
  711. )
  712. df["dates1"] = date_range("1/1/2012", periods=n)
  713. df["dates3"] = date_range("1/1/2014", periods=n)
  714. df.iloc[0, 0] = pd.NaT
  715. return_value = df.set_index("dates1", inplace=True, drop=True)
  716. assert return_value is None
  717. res = df.query(
  718. "(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser
  719. )
  720. expec = df[(df.index < "20130101") & ("20130101" < df.dates3)]
  721. tm.assert_frame_equal(res, expec)
  722. def test_date_index_query_with_NaT_duplicates(self, engine, parser):
  723. n = 10
  724. df = DataFrame(np.random.default_rng(2).standard_normal((n, 3)))
  725. df["dates1"] = date_range("1/1/2012", periods=n)
  726. df["dates3"] = date_range("1/1/2014", periods=n)
  727. df.loc[np.random.default_rng(2).random(n) > 0.5, "dates1"] = pd.NaT
  728. return_value = df.set_index("dates1", inplace=True, drop=True)
  729. assert return_value is None
  730. msg = r"'BoolOp' nodes are not implemented"
  731. with pytest.raises(NotImplementedError, match=msg):
  732. df.query("index < 20130101 < dates3", engine=engine, parser=parser)
  733. def test_nested_scope(self, engine, parser):
  734. # smoke test
  735. x = 1 # noqa: F841
  736. result = pd.eval("x + 1", engine=engine, parser=parser)
  737. assert result == 2
  738. df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
  739. df2 = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
  740. # don't have the pandas parser
  741. msg = r"The '@' prefix is only supported by the pandas parser"
  742. with pytest.raises(SyntaxError, match=msg):
  743. df.query("(@df>0) & (@df2>0)", engine=engine, parser=parser)
  744. with pytest.raises(UndefinedVariableError, match="name 'df' is not defined"):
  745. df.query("(df>0) & (df2>0)", engine=engine, parser=parser)
  746. expected = df[(df > 0) & (df2 > 0)]
  747. result = pd.eval("df[(df > 0) & (df2 > 0)]", engine=engine, parser=parser)
  748. tm.assert_frame_equal(expected, result)
  749. expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
  750. result = pd.eval(
  751. "df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]", engine=engine, parser=parser
  752. )
  753. tm.assert_frame_equal(expected, result)
  754. def test_query_numexpr_with_min_and_max_columns(self):
  755. df = DataFrame({"min": [1, 2, 3], "max": [4, 5, 6]})
  756. regex_to_match = (
  757. r"Variables in expression \"\(min\) == \(1\)\" "
  758. r"overlap with builtins: \('min'\)"
  759. )
  760. with pytest.raises(NumExprClobberingError, match=regex_to_match):
  761. df.query("min == 1")
  762. regex_to_match = (
  763. r"Variables in expression \"\(max\) == \(1\)\" "
  764. r"overlap with builtins: \('max'\)"
  765. )
  766. with pytest.raises(NumExprClobberingError, match=regex_to_match):
  767. df.query("max == 1")
  768. class TestDataFrameQueryPythonPandas(TestDataFrameQueryNumExprPandas):
  769. @pytest.fixture
  770. def engine(self):
  771. return "python"
  772. @pytest.fixture
  773. def parser(self):
  774. return "pandas"
  775. def test_query_builtin(self, engine, parser):
  776. n = m = 10
  777. df = DataFrame(
  778. np.random.default_rng(2).integers(m, size=(n, 3)), columns=list("abc")
  779. )
  780. df.index.name = "sin"
  781. expected = df[df.index > 5]
  782. result = df.query("sin > 5", engine=engine, parser=parser)
  783. tm.assert_frame_equal(expected, result)
  784. class TestDataFrameQueryPythonPython(TestDataFrameQueryNumExprPython):
  785. @pytest.fixture
  786. def engine(self):
  787. return "python"
  788. @pytest.fixture
  789. def parser(self):
  790. return "python"
  791. def test_query_builtin(self, engine, parser):
  792. n = m = 10
  793. df = DataFrame(
  794. np.random.default_rng(2).integers(m, size=(n, 3)), columns=list("abc")
  795. )
  796. df.index.name = "sin"
  797. expected = df[df.index > 5]
  798. result = df.query("sin > 5", engine=engine, parser=parser)
  799. tm.assert_frame_equal(expected, result)
  800. class TestDataFrameQueryStrings:
  801. def test_str_query_method(self, parser, engine):
  802. df = DataFrame(np.random.default_rng(2).standard_normal((10, 1)), columns=["b"])
  803. df["strings"] = Series(list("aabbccddee"))
  804. expect = df[df.strings == "a"]
  805. if parser != "pandas":
  806. col = "strings"
  807. lst = '"a"'
  808. lhs = [col] * 2 + [lst] * 2
  809. rhs = lhs[::-1]
  810. eq, ne = "==", "!="
  811. ops = 2 * ([eq] + [ne])
  812. msg = r"'(Not)?In' nodes are not implemented"
  813. for lhs, op, rhs in zip(lhs, ops, rhs):
  814. ex = f"{lhs} {op} {rhs}"
  815. with pytest.raises(NotImplementedError, match=msg):
  816. df.query(
  817. ex,
  818. engine=engine,
  819. parser=parser,
  820. local_dict={"strings": df.strings},
  821. )
  822. else:
  823. res = df.query('"a" == strings', engine=engine, parser=parser)
  824. tm.assert_frame_equal(res, expect)
  825. res = df.query('strings == "a"', engine=engine, parser=parser)
  826. tm.assert_frame_equal(res, expect)
  827. tm.assert_frame_equal(res, df[df.strings.isin(["a"])])
  828. expect = df[df.strings != "a"]
  829. res = df.query('strings != "a"', engine=engine, parser=parser)
  830. tm.assert_frame_equal(res, expect)
  831. res = df.query('"a" != strings', engine=engine, parser=parser)
  832. tm.assert_frame_equal(res, expect)
  833. tm.assert_frame_equal(res, df[~df.strings.isin(["a"])])
  834. def test_str_list_query_method(self, parser, engine):
  835. df = DataFrame(np.random.default_rng(2).standard_normal((10, 1)), columns=["b"])
  836. df["strings"] = Series(list("aabbccddee"))
  837. expect = df[df.strings.isin(["a", "b"])]
  838. if parser != "pandas":
  839. col = "strings"
  840. lst = '["a", "b"]'
  841. lhs = [col] * 2 + [lst] * 2
  842. rhs = lhs[::-1]
  843. eq, ne = "==", "!="
  844. ops = 2 * ([eq] + [ne])
  845. msg = r"'(Not)?In' nodes are not implemented"
  846. for lhs, op, rhs in zip(lhs, ops, rhs):
  847. ex = f"{lhs} {op} {rhs}"
  848. with pytest.raises(NotImplementedError, match=msg):
  849. df.query(ex, engine=engine, parser=parser)
  850. else:
  851. res = df.query('strings == ["a", "b"]', engine=engine, parser=parser)
  852. tm.assert_frame_equal(res, expect)
  853. res = df.query('["a", "b"] == strings', engine=engine, parser=parser)
  854. tm.assert_frame_equal(res, expect)
  855. expect = df[~df.strings.isin(["a", "b"])]
  856. res = df.query('strings != ["a", "b"]', engine=engine, parser=parser)
  857. tm.assert_frame_equal(res, expect)
  858. res = df.query('["a", "b"] != strings', engine=engine, parser=parser)
  859. tm.assert_frame_equal(res, expect)
  860. def test_query_with_string_columns(self, parser, engine):
  861. df = DataFrame(
  862. {
  863. "a": list("aaaabbbbcccc"),
  864. "b": list("aabbccddeeff"),
  865. "c": np.random.default_rng(2).integers(5, size=12),
  866. "d": np.random.default_rng(2).integers(9, size=12),
  867. }
  868. )
  869. if parser == "pandas":
  870. res = df.query("a in b", parser=parser, engine=engine)
  871. expec = df[df.a.isin(df.b)]
  872. tm.assert_frame_equal(res, expec)
  873. res = df.query("a in b and c < d", parser=parser, engine=engine)
  874. expec = df[df.a.isin(df.b) & (df.c < df.d)]
  875. tm.assert_frame_equal(res, expec)
  876. else:
  877. msg = r"'(Not)?In' nodes are not implemented"
  878. with pytest.raises(NotImplementedError, match=msg):
  879. df.query("a in b", parser=parser, engine=engine)
  880. msg = r"'BoolOp' nodes are not implemented"
  881. with pytest.raises(NotImplementedError, match=msg):
  882. df.query("a in b and c < d", parser=parser, engine=engine)
  883. def test_object_array_eq_ne(self, parser, engine):
  884. df = DataFrame(
  885. {
  886. "a": list("aaaabbbbcccc"),
  887. "b": list("aabbccddeeff"),
  888. "c": np.random.default_rng(2).integers(5, size=12),
  889. "d": np.random.default_rng(2).integers(9, size=12),
  890. }
  891. )
  892. res = df.query("a == b", parser=parser, engine=engine)
  893. exp = df[df.a == df.b]
  894. tm.assert_frame_equal(res, exp)
  895. res = df.query("a != b", parser=parser, engine=engine)
  896. exp = df[df.a != df.b]
  897. tm.assert_frame_equal(res, exp)
  898. def test_query_with_nested_strings(self, parser, engine):
  899. skip_if_no_pandas_parser(parser)
  900. events = [
  901. f"page {n} {act}" for n in range(1, 4) for act in ["load", "exit"]
  902. ] * 2
  903. stamps1 = date_range("2014-01-01 0:00:01", freq="30s", periods=6)
  904. stamps2 = date_range("2014-02-01 1:00:01", freq="30s", periods=6)
  905. df = DataFrame(
  906. {
  907. "id": np.arange(1, 7).repeat(2),
  908. "event": events,
  909. "timestamp": stamps1.append(stamps2),
  910. }
  911. )
  912. expected = df[df.event == '"page 1 load"']
  913. res = df.query("""'"page 1 load"' in event""", parser=parser, engine=engine)
  914. tm.assert_frame_equal(expected, res)
  915. def test_query_with_nested_special_character(self, parser, engine):
  916. skip_if_no_pandas_parser(parser)
  917. df = DataFrame({"a": ["a", "b", "test & test"], "b": [1, 2, 3]})
  918. res = df.query('a == "test & test"', parser=parser, engine=engine)
  919. expec = df[df.a == "test & test"]
  920. tm.assert_frame_equal(res, expec)
  921. @pytest.mark.parametrize(
  922. "op, func",
  923. [
  924. ["<", operator.lt],
  925. [">", operator.gt],
  926. ["<=", operator.le],
  927. [">=", operator.ge],
  928. ],
  929. )
  930. def test_query_lex_compare_strings(self, parser, engine, op, func):
  931. a = Series(np.random.default_rng(2).choice(list("abcde"), 20))
  932. b = Series(np.arange(a.size))
  933. df = DataFrame({"X": a, "Y": b})
  934. res = df.query(f'X {op} "d"', engine=engine, parser=parser)
  935. expected = df[func(df.X, "d")]
  936. tm.assert_frame_equal(res, expected)
  937. def test_query_single_element_booleans(self, parser, engine):
  938. columns = "bid", "bidsize", "ask", "asksize"
  939. data = np.random.default_rng(2).integers(2, size=(1, len(columns))).astype(bool)
  940. df = DataFrame(data, columns=columns)
  941. res = df.query("bid & ask", engine=engine, parser=parser)
  942. expected = df[df.bid & df.ask]
  943. tm.assert_frame_equal(res, expected)
  944. def test_query_string_scalar_variable(self, parser, engine):
  945. skip_if_no_pandas_parser(parser)
  946. df = DataFrame(
  947. {
  948. "Symbol": ["BUD US", "BUD US", "IBM US", "IBM US"],
  949. "Price": [109.70, 109.72, 183.30, 183.35],
  950. }
  951. )
  952. e = df[df.Symbol == "BUD US"]
  953. symb = "BUD US" # noqa: F841
  954. r = df.query("Symbol == @symb", parser=parser, engine=engine)
  955. tm.assert_frame_equal(e, r)
  956. @pytest.mark.parametrize(
  957. "in_list",
  958. [
  959. [None, "asdf", "ghjk"],
  960. ["asdf", None, "ghjk"],
  961. ["asdf", "ghjk", None],
  962. [None, None, "asdf"],
  963. ["asdf", None, None],
  964. [None, None, None],
  965. ],
  966. )
  967. def test_query_string_null_elements(self, in_list):
  968. # GITHUB ISSUE #31516
  969. parser = "pandas"
  970. engine = "python"
  971. expected = {i: value for i, value in enumerate(in_list) if value == "asdf"}
  972. df_expected = DataFrame({"a": expected}, dtype="string")
  973. df_expected.index = df_expected.index.astype("int64")
  974. df = DataFrame({"a": in_list}, dtype="string")
  975. res1 = df.query("a == 'asdf'", parser=parser, engine=engine)
  976. res2 = df[df["a"] == "asdf"]
  977. res3 = df.query("a <= 'asdf'", parser=parser, engine=engine)
  978. tm.assert_frame_equal(res1, df_expected)
  979. tm.assert_frame_equal(res1, res2)
  980. tm.assert_frame_equal(res1, res3)
  981. tm.assert_frame_equal(res2, res3)
  982. class TestDataFrameEvalWithFrame:
  983. @pytest.fixture
  984. def frame(self):
  985. return DataFrame(
  986. np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc")
  987. )
  988. def test_simple_expr(self, frame, parser, engine):
  989. res = frame.eval("a + b", engine=engine, parser=parser)
  990. expect = frame.a + frame.b
  991. tm.assert_series_equal(res, expect)
  992. def test_bool_arith_expr(self, frame, parser, engine):
  993. res = frame.eval("a[a < 1] + b", engine=engine, parser=parser)
  994. expect = frame.a[frame.a < 1] + frame.b
  995. tm.assert_series_equal(res, expect)
  996. @pytest.mark.parametrize("op", ["+", "-", "*", "/"])
  997. def test_invalid_type_for_operator_raises(self, parser, engine, op):
  998. df = DataFrame({"a": [1, 2], "b": ["c", "d"]})
  999. msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'|Cannot"
  1000. with pytest.raises(TypeError, match=msg):
  1001. df.eval(f"a {op} b", engine=engine, parser=parser)
  1002. class TestDataFrameQueryBacktickQuoting:
  1003. @pytest.fixture
  1004. def df(self):
  1005. """
  1006. Yields a dataframe with strings that may or may not need escaping
  1007. by backticks. The last two columns cannot be escaped by backticks
  1008. and should raise a ValueError.
  1009. """
  1010. yield DataFrame(
  1011. {
  1012. "A": [1, 2, 3],
  1013. "B B": [3, 2, 1],
  1014. "C C": [4, 5, 6],
  1015. "C C": [7, 4, 3],
  1016. "C_C": [8, 9, 10],
  1017. "D_D D": [11, 1, 101],
  1018. "E.E": [6, 3, 5],
  1019. "F-F": [8, 1, 10],
  1020. "1e1": [2, 4, 8],
  1021. "def": [10, 11, 2],
  1022. "A (x)": [4, 1, 3],
  1023. "B(x)": [1, 1, 5],
  1024. "B (x)": [2, 7, 4],
  1025. " &^ :!€$?(} > <++*'' ": [2, 5, 6],
  1026. "": [10, 11, 1],
  1027. " A": [4, 7, 9],
  1028. " ": [1, 2, 1],
  1029. "it's": [6, 3, 1],
  1030. "that's": [9, 1, 8],
  1031. "☺": [8, 7, 6],
  1032. "foo#bar": [2, 4, 5],
  1033. 1: [5, 7, 9],
  1034. }
  1035. )
  1036. def test_single_backtick_variable_query(self, df):
  1037. res = df.query("1 < `B B`")
  1038. expect = df[1 < df["B B"]]
  1039. tm.assert_frame_equal(res, expect)
  1040. def test_two_backtick_variables_query(self, df):
  1041. res = df.query("1 < `B B` and 4 < `C C`")
  1042. expect = df[(1 < df["B B"]) & (4 < df["C C"])]
  1043. tm.assert_frame_equal(res, expect)
  1044. def test_single_backtick_variable_expr(self, df):
  1045. res = df.eval("A + `B B`")
  1046. expect = df["A"] + df["B B"]
  1047. tm.assert_series_equal(res, expect)
  1048. def test_two_backtick_variables_expr(self, df):
  1049. res = df.eval("`B B` + `C C`")
  1050. expect = df["B B"] + df["C C"]
  1051. tm.assert_series_equal(res, expect)
  1052. def test_already_underscore_variable(self, df):
  1053. res = df.eval("`C_C` + A")
  1054. expect = df["C_C"] + df["A"]
  1055. tm.assert_series_equal(res, expect)
  1056. def test_same_name_but_underscores(self, df):
  1057. res = df.eval("C_C + `C C`")
  1058. expect = df["C_C"] + df["C C"]
  1059. tm.assert_series_equal(res, expect)
  1060. def test_mixed_underscores_and_spaces(self, df):
  1061. res = df.eval("A + `D_D D`")
  1062. expect = df["A"] + df["D_D D"]
  1063. tm.assert_series_equal(res, expect)
  1064. def test_backtick_quote_name_with_no_spaces(self, df):
  1065. res = df.eval("A + `C_C`")
  1066. expect = df["A"] + df["C_C"]
  1067. tm.assert_series_equal(res, expect)
  1068. def test_special_characters(self, df):
  1069. res = df.eval("`E.E` + `F-F` - A")
  1070. expect = df["E.E"] + df["F-F"] - df["A"]
  1071. tm.assert_series_equal(res, expect)
  1072. def test_start_with_digit(self, df):
  1073. res = df.eval("A + `1e1`")
  1074. expect = df["A"] + df["1e1"]
  1075. tm.assert_series_equal(res, expect)
  1076. def test_keyword(self, df):
  1077. res = df.eval("A + `def`")
  1078. expect = df["A"] + df["def"]
  1079. tm.assert_series_equal(res, expect)
  1080. def test_unneeded_quoting(self, df):
  1081. res = df.query("`A` > 2")
  1082. expect = df[df["A"] > 2]
  1083. tm.assert_frame_equal(res, expect)
  1084. def test_parenthesis(self, df):
  1085. res = df.query("`A (x)` > 2")
  1086. expect = df[df["A (x)"] > 2]
  1087. tm.assert_frame_equal(res, expect)
  1088. def test_empty_string(self, df):
  1089. res = df.query("`` > 5")
  1090. expect = df[df[""] > 5]
  1091. tm.assert_frame_equal(res, expect)
  1092. def test_multiple_spaces(self, df):
  1093. res = df.query("`C C` > 5")
  1094. expect = df[df["C C"] > 5]
  1095. tm.assert_frame_equal(res, expect)
  1096. def test_start_with_spaces(self, df):
  1097. res = df.eval("` A` + ` `")
  1098. expect = df[" A"] + df[" "]
  1099. tm.assert_series_equal(res, expect)
  1100. def test_lots_of_operators_string(self, df):
  1101. res = df.query("` &^ :!€$?(} > <++*'' ` > 4")
  1102. expect = df[df[" &^ :!€$?(} > <++*'' "] > 4]
  1103. tm.assert_frame_equal(res, expect)
  1104. def test_missing_attribute(self, df):
  1105. message = "module 'pandas' has no attribute 'thing'"
  1106. with pytest.raises(AttributeError, match=message):
  1107. df.eval("@pd.thing")
  1108. def test_failing_quote(self, df):
  1109. msg = r"(Could not convert ).*( to a valid Python identifier.)"
  1110. with pytest.raises(SyntaxError, match=msg):
  1111. df.query("`it's` > `that's`")
  1112. def test_failing_character_outside_range(self, df):
  1113. msg = r"(Could not convert ).*( to a valid Python identifier.)"
  1114. with pytest.raises(SyntaxError, match=msg):
  1115. df.query("`☺` > 4")
  1116. def test_failing_hashtag(self, df):
  1117. msg = "Failed to parse backticks"
  1118. with pytest.raises(SyntaxError, match=msg):
  1119. df.query("`foo#bar` > 4")
  1120. def test_call_non_named_expression(self, df):
  1121. """
  1122. Only attributes and variables ('named functions') can be called.
  1123. .__call__() is not an allowed attribute because that would allow
  1124. calling anything.
  1125. https://github.com/pandas-dev/pandas/pull/32460
  1126. """
  1127. def func(*_):
  1128. return 1
  1129. funcs = [func] # noqa: F841
  1130. df.eval("@func()")
  1131. with pytest.raises(TypeError, match="Only named functions are supported"):
  1132. df.eval("@funcs[0]()")
  1133. with pytest.raises(TypeError, match="Only named functions are supported"):
  1134. df.eval("@funcs[0].__call__()")
  1135. def test_ea_dtypes(self, any_numeric_ea_and_arrow_dtype):
  1136. # GH#29618
  1137. df = DataFrame(
  1138. [[1, 2], [3, 4]], columns=["a", "b"], dtype=any_numeric_ea_and_arrow_dtype
  1139. )
  1140. warning = RuntimeWarning if NUMEXPR_INSTALLED else None
  1141. with tm.assert_produces_warning(warning):
  1142. result = df.eval("c = b - a")
  1143. expected = DataFrame(
  1144. [[1, 2, 1], [3, 4, 1]],
  1145. columns=["a", "b", "c"],
  1146. dtype=any_numeric_ea_and_arrow_dtype,
  1147. )
  1148. tm.assert_frame_equal(result, expected)
  1149. def test_ea_dtypes_and_scalar(self):
  1150. # GH#29618
  1151. df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"], dtype="Float64")
  1152. warning = RuntimeWarning if NUMEXPR_INSTALLED else None
  1153. with tm.assert_produces_warning(warning):
  1154. result = df.eval("c = b - 1")
  1155. expected = DataFrame(
  1156. [[1, 2, 1], [3, 4, 3]], columns=["a", "b", "c"], dtype="Float64"
  1157. )
  1158. tm.assert_frame_equal(result, expected)
  1159. def test_ea_dtypes_and_scalar_operation(self, any_numeric_ea_and_arrow_dtype):
  1160. # GH#29618
  1161. df = DataFrame(
  1162. [[1, 2], [3, 4]], columns=["a", "b"], dtype=any_numeric_ea_and_arrow_dtype
  1163. )
  1164. result = df.eval("c = 2 - 1")
  1165. expected = DataFrame(
  1166. {
  1167. "a": Series([1, 3], dtype=any_numeric_ea_and_arrow_dtype),
  1168. "b": Series([2, 4], dtype=any_numeric_ea_and_arrow_dtype),
  1169. "c": Series([1, 1], dtype=result["c"].dtype),
  1170. }
  1171. )
  1172. tm.assert_frame_equal(result, expected)
  1173. @pytest.mark.parametrize("dtype", ["int64", "Int64", "int64[pyarrow]"])
  1174. def test_query_ea_dtypes(self, dtype):
  1175. if dtype == "int64[pyarrow]":
  1176. pytest.importorskip("pyarrow")
  1177. # GH#50261
  1178. df = DataFrame({"a": Series([1, 2], dtype=dtype)})
  1179. ref = {2} # noqa: F841
  1180. warning = RuntimeWarning if dtype == "Int64" and NUMEXPR_INSTALLED else None
  1181. with tm.assert_produces_warning(warning):
  1182. result = df.query("a in @ref")
  1183. expected = DataFrame({"a": Series([2], dtype=dtype, index=[1])})
  1184. tm.assert_frame_equal(result, expected)
  1185. @pytest.mark.parametrize("engine", ["python", "numexpr"])
  1186. @pytest.mark.parametrize("dtype", ["int64", "Int64", "int64[pyarrow]"])
  1187. def test_query_ea_equality_comparison(self, dtype, engine):
  1188. # GH#50261
  1189. warning = RuntimeWarning if engine == "numexpr" else None
  1190. if engine == "numexpr" and not NUMEXPR_INSTALLED:
  1191. pytest.skip("numexpr not installed")
  1192. if dtype == "int64[pyarrow]":
  1193. pytest.importorskip("pyarrow")
  1194. df = DataFrame(
  1195. {"A": Series([1, 1, 2], dtype="Int64"), "B": Series([1, 2, 2], dtype=dtype)}
  1196. )
  1197. with tm.assert_produces_warning(warning):
  1198. result = df.query("A == B", engine=engine)
  1199. expected = DataFrame(
  1200. {
  1201. "A": Series([1, 2], dtype="Int64", index=[0, 2]),
  1202. "B": Series([1, 2], dtype=dtype, index=[0, 2]),
  1203. }
  1204. )
  1205. tm.assert_frame_equal(result, expected)
  1206. def test_all_nat_in_object(self):
  1207. # GH#57068
  1208. now = pd.Timestamp.now("UTC") # noqa: F841
  1209. df = DataFrame({"a": pd.to_datetime([None, None], utc=True)}, dtype=object)
  1210. result = df.query("a > @now")
  1211. expected = DataFrame({"a": []}, dtype=object)
  1212. tm.assert_frame_equal(result, expected)