# test_expressions.py (15 KB)
  1. import operator
  2. import re
  3. import numpy as np
  4. import pytest
  5. from pandas.compat._optional import import_optional_dependency
  6. from pandas import option_context
  7. import pandas._testing as tm
  8. from pandas.core.api import DataFrame
  9. from pandas.core.computation import expressions as expr
  10. from pandas.util.version import Version
  @pytest.fixture
  def _frame():
      """
      Return a 10001 x 4 float64 DataFrame with columns A-D, filled with
      standard-normal draws from a generator seeded with 2.
      """
      generator = np.random.default_rng(2)
      values = generator.standard_normal((10001, 4))
      return DataFrame(values, columns=list("ABCD"), dtype="float64")
  @pytest.fixture
  def _frame2():
      """
      Return a 100 x 4 float64 DataFrame with columns A-D, filled with
      standard-normal draws from a generator seeded with 2.
      """
      generator = np.random.default_rng(2)
      values = generator.standard_normal((100, 4))
      return DataFrame(values, columns=list("ABCD"), dtype="float64")
  @pytest.fixture
  def _mixed(_frame):
      """
      Return a DataFrame built from the columns of ``_frame``: "A" is taken
      as-is, "B" is cast to float32, "C" to int64 and "D" to int32.
      """
      casts = {"B": "float32", "C": "int64", "D": "int32"}
      columns = {"A": _frame["A"]}
      for label, dtype in casts.items():
          columns[label] = _frame[label].astype(dtype)
      return DataFrame(columns)
  @pytest.fixture
  def _mixed2(_frame2):
      """
      Return a DataFrame built from the columns of ``_frame2``: "A" is taken
      as-is, "B" is cast to float32, "C" to int64 and "D" to int32.
      """
      casts = {"B": "float32", "C": "int64", "D": "int32"}
      columns = {"A": _frame2["A"]}
      for label, dtype in casts.items():
          columns[label] = _frame2[label].astype(dtype)
      return DataFrame(columns)
  @pytest.fixture
  def _integer():
      """
      Return a 10001 x 4 int64 DataFrame with columns A-D, filled with
      integers drawn from [1, 100) by a generator seeded with 2.
      """
      generator = np.random.default_rng(2)
      values = generator.integers(1, 100, size=(10001, 4))
      return DataFrame(values, columns=list("ABCD"), dtype="int64")
  @pytest.fixture
  def _integer_integers(_integer):
      """
      Return ``_integer`` multiplied element-wise by a same-shaped array of
      random 0/1 values, so that some entries become zero.
      """
      # integers to get a case with zeros
      zero_one = np.random.default_rng(2).integers(0, 2, size=_integer.shape)
      return _integer * zero_one
  @pytest.fixture
  def _integer2():
      """
      Return a 101 x 4 int64 DataFrame with columns A-D, filled with
      integers drawn from [1, 100) by a generator seeded with 2.
      """
      generator = np.random.default_rng(2)
      values = generator.integers(1, 100, size=(101, 4))
      return DataFrame(values, columns=list("ABCD"), dtype="int64")
  @pytest.fixture
  def _array(_frame):
      """Return column "A" of ``_frame`` as a NumPy array."""
      column_a = _frame["A"]
      return column_a.to_numpy()
  @pytest.fixture
  def _array2(_frame2):
      """Return column "A" of ``_frame2`` as a NumPy array."""
      column_a = _frame2["A"]
      return column_a.to_numpy()
  @pytest.fixture
  def _array_mixed(_mixed):
      """Return column "D" of ``_mixed`` as a NumPy array."""
      column_d = _mixed["D"]
      return column_d.to_numpy()
  @pytest.fixture
  def _array_mixed2(_mixed2):
      """Return column "D" of ``_mixed2`` as a NumPy array."""
      column_d = _mixed2["D"]
      return column_d.to_numpy()
  75. @pytest.mark.skipif(not expr.USE_NUMEXPR, reason="not using numexpr")
  76. class TestExpressions:
  @staticmethod
  def call_op(df, other, flex: bool, opname: str):
      """
      Apply the operation ``opname`` to ``(df, other)`` twice: once with the
      "compute.use_numexpr" option forced to False and once under whatever
      the option currently is.

      Parameters
      ----------
      df, other
          Operands, passed unchanged to the operation.
      flex : bool
          If True, call the method ``getattr(df, opname)(other)``; otherwise
          call the function ``getattr(operator, opname)(df, other)``.
      opname : str
          Name of the method / ``operator`` function.

      Returns
      -------
      tuple
          ``(result, expected)``; ``expected`` is the value computed with
          numexpr disabled.
      """
      if not flex:
          op = getattr(operator, opname)
      else:

          def op(x, y):
              return getattr(x, opname)(y)

          op.__name__ = opname

      with option_context("compute.use_numexpr", False):
          expected = op(df, other)

      # Return value deliberately ignored.  Presumably this discards whatever
      # was recorded while computing ``expected`` -- confirm against
      # expr.get_test_result.
      expr.get_test_result()

      return op(df, other), expected
  @pytest.mark.parametrize(
      "fixture",
      [
          "_integer",
          "_integer2",
          "_integer_integers",
          "_frame",
          "_frame2",
          "_mixed",
          "_mixed2",
      ],
  )
  @pytest.mark.parametrize("flex", [True, False])
  @pytest.mark.parametrize(
      "arith", ["add", "sub", "mul", "mod", "truediv", "floordiv"]
  )
  def test_run_arithmetic(self, request, fixture, flex, arith, monkeypatch):
      """
      For the whole frame and then for each column, check that ``call_op``
      returns equal values for its two evaluations; for "truediv" also check
      that the reference value has a floating dtype.
      """
      df = request.getfixturevalue(fixture)
      expect_float = arith == "truediv"

      with monkeypatch.context() as patcher:
          patcher.setattr(expr, "_MIN_ELEMENTS", 0)

          # whole frame against itself
          result, expected = self.call_op(df, df, flex, arith)
          if expect_float:
              assert all(dtype.kind == "f" for dtype in expected.dtypes.values)
          tm.assert_equal(expected, result)

          # each column against itself
          for pos in range(df.shape[1]):
              column = df.iloc[:, pos]
              result, expected = self.call_op(column, column, flex, arith)
              if expect_float:
                  assert expected.dtype.kind == "f"
              tm.assert_equal(expected, result)
  @pytest.mark.parametrize(
      "fixture",
      [
          "_integer",
          "_integer2",
          "_integer_integers",
          "_frame",
          "_frame2",
          "_mixed",
          "_mixed2",
      ],
  )
  @pytest.mark.parametrize("flex", [True, False])
  def test_run_binary(self, request, fixture, flex, comparison_op, monkeypatch):
      """
      tests solely that the result is the same whether or not numexpr is
      enabled.  Need to test whether the function does the correct thing
      elsewhere.

      The frame-level comparison must also report (via
      ``expr.get_test_result()``) that numexpr was used.  Test mode is
      switched off again when the test finishes, pass or fail, so it does
      not leak into tests that run afterwards.
      """
      df = request.getfixturevalue(fixture)
      arith = comparison_op.__name__
      with option_context("compute.use_numexpr", False):
          other = df + 1

      with monkeypatch.context() as m:
          m.setattr(expr, "_MIN_ELEMENTS", 0)
          expr.set_test_mode(True)
          try:
              result, expected = self.call_op(df, other, flex, arith)

              used_numexpr = expr.get_test_result()
              assert used_numexpr, "Did not use numexpr as expected."
              tm.assert_equal(expected, result)

              for i in range(len(df.columns)):
                  binary_comp = other.iloc[:, i] + 1
                  result, expected = self.call_op(
                      df.iloc[:, i], binary_comp, flex, "add"
                  )
                  # previously computed and thrown away; verify it like the
                  # frame-level result above
                  tm.assert_equal(expected, result)
          finally:
              # monkeypatch only restores _MIN_ELEMENTS; test mode was set
              # through the module's own setter and has to be undone here
              expr.set_test_mode(False)
  def test_invalid(self):
      """
      Check which (op_str, operands) combinations ``expr._can_use_numexpr``
      accepts for ``operator.add`` with the "evaluate" dtype check.
      """
      large = np.random.default_rng(2).standard_normal(1_000_001)
      small = np.random.default_rng(2).standard_normal(100)

      def usable(op_str, left, right):
          return expr._can_use_numexpr(operator.add, op_str, left, right, "evaluate")

      # no op
      assert not usable(None, large, large)

      # min elements
      assert not usable("+", small, small)

      # ok, we only check on first part of expression
      assert usable("+", large, small)
  @pytest.mark.filterwarnings("ignore:invalid value encountered in:RuntimeWarning")
  @pytest.mark.parametrize(
      "opname,op_str",
      [("add", "+"), ("sub", "-"), ("mul", "*"), ("truediv", "/"), ("pow", "**")],
  )
  @pytest.mark.parametrize(
      "left_fix,right_fix", [("_array", "_array2"), ("_array_mixed", "_array_mixed2")]
  )
  def test_binary_ops(self, request, opname, op_str, left_fix, right_fix):
      """
      Check that ``expr.evaluate`` gives the same array with
      ``use_numexpr=True`` and ``use_numexpr=False``, and that
      ``expr._can_use_numexpr`` rejects the ``right`` array.

      The check runs three times: with "compute.use_numexpr" disabled, after
      ``expr.set_numexpr_threads(1)``, and after ``expr.set_numexpr_threads()``
      with no argument.
      """
      left = request.getfixturevalue(left_fix)
      right = request.getfixturevalue(right_fix)

      def testit(left, right, opname, op_str):
          if opname == "pow":
              left = np.abs(left)

          op = getattr(operator, opname)

          # array has 0s
          result = expr.evaluate(op, left, left, use_numexpr=True)
          expected = expr.evaluate(op, left, left, use_numexpr=False)
          tm.assert_numpy_array_equal(result, expected)

          result = expr._can_use_numexpr(op, op_str, right, right, "evaluate")
          assert not result

      with option_context("compute.use_numexpr", False):
          testit(left, right, opname, op_str)

      try:
          expr.set_numexpr_threads(1)
          testit(left, right, opname, op_str)
      finally:
          # Undo the single-thread setting even if the check above fails, so
          # it does not carry over into later tests.  Presumably the
          # argument-less call restores the default -- confirm in
          # expr.set_numexpr_threads.
          expr.set_numexpr_threads()
      testit(left, right, opname, op_str)
  @pytest.mark.parametrize(
      "left_fix,right_fix", [("_array", "_array2"), ("_array_mixed", "_array_mixed2")]
  )
  def test_comparison_ops(self, request, comparison_op, left_fix, right_fix):
      """
      Check that ``expr.evaluate`` gives the same array for ``comparison_op``
      with ``use_numexpr=True`` and ``use_numexpr=False``, and that
      ``expr._can_use_numexpr`` rejects the ``right`` array.

      The check runs three times: with "compute.use_numexpr" disabled, after
      ``expr.set_numexpr_threads(1)``, and after ``expr.set_numexpr_threads()``
      with no argument.
      """
      left = request.getfixturevalue(left_fix)
      right = request.getfixturevalue(right_fix)

      def testit():
          f12 = left + 1
          f22 = right + 1

          op = comparison_op

          result = expr.evaluate(op, left, f12, use_numexpr=True)
          expected = expr.evaluate(op, left, f12, use_numexpr=False)
          tm.assert_numpy_array_equal(result, expected)

          # NOTE(review): the operator callable is passed as the second
          # argument too, rather than an operator string such as "<" --
          # looks unintended but is kept as-is; confirm.
          result = expr._can_use_numexpr(op, op, right, f22, "evaluate")
          assert not result

      with option_context("compute.use_numexpr", False):
          testit()

      try:
          expr.set_numexpr_threads(1)
          testit()
      finally:
          # Undo the single-thread setting even if the check above fails, so
          # it does not carry over into later tests.  Presumably the
          # argument-less call restores the default -- confirm in
          # expr.set_numexpr_threads.
          expr.set_numexpr_threads()
      testit()
  @pytest.mark.parametrize("cond", [True, False])
  @pytest.mark.parametrize("fixture", ["_frame", "_frame2", "_mixed", "_mixed2"])
  def test_where(self, request, cond, fixture):
      """
      Check that ``expr.where`` matches ``np.where`` for a mask that is
      uniformly ``cond``, choosing between ``df.values`` and
      ``df.values + 1``.

      The check runs three times: with "compute.use_numexpr" disabled, after
      ``expr.set_numexpr_threads(1)``, and after ``expr.set_numexpr_threads()``
      with no argument.
      """
      df = request.getfixturevalue(fixture)

      def testit():
          # boolean mask of the frame's shape, every entry equal to ``cond``
          c = np.full(df.shape, cond, dtype=np.bool_)
          result = expr.where(c, df.values, df.values + 1)
          expected = np.where(c, df.values, df.values + 1)
          tm.assert_numpy_array_equal(result, expected)

      with option_context("compute.use_numexpr", False):
          testit()

      try:
          expr.set_numexpr_threads(1)
          testit()
      finally:
          # Undo the single-thread setting even if the check above fails, so
          # it does not carry over into later tests.  Presumably the
          # argument-less call restores the default -- confirm in
          # expr.set_numexpr_threads.
          expr.set_numexpr_threads()
      testit()
  @pytest.mark.parametrize(
      "op_str,opname", [("/", "truediv"), ("//", "floordiv"), ("**", "pow")]
  )
  def test_bool_ops_raise_on_arithmetic(self, op_str, opname):
      """
      Check that ``operator.<opname>`` raises NotImplementedError, with the
      "not implemented for bool dtypes" message, for every frame / series /
      scalar pairing of boolean operands listed below.
      """
      frame = DataFrame(
          {
              "a": np.random.default_rng(2).random(10) > 0.5,
              "b": np.random.default_rng(2).random(10) > 0.5,
          }
      )
      func = getattr(operator, opname)
      pattern = re.escape(f"operator '{opname}' not implemented for bool dtypes")

      operand_pairs = [
          (frame, frame),
          (frame.a, frame.b),
          (frame.a, True),
          (False, frame.a),
          (False, frame),
          (frame, True),
      ]
      for lhs, rhs in operand_pairs:
          with pytest.raises(NotImplementedError, match=pattern):
              func(lhs, rhs)
  @pytest.mark.parametrize(
      "op_str,opname", [("+", "add"), ("*", "mul"), ("-", "sub")]
  )
  def test_bool_ops_warn_on_arithmetic(self, op_str, opname, monkeypatch):
      """
      For boolean operands, check that the arithmetic operator gives the
      same result as its logical substitute (``+`` -> ``|``, ``*`` -> ``&``)
      and emits a UserWarning when numexpr is importable and older than
      2.13.1 (otherwise no warning).  The "-" case returns early without
      checking anything.
      """
      n = 10
      frame = DataFrame(
          {
              "a": np.random.default_rng(2).random(n) > 0.5,
              "b": np.random.default_rng(2).random(n) > 0.5,
          }
      )

      subs = {"+": "|", "*": "&", "-": "^"}
      sub_funcs = {"|": "or_", "&": "and_", "^": "xor"}

      arith_func = getattr(operator, opname)
      logical_func = getattr(operator, sub_funcs[subs[op_str]])

      if op_str == "-":
          # raises TypeError
          return

      msg = "operator is not supported by numexpr"
      ne = import_optional_dependency("numexpr", errors="ignore")
      if ne and op_str in {"+", "*"} and Version(ne.__version__) < Version("2.13.1"):
          warning = UserWarning
      else:
          warning = None

      # (left operand, right operand, comparison helper for the result type)
      cases = [
          (frame, frame, tm.assert_frame_equal),
          (frame.a, frame.b, tm.assert_series_equal),
          (frame.a, True, tm.assert_series_equal),
          (False, frame.a, tm.assert_series_equal),
          (False, frame, tm.assert_frame_equal),
          (frame, True, tm.assert_frame_equal),
      ]

      with monkeypatch.context() as m:
          m.setattr(expr, "_MIN_ELEMENTS", 5)
          with option_context("compute.use_numexpr", True):
              for lhs, rhs, check_equal in cases:
                  with tm.assert_produces_warning(warning, match=msg):
                      r = arith_func(lhs, rhs)
                      e = logical_func(lhs, rhs)
                      check_equal(r, e)
  @pytest.mark.parametrize(
      "test_input,expected",
      [
          (
              DataFrame(
                  [[0, 1, 2, "aa"], [0, 1, 2, "aa"]], columns=["a", "b", "c", "dtype"]
              ),
              DataFrame([[False, False], [False, False]], columns=["a", "dtype"]),
          ),
          (
              DataFrame(
                  [[0, 3, 2, "aa"], [0, 4, 2, "aa"], [0, 1, 1, "bb"]],
                  columns=["a", "b", "c", "dtype"],
              ),
              DataFrame(
                  [[False, False], [False, False], [False, False]],
                  columns=["a", "dtype"],
              ),
          ),
      ],
  )
  def test_bool_ops_column_name_dtype(self, test_input, expected):
      """
      Check that ``.ne`` works on a selection that includes a column
      literally named "dtype", comparing the selection with an identical
      selection.
      """
      # GH 22383 - .ne fails if columns containing column name 'dtype'
      selected = ["a", "dtype"]
      left = test_input.loc[:, selected]
      right = test_input.loc[:, selected]
      result = left.ne(right)
      tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize(
      "arith", ("add", "sub", "mul", "mod", "truediv", "floordiv")
  )
  @pytest.mark.parametrize("axis", (0, 1))
  def test_frame_series_axis(self, axis, arith, _frame, monkeypatch):
      """
      Check that ``DataFrame.<arith>(Series, axis=axis)`` gives the same
      frame with "compute.use_numexpr" disabled and under the ambient
      setting.  The Series is the first row for ``axis=1`` and the first
      column otherwise.
      """
      # GH#26736 Dataframe.floordiv(Series, axis=1) fails
      frame = _frame
      other = frame.iloc[0, :] if axis == 1 else frame.iloc[:, 0]

      with monkeypatch.context() as patcher:
          patcher.setattr(expr, "_MIN_ELEMENTS", 0)

          bound_op = getattr(frame, arith)

          with option_context("compute.use_numexpr", False):
              expected = bound_op(other, axis=axis)

          result = bound_op(other, axis=axis)
          tm.assert_frame_equal(expected, result)
  @pytest.mark.parametrize(
      "op",
      [
          "__mod__",
          "__rmod__",
          "__floordiv__",
          "__rfloordiv__",
      ],
  )
  @pytest.mark.parametrize("scalar", [-5, 5])
  def test_python_semantics_with_numexpr_installed(
      self, op, box_with_array, scalar, monkeypatch
  ):
      """
      Check that ``op`` applied to a boxed ``np.arange(-50, 50)`` and a
      scalar matches the result computed with "compute.use_numexpr"
      disabled and, element by element, the result of the same dunder on a
      Python ``int``.  Elements for which Python raises ZeroDivisionError
      are skipped.
      """
      # https://github.com/pandas-dev/pandas/issues/36047
      is_frame = box_with_array is DataFrame

      with monkeypatch.context() as patcher:
          patcher.setattr(expr, "_MIN_ELEMENTS", 0)

          values = np.arange(-50, 50)
          boxed = box_with_array(values)
          bound_op = getattr(boxed, op)
          result = bound_op(scalar)

          # compare result with numpy
          with option_context("compute.use_numexpr", False):
              expected = bound_op(scalar)

          tm.assert_equal(result, expected)

          # compare result element-wise with Python
          for pos, value in enumerate(values):
              observed = result.iloc[pos, 0] if is_frame else result[pos]
              try:
                  python_value = getattr(int(value), op)(scalar)
              except ZeroDivisionError:
                  continue
              assert observed == python_value