test_frame_apply.py 53 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739
  1. from datetime import datetime
  2. import warnings
  3. import numpy as np
  4. import pytest
  5. from pandas.compat import is_platform_arm
  6. from pandas.core.dtypes.dtypes import CategoricalDtype
  7. import pandas as pd
  8. from pandas import (
  9. DataFrame,
  10. MultiIndex,
  11. Series,
  12. Timestamp,
  13. date_range,
  14. )
  15. import pandas._testing as tm
  16. from pandas.tests.frame.common import zip_frames
  17. from pandas.util.version import Version
  18. @pytest.fixture
  19. def int_frame_const_col():
  20. """
  21. Fixture for DataFrame of ints which are constant per column
  22. Columns are ['A', 'B', 'C'], with values (per column): [1, 2, 3]
  23. """
  24. df = DataFrame(
  25. np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1,
  26. columns=["A", "B", "C"],
  27. )
  28. return df
  29. @pytest.fixture(params=["python", pytest.param("numba", marks=pytest.mark.single_cpu)])
  30. def engine(request):
  31. if request.param == "numba":
  32. pytest.importorskip("numba")
  33. return request.param
  34. def test_apply(float_frame, engine, request):
  35. if engine == "numba":
  36. mark = pytest.mark.xfail(reason="numba engine not supporting numpy ufunc yet")
  37. request.node.add_marker(mark)
  38. with np.errstate(all="ignore"):
  39. # ufunc
  40. result = np.sqrt(float_frame["A"])
  41. expected = float_frame.apply(np.sqrt, engine=engine)["A"]
  42. tm.assert_series_equal(result, expected)
  43. # aggregator
  44. result = float_frame.apply(np.mean, engine=engine)["A"]
  45. expected = np.mean(float_frame["A"])
  46. assert result == expected
  47. d = float_frame.index[0]
  48. result = float_frame.apply(np.mean, axis=1, engine=engine)
  49. expected = np.mean(float_frame.xs(d))
  50. assert result[d] == expected
  51. assert result.index is float_frame.index
  52. @pytest.mark.parametrize("axis", [0, 1])
  53. @pytest.mark.parametrize("raw", [True, False])
  54. def test_apply_args(float_frame, axis, raw, engine, request):
  55. if engine == "numba":
  56. numba = pytest.importorskip("numba")
  57. if Version(numba.__version__) == Version("0.61") and is_platform_arm():
  58. pytest.skip(f"Segfaults on ARM platforms with numba {numba.__version__}")
  59. mark = pytest.mark.xfail(reason="numba engine doesn't support args")
  60. request.node.add_marker(mark)
  61. result = float_frame.apply(
  62. lambda x, y: x + y, axis, args=(1,), raw=raw, engine=engine
  63. )
  64. expected = float_frame + 1
  65. tm.assert_frame_equal(result, expected)
  66. def test_apply_categorical_func():
  67. # GH 9573
  68. df = DataFrame({"c0": ["A", "A", "B", "B"], "c1": ["C", "C", "D", "D"]})
  69. result = df.apply(lambda ts: ts.astype("category"))
  70. assert result.shape == (4, 2)
  71. assert isinstance(result["c0"].dtype, CategoricalDtype)
  72. assert isinstance(result["c1"].dtype, CategoricalDtype)
  73. def test_apply_axis1_with_ea():
  74. # GH#36785
  75. expected = DataFrame({"A": [Timestamp("2013-01-01", tz="UTC")]})
  76. result = expected.apply(lambda x: x, axis=1)
  77. tm.assert_frame_equal(result, expected)
  78. @pytest.mark.parametrize(
  79. "data, dtype",
  80. [(1, None), (1, CategoricalDtype([1])), (Timestamp("2013-01-01", tz="UTC"), None)],
  81. )
  82. def test_agg_axis1_duplicate_index(data, dtype):
  83. # GH 42380
  84. expected = DataFrame([[data], [data]], index=["a", "a"], dtype=dtype)
  85. result = expected.agg(lambda x: x, axis=1)
  86. tm.assert_frame_equal(result, expected)
  87. def test_apply_mixed_datetimelike():
  88. # mixed datetimelike
  89. # GH 7778
  90. expected = DataFrame(
  91. {
  92. "A": date_range("20130101", periods=3),
  93. "B": pd.to_timedelta(np.arange(3), unit="s"),
  94. }
  95. )
  96. result = expected.apply(lambda x: x, axis=1)
  97. tm.assert_frame_equal(result, expected)
  98. @pytest.mark.parametrize("func", [np.sqrt, np.mean])
  99. def test_apply_empty(func, engine):
  100. # empty
  101. empty_frame = DataFrame()
  102. result = empty_frame.apply(func, engine=engine)
  103. assert result.empty
  104. def test_apply_float_frame(float_frame, engine):
  105. no_rows = float_frame[:0]
  106. result = no_rows.apply(lambda x: x.mean(), engine=engine)
  107. expected = Series(np.nan, index=float_frame.columns)
  108. tm.assert_series_equal(result, expected)
  109. no_cols = float_frame.loc[:, []]
  110. result = no_cols.apply(lambda x: x.mean(), axis=1, engine=engine)
  111. expected = Series(np.nan, index=float_frame.index)
  112. tm.assert_series_equal(result, expected)
  113. def test_apply_empty_except_index(engine):
  114. # GH 2476
  115. expected = DataFrame(index=["a"])
  116. result = expected.apply(lambda x: x["a"], axis=1, engine=engine)
  117. tm.assert_frame_equal(result, expected)
  118. def test_apply_with_reduce_empty():
  119. # reduce with an empty DataFrame
  120. empty_frame = DataFrame()
  121. x = []
  122. result = empty_frame.apply(x.append, axis=1, result_type="expand")
  123. tm.assert_frame_equal(result, empty_frame)
  124. result = empty_frame.apply(x.append, axis=1, result_type="reduce")
  125. expected = Series([], dtype=np.float64)
  126. tm.assert_series_equal(result, expected)
  127. empty_with_cols = DataFrame(columns=["a", "b", "c"])
  128. result = empty_with_cols.apply(x.append, axis=1, result_type="expand")
  129. tm.assert_frame_equal(result, empty_with_cols)
  130. result = empty_with_cols.apply(x.append, axis=1, result_type="reduce")
  131. expected = Series([], dtype=np.float64)
  132. tm.assert_series_equal(result, expected)
  133. # Ensure that x.append hasn't been called
  134. assert x == []
  135. @pytest.mark.parametrize("func", ["sum", "prod", "any", "all"])
  136. def test_apply_funcs_over_empty(func):
  137. # GH 28213
  138. df = DataFrame(columns=["a", "b", "c"])
  139. result = df.apply(getattr(np, func))
  140. expected = getattr(df, func)()
  141. if func in ("sum", "prod"):
  142. expected = expected.astype(float)
  143. tm.assert_series_equal(result, expected)
  144. def test_nunique_empty():
  145. # GH 28213
  146. df = DataFrame(columns=["a", "b", "c"])
  147. result = df.nunique()
  148. expected = Series(0, index=df.columns)
  149. tm.assert_series_equal(result, expected)
  150. result = df.T.nunique()
  151. expected = Series([], dtype=np.float64)
  152. tm.assert_series_equal(result, expected)
  153. def test_apply_standard_nonunique():
  154. df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"])
  155. result = df.apply(lambda s: s[0], axis=1)
  156. expected = Series([1, 4, 7], ["a", "a", "c"])
  157. tm.assert_series_equal(result, expected)
  158. result = df.T.apply(lambda s: s[0], axis=0)
  159. tm.assert_series_equal(result, expected)
  160. def test_apply_broadcast_scalars(float_frame):
  161. # scalars
  162. result = float_frame.apply(np.mean, result_type="broadcast")
  163. expected = DataFrame([float_frame.mean()], index=float_frame.index)
  164. tm.assert_frame_equal(result, expected)
  165. def test_apply_broadcast_scalars_axis1(float_frame):
  166. result = float_frame.apply(np.mean, axis=1, result_type="broadcast")
  167. m = float_frame.mean(axis=1)
  168. expected = DataFrame({c: m for c in float_frame.columns})
  169. tm.assert_frame_equal(result, expected)
  170. def test_apply_broadcast_lists_columns(float_frame):
  171. # lists
  172. result = float_frame.apply(
  173. lambda x: list(range(len(float_frame.columns))),
  174. axis=1,
  175. result_type="broadcast",
  176. )
  177. m = list(range(len(float_frame.columns)))
  178. expected = DataFrame(
  179. [m] * len(float_frame.index),
  180. dtype="float64",
  181. index=float_frame.index,
  182. columns=float_frame.columns,
  183. )
  184. tm.assert_frame_equal(result, expected)
  185. def test_apply_broadcast_lists_index(float_frame):
  186. result = float_frame.apply(
  187. lambda x: list(range(len(float_frame.index))), result_type="broadcast"
  188. )
  189. m = list(range(len(float_frame.index)))
  190. expected = DataFrame(
  191. {c: m for c in float_frame.columns},
  192. dtype="float64",
  193. index=float_frame.index,
  194. )
  195. tm.assert_frame_equal(result, expected)
  196. def test_apply_broadcast_list_lambda_func(int_frame_const_col):
  197. # preserve columns
  198. df = int_frame_const_col
  199. result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast")
  200. tm.assert_frame_equal(result, df)
  201. def test_apply_broadcast_series_lambda_func(int_frame_const_col):
  202. df = int_frame_const_col
  203. result = df.apply(
  204. lambda x: Series([1, 2, 3], index=list("abc")),
  205. axis=1,
  206. result_type="broadcast",
  207. )
  208. expected = df.copy()
  209. tm.assert_frame_equal(result, expected)
  210. @pytest.mark.parametrize("axis", [0, 1])
  211. def test_apply_raw_float_frame(float_frame, axis, engine):
  212. if engine == "numba":
  213. pytest.skip("numba can't handle when UDF returns None.")
  214. def _assert_raw(x):
  215. assert isinstance(x, np.ndarray)
  216. assert x.ndim == 1
  217. float_frame.apply(_assert_raw, axis=axis, engine=engine, raw=True)
  218. @pytest.mark.parametrize("axis", [0, 1])
  219. def test_apply_raw_float_frame_lambda(float_frame, axis, engine):
  220. result = float_frame.apply(np.mean, axis=axis, engine=engine, raw=True)
  221. expected = float_frame.apply(lambda x: x.values.mean(), axis=axis)
  222. tm.assert_series_equal(result, expected)
  223. def test_apply_raw_float_frame_no_reduction(float_frame, engine):
  224. # no reduction
  225. result = float_frame.apply(lambda x: x * 2, engine=engine, raw=True)
  226. expected = float_frame * 2
  227. tm.assert_frame_equal(result, expected)
  228. @pytest.mark.parametrize("axis", [0, 1])
  229. def test_apply_raw_mixed_type_frame(axis, engine):
  230. if engine == "numba":
  231. pytest.skip("isinstance check doesn't work with numba")
  232. def _assert_raw(x):
  233. assert isinstance(x, np.ndarray)
  234. assert x.ndim == 1
  235. # Mixed dtype (GH-32423)
  236. df = DataFrame(
  237. {
  238. "a": 1.0,
  239. "b": 2,
  240. "c": "foo",
  241. "float32": np.array([1.0] * 10, dtype="float32"),
  242. "int32": np.array([1] * 10, dtype="int32"),
  243. },
  244. index=np.arange(10),
  245. )
  246. df.apply(_assert_raw, axis=axis, engine=engine, raw=True)
  247. def test_apply_axis1(float_frame):
  248. d = float_frame.index[0]
  249. result = float_frame.apply(np.mean, axis=1)[d]
  250. expected = np.mean(float_frame.xs(d))
  251. assert result == expected
  252. def test_apply_mixed_dtype_corner():
  253. df = DataFrame({"A": ["foo"], "B": [1.0]})
  254. result = df[:0].apply(np.mean, axis=1)
  255. # the result here is actually kind of ambiguous, should it be a Series
  256. # or a DataFrame?
  257. expected = Series(np.nan, index=pd.Index([], dtype="int64"))
  258. tm.assert_series_equal(result, expected)
  259. def test_apply_mixed_dtype_corner_indexing():
  260. df = DataFrame({"A": ["foo"], "B": [1.0]})
  261. result = df.apply(lambda x: x["A"], axis=1)
  262. expected = Series(["foo"], index=[0])
  263. tm.assert_series_equal(result, expected)
  264. result = df.apply(lambda x: x["B"], axis=1)
  265. expected = Series([1.0], index=[0])
  266. tm.assert_series_equal(result, expected)
  267. @pytest.mark.filterwarnings("ignore::RuntimeWarning")
  268. @pytest.mark.parametrize("ax", ["index", "columns"])
  269. @pytest.mark.parametrize(
  270. "func", [lambda x: x, lambda x: x.mean()], ids=["identity", "mean"]
  271. )
  272. @pytest.mark.parametrize("raw", [True, False])
  273. @pytest.mark.parametrize("axis", [0, 1])
  274. def test_apply_empty_infer_type(ax, func, raw, axis, engine, request):
  275. df = DataFrame(**{ax: ["a", "b", "c"]})
  276. with np.errstate(all="ignore"):
  277. test_res = func(np.array([], dtype="f8"))
  278. is_reduction = not isinstance(test_res, np.ndarray)
  279. result = df.apply(func, axis=axis, engine=engine, raw=raw)
  280. if is_reduction:
  281. agg_axis = df._get_agg_axis(axis)
  282. assert isinstance(result, Series)
  283. assert result.index is agg_axis
  284. else:
  285. assert isinstance(result, DataFrame)
  286. def test_apply_empty_infer_type_broadcast():
  287. no_cols = DataFrame(index=["a", "b", "c"])
  288. result = no_cols.apply(lambda x: x.mean(), result_type="broadcast")
  289. assert isinstance(result, DataFrame)
  290. def test_apply_with_args_kwds_add_some(float_frame):
  291. def add_some(x, howmuch=0):
  292. return x + howmuch
  293. result = float_frame.apply(add_some, howmuch=2)
  294. expected = float_frame.apply(lambda x: x + 2)
  295. tm.assert_frame_equal(result, expected)
  296. def test_apply_with_args_kwds_agg_and_add(float_frame):
  297. def agg_and_add(x, howmuch=0):
  298. return x.mean() + howmuch
  299. result = float_frame.apply(agg_and_add, howmuch=2)
  300. expected = float_frame.apply(lambda x: x.mean() + 2)
  301. tm.assert_series_equal(result, expected)
  302. def test_apply_with_args_kwds_subtract_and_divide(float_frame):
  303. def subtract_and_divide(x, sub, divide=1):
  304. return (x - sub) / divide
  305. result = float_frame.apply(subtract_and_divide, args=(2,), divide=2)
  306. expected = float_frame.apply(lambda x: (x - 2.0) / 2.0)
  307. tm.assert_frame_equal(result, expected)
  308. def test_apply_yield_list(float_frame):
  309. result = float_frame.apply(list)
  310. tm.assert_frame_equal(result, float_frame)
  311. def test_apply_reduce_Series(float_frame):
  312. float_frame.iloc[::2, float_frame.columns.get_loc("A")] = np.nan
  313. expected = float_frame.mean(1)
  314. result = float_frame.apply(np.mean, axis=1)
  315. tm.assert_series_equal(result, expected)
  316. def test_apply_reduce_to_dict():
  317. # GH 25196 37544
  318. data = DataFrame([[1, 2], [3, 4]], columns=["c0", "c1"], index=["i0", "i1"])
  319. result = data.apply(dict, axis=0)
  320. expected = Series([{"i0": 1, "i1": 3}, {"i0": 2, "i1": 4}], index=data.columns)
  321. tm.assert_series_equal(result, expected)
  322. result = data.apply(dict, axis=1)
  323. expected = Series([{"c0": 1, "c1": 2}, {"c0": 3, "c1": 4}], index=data.index)
  324. tm.assert_series_equal(result, expected)
  325. def test_apply_differently_indexed():
  326. df = DataFrame(np.random.default_rng(2).standard_normal((20, 10)))
  327. result = df.apply(Series.describe, axis=0)
  328. expected = DataFrame({i: v.describe() for i, v in df.items()}, columns=df.columns)
  329. tm.assert_frame_equal(result, expected)
  330. result = df.apply(Series.describe, axis=1)
  331. expected = DataFrame({i: v.describe() for i, v in df.T.items()}, columns=df.index).T
  332. tm.assert_frame_equal(result, expected)
  333. def test_apply_bug():
  334. # GH 6125
  335. positions = DataFrame(
  336. [
  337. [1, "ABC0", 50],
  338. [1, "YUM0", 20],
  339. [1, "DEF0", 20],
  340. [2, "ABC1", 50],
  341. [2, "YUM1", 20],
  342. [2, "DEF1", 20],
  343. ],
  344. columns=["a", "market", "position"],
  345. )
  346. def f(r):
  347. return r["market"]
  348. expected = positions.apply(f, axis=1)
  349. positions = DataFrame(
  350. [
  351. [datetime(2013, 1, 1), "ABC0", 50],
  352. [datetime(2013, 1, 2), "YUM0", 20],
  353. [datetime(2013, 1, 3), "DEF0", 20],
  354. [datetime(2013, 1, 4), "ABC1", 50],
  355. [datetime(2013, 1, 5), "YUM1", 20],
  356. [datetime(2013, 1, 6), "DEF1", 20],
  357. ],
  358. columns=["a", "market", "position"],
  359. )
  360. result = positions.apply(f, axis=1)
  361. tm.assert_series_equal(result, expected)
  362. def test_apply_convert_objects():
  363. expected = DataFrame(
  364. {
  365. "A": [
  366. "foo",
  367. "foo",
  368. "foo",
  369. "foo",
  370. "bar",
  371. "bar",
  372. "bar",
  373. "bar",
  374. "foo",
  375. "foo",
  376. "foo",
  377. ],
  378. "B": [
  379. "one",
  380. "one",
  381. "one",
  382. "two",
  383. "one",
  384. "one",
  385. "one",
  386. "two",
  387. "two",
  388. "two",
  389. "one",
  390. ],
  391. "C": [
  392. "dull",
  393. "dull",
  394. "shiny",
  395. "dull",
  396. "dull",
  397. "shiny",
  398. "shiny",
  399. "dull",
  400. "shiny",
  401. "shiny",
  402. "shiny",
  403. ],
  404. "D": np.random.default_rng(2).standard_normal(11),
  405. "E": np.random.default_rng(2).standard_normal(11),
  406. "F": np.random.default_rng(2).standard_normal(11),
  407. }
  408. )
  409. result = expected.apply(lambda x: x, axis=1)
  410. tm.assert_frame_equal(result, expected)
  411. def test_apply_attach_name(float_frame):
  412. result = float_frame.apply(lambda x: x.name)
  413. expected = Series(float_frame.columns, index=float_frame.columns)
  414. tm.assert_series_equal(result, expected)
  415. def test_apply_attach_name_axis1(float_frame):
  416. result = float_frame.apply(lambda x: x.name, axis=1)
  417. expected = Series(float_frame.index, index=float_frame.index)
  418. tm.assert_series_equal(result, expected)
  419. def test_apply_attach_name_non_reduction(float_frame):
  420. # non-reductions
  421. result = float_frame.apply(lambda x: np.repeat(x.name, len(x)))
  422. expected = DataFrame(
  423. np.tile(float_frame.columns, (len(float_frame.index), 1)),
  424. index=float_frame.index,
  425. columns=float_frame.columns,
  426. )
  427. tm.assert_frame_equal(result, expected)
  428. def test_apply_attach_name_non_reduction_axis1(float_frame):
  429. result = float_frame.apply(lambda x: np.repeat(x.name, len(x)), axis=1)
  430. expected = Series(
  431. np.repeat(t[0], len(float_frame.columns)) for t in float_frame.itertuples()
  432. )
  433. expected.index = float_frame.index
  434. tm.assert_series_equal(result, expected)
  435. def test_apply_multi_index():
  436. index = MultiIndex.from_arrays([["a", "a", "b"], ["c", "d", "d"]])
  437. s = DataFrame([[1, 2], [3, 4], [5, 6]], index=index, columns=["col1", "col2"])
  438. result = s.apply(lambda x: Series({"min": min(x), "max": max(x)}), 1)
  439. expected = DataFrame([[1, 2], [3, 4], [5, 6]], index=index, columns=["min", "max"])
  440. tm.assert_frame_equal(result, expected, check_like=True)
  441. @pytest.mark.parametrize(
  442. "df, dicts",
  443. [
  444. [
  445. DataFrame([["foo", "bar"], ["spam", "eggs"]]),
  446. Series([{0: "foo", 1: "spam"}, {0: "bar", 1: "eggs"}]),
  447. ],
  448. [DataFrame([[0, 1], [2, 3]]), Series([{0: 0, 1: 2}, {0: 1, 1: 3}])],
  449. ],
  450. )
  451. def test_apply_dict(df, dicts):
  452. # GH 8735
  453. fn = lambda x: x.to_dict()
  454. reduce_true = df.apply(fn, result_type="reduce")
  455. reduce_false = df.apply(fn, result_type="expand")
  456. reduce_none = df.apply(fn)
  457. tm.assert_series_equal(reduce_true, dicts)
  458. tm.assert_frame_equal(reduce_false, df)
  459. tm.assert_series_equal(reduce_none, dicts)
  460. def test_apply_non_numpy_dtype():
  461. # GH 12244
  462. df = DataFrame({"dt": date_range("2015-01-01", periods=3, tz="Europe/Brussels")})
  463. result = df.apply(lambda x: x)
  464. tm.assert_frame_equal(result, df)
  465. result = df.apply(lambda x: x + pd.Timedelta("1day"))
  466. expected = DataFrame(
  467. {"dt": date_range("2015-01-02", periods=3, tz="Europe/Brussels")}
  468. )
  469. tm.assert_frame_equal(result, expected)
  470. def test_apply_non_numpy_dtype_category():
  471. df = DataFrame({"dt": ["a", "b", "c", "a"]}, dtype="category")
  472. result = df.apply(lambda x: x)
  473. tm.assert_frame_equal(result, df)
  474. def test_apply_dup_names_multi_agg():
  475. # GH 21063
  476. df = DataFrame([[0, 1], [2, 3]], columns=["a", "a"])
  477. expected = DataFrame([[0, 1]], columns=["a", "a"], index=["min"])
  478. result = df.agg(["min"])
  479. tm.assert_frame_equal(result, expected)
  480. @pytest.mark.parametrize("op", ["apply", "agg"])
  481. def test_apply_nested_result_axis_1(op):
  482. # GH 13820
  483. def apply_list(row):
  484. return [2 * row["A"], 2 * row["C"], 2 * row["B"]]
  485. df = DataFrame(np.zeros((4, 4)), columns=list("ABCD"))
  486. result = getattr(df, op)(apply_list, axis=1)
  487. expected = Series(
  488. [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
  489. )
  490. tm.assert_series_equal(result, expected)
  491. def test_apply_noreduction_tzaware_object():
  492. # https://github.com/pandas-dev/pandas/issues/31505
  493. expected = DataFrame(
  494. {"foo": [Timestamp("2020", tz="UTC")]}, dtype="datetime64[ns, UTC]"
  495. )
  496. result = expected.apply(lambda x: x)
  497. tm.assert_frame_equal(result, expected)
  498. result = expected.apply(lambda x: x.copy())
  499. tm.assert_frame_equal(result, expected)
  500. def test_apply_function_runs_once():
  501. # https://github.com/pandas-dev/pandas/issues/30815
  502. df = DataFrame({"a": [1, 2, 3]})
  503. names = [] # Save row names function is applied to
  504. def reducing_function(row):
  505. names.append(row.name)
  506. def non_reducing_function(row):
  507. names.append(row.name)
  508. return row
  509. for func in [reducing_function, non_reducing_function]:
  510. del names[:]
  511. df.apply(func, axis=1)
  512. assert names == list(df.index)
  513. def test_apply_raw_function_runs_once(engine):
  514. # https://github.com/pandas-dev/pandas/issues/34506
  515. if engine == "numba":
  516. pytest.skip("appending to list outside of numba func is not supported")
  517. df = DataFrame({"a": [1, 2, 3]})
  518. values = [] # Save row values function is applied to
  519. def reducing_function(row):
  520. values.extend(row)
  521. def non_reducing_function(row):
  522. values.extend(row)
  523. return row
  524. for func in [reducing_function, non_reducing_function]:
  525. del values[:]
  526. df.apply(func, engine=engine, raw=True, axis=1)
  527. assert values == list(df.a.to_list())
  528. def test_apply_with_byte_string():
  529. # GH 34529
  530. df = DataFrame(np.array([b"abcd", b"efgh"]), columns=["col"])
  531. expected = DataFrame(np.array([b"abcd", b"efgh"]), columns=["col"], dtype=object)
  532. # After we make the apply we expect a dataframe just
  533. # like the original but with the object datatype
  534. result = df.apply(lambda x: x.astype("object"))
  535. tm.assert_frame_equal(result, expected)
  536. @pytest.mark.parametrize("val", ["asd", 12, None, np.nan])
  537. def test_apply_category_equalness(val):
  538. # Check if categorical comparisons on apply, GH 21239
  539. df_values = ["asd", None, 12, "asd", "cde", np.nan]
  540. df = DataFrame({"a": df_values}, dtype="category")
  541. result = df.a.apply(lambda x: x == val)
  542. expected = Series(
  543. [np.nan if pd.isnull(x) else x == val for x in df_values], name="a"
  544. )
  545. tm.assert_series_equal(result, expected)
# The user has supplied an opaque UDF that transforms its input,
# so the shape of the output has to be inferred from what it returns.
  549. def test_infer_row_shape():
  550. # GH 17437
  551. # if row shape is changing, infer it
  552. df = DataFrame(np.random.default_rng(2).random((10, 2)))
  553. result = df.apply(np.fft.fft, axis=0).shape
  554. assert result == (10, 2)
  555. result = df.apply(np.fft.rfft, axis=0).shape
  556. assert result == (6, 2)
  557. @pytest.mark.parametrize(
  558. "ops, by_row, expected",
  559. [
  560. ({"a": lambda x: x + 1}, "compat", DataFrame({"a": [2, 3]})),
  561. ({"a": lambda x: x + 1}, False, DataFrame({"a": [2, 3]})),
  562. ({"a": lambda x: x.sum()}, "compat", Series({"a": 3})),
  563. ({"a": lambda x: x.sum()}, False, Series({"a": 3})),
  564. (
  565. {"a": ["sum", np.sum, lambda x: x.sum()]},
  566. "compat",
  567. DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", "<lambda>"]),
  568. ),
  569. (
  570. {"a": ["sum", np.sum, lambda x: x.sum()]},
  571. False,
  572. DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", "<lambda>"]),
  573. ),
  574. ({"a": lambda x: 1}, "compat", DataFrame({"a": [1, 1]})),
  575. ({"a": lambda x: 1}, False, Series({"a": 1})),
  576. ],
  577. )
  578. def test_dictlike_lambda(ops, by_row, expected):
  579. # GH53601
  580. df = DataFrame({"a": [1, 2]})
  581. result = df.apply(ops, by_row=by_row)
  582. tm.assert_equal(result, expected)
  583. @pytest.mark.parametrize(
  584. "ops",
  585. [
  586. {"a": lambda x: x + 1},
  587. {"a": lambda x: x.sum()},
  588. {"a": ["sum", np.sum, lambda x: x.sum()]},
  589. {"a": lambda x: 1},
  590. ],
  591. )
  592. def test_dictlike_lambda_raises(ops):
  593. # GH53601
  594. df = DataFrame({"a": [1, 2]})
  595. with pytest.raises(ValueError, match="by_row=True not allowed"):
  596. df.apply(ops, by_row=True)
  597. def test_with_dictlike_columns():
  598. # GH 17602
  599. df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
  600. result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1)
  601. expected = Series([{"s": 3} for t in df.itertuples()])
  602. tm.assert_series_equal(result, expected)
  603. df["tm"] = [
  604. Timestamp("2017-05-01 00:00:00"),
  605. Timestamp("2017-05-02 00:00:00"),
  606. ]
  607. result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1)
  608. tm.assert_series_equal(result, expected)
  609. # compose a series
  610. result = (df["a"] + df["b"]).apply(lambda x: {"s": x})
  611. expected = Series([{"s": 3}, {"s": 3}])
  612. tm.assert_series_equal(result, expected)
  613. def test_with_dictlike_columns_with_datetime():
  614. # GH 18775
  615. df = DataFrame()
  616. df["author"] = ["X", "Y", "Z"]
  617. df["publisher"] = ["BBC", "NBC", "N24"]
  618. df["date"] = pd.to_datetime(
  619. ["17-10-2010 07:15:30", "13-05-2011 08:20:35", "15-01-2013 09:09:09"],
  620. dayfirst=True,
  621. )
  622. result = df.apply(lambda x: {}, axis=1)
  623. expected = Series([{}, {}, {}])
  624. tm.assert_series_equal(result, expected)
  625. def test_with_dictlike_columns_with_infer():
  626. # GH 17602
  627. df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
  628. result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1, result_type="expand")
  629. expected = DataFrame({"s": [3, 3]})
  630. tm.assert_frame_equal(result, expected)
  631. df["tm"] = [
  632. Timestamp("2017-05-01 00:00:00"),
  633. Timestamp("2017-05-02 00:00:00"),
  634. ]
  635. result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1, result_type="expand")
  636. tm.assert_frame_equal(result, expected)
  637. @pytest.mark.parametrize(
  638. "ops, by_row, expected",
  639. [
  640. ([lambda x: x + 1], "compat", DataFrame({("a", "<lambda>"): [2, 3]})),
  641. ([lambda x: x + 1], False, DataFrame({("a", "<lambda>"): [2, 3]})),
  642. ([lambda x: x.sum()], "compat", DataFrame({"a": [3]}, index=["<lambda>"])),
  643. ([lambda x: x.sum()], False, DataFrame({"a": [3]}, index=["<lambda>"])),
  644. (
  645. ["sum", np.sum, lambda x: x.sum()],
  646. "compat",
  647. DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", "<lambda>"]),
  648. ),
  649. (
  650. ["sum", np.sum, lambda x: x.sum()],
  651. False,
  652. DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", "<lambda>"]),
  653. ),
  654. (
  655. [lambda x: x + 1, lambda x: 3],
  656. "compat",
  657. DataFrame([[2, 3], [3, 3]], columns=[["a", "a"], ["<lambda>", "<lambda>"]]),
  658. ),
  659. (
  660. [lambda x: 2, lambda x: 3],
  661. False,
  662. DataFrame({"a": [2, 3]}, ["<lambda>", "<lambda>"]),
  663. ),
  664. ],
  665. )
  666. def test_listlike_lambda(ops, by_row, expected):
  667. # GH53601
  668. df = DataFrame({"a": [1, 2]})
  669. result = df.apply(ops, by_row=by_row)
  670. tm.assert_equal(result, expected)
  671. @pytest.mark.parametrize(
  672. "ops",
  673. [
  674. [lambda x: x + 1],
  675. [lambda x: x.sum()],
  676. ["sum", np.sum, lambda x: x.sum()],
  677. [lambda x: x + 1, lambda x: 3],
  678. ],
  679. )
  680. def test_listlike_lambda_raises(ops):
  681. # GH53601
  682. df = DataFrame({"a": [1, 2]})
  683. with pytest.raises(ValueError, match="by_row=True not allowed"):
  684. df.apply(ops, by_row=True)
  685. def test_with_listlike_columns():
  686. # GH 17348
  687. df = DataFrame(
  688. {
  689. "a": Series(np.random.default_rng(2).standard_normal(4)),
  690. "b": ["a", "list", "of", "words"],
  691. "ts": date_range("2016-10-01", periods=4, freq="h"),
  692. }
  693. )
  694. result = df[["a", "b"]].apply(tuple, axis=1)
  695. expected = Series([t[1:] for t in df[["a", "b"]].itertuples()])
  696. tm.assert_series_equal(result, expected)
  697. result = df[["a", "ts"]].apply(tuple, axis=1)
  698. expected = Series([t[1:] for t in df[["a", "ts"]].itertuples()])
  699. tm.assert_series_equal(result, expected)
  700. def test_with_listlike_columns_returning_list():
  701. # GH 18919
  702. df = DataFrame({"x": Series([["a", "b"], ["q"]]), "y": Series([["z"], ["q", "t"]])})
  703. df.index = MultiIndex.from_tuples([("i0", "j0"), ("i1", "j1")])
  704. result = df.apply(lambda row: [el for el in row["x"] if el in row["y"]], axis=1)
  705. expected = Series([[], ["q"]], index=df.index)
  706. tm.assert_series_equal(result, expected)
  707. def test_infer_output_shape_columns():
  708. # GH 18573
  709. df = DataFrame(
  710. {
  711. "number": [1.0, 2.0],
  712. "string": ["foo", "bar"],
  713. "datetime": [
  714. Timestamp("2017-11-29 03:30:00"),
  715. Timestamp("2017-11-29 03:45:00"),
  716. ],
  717. }
  718. )
  719. result = df.apply(lambda row: (row.number, row.string), axis=1)
  720. expected = Series([(t.number, t.string) for t in df.itertuples()])
  721. tm.assert_series_equal(result, expected)
  722. def test_infer_output_shape_listlike_columns():
  723. # GH 16353
  724. df = DataFrame(
  725. np.random.default_rng(2).standard_normal((6, 3)), columns=["A", "B", "C"]
  726. )
  727. result = df.apply(lambda x: [1, 2, 3], axis=1)
  728. expected = Series([[1, 2, 3] for t in df.itertuples()])
  729. tm.assert_series_equal(result, expected)
  730. result = df.apply(lambda x: [1, 2], axis=1)
  731. expected = Series([[1, 2] for t in df.itertuples()])
  732. tm.assert_series_equal(result, expected)
  733. @pytest.mark.parametrize("val", [1, 2])
  734. def test_infer_output_shape_listlike_columns_np_func(val):
  735. # GH 17970
  736. df = DataFrame({"a": [1, 2, 3]}, index=list("abc"))
  737. result = df.apply(lambda row: np.ones(val), axis=1)
  738. expected = Series([np.ones(val) for t in df.itertuples()], index=df.index)
  739. tm.assert_series_equal(result, expected)
  740. def test_infer_output_shape_listlike_columns_with_timestamp():
  741. # GH 17892
  742. df = DataFrame(
  743. {
  744. "a": [
  745. Timestamp("2010-02-01"),
  746. Timestamp("2010-02-04"),
  747. Timestamp("2010-02-05"),
  748. Timestamp("2010-02-06"),
  749. ],
  750. "b": [9, 5, 4, 3],
  751. "c": [5, 3, 4, 2],
  752. "d": [1, 2, 3, 4],
  753. }
  754. )
  755. def fun(x):
  756. return (1, 2)
  757. result = df.apply(fun, axis=1)
  758. expected = Series([(1, 2) for t in df.itertuples()])
  759. tm.assert_series_equal(result, expected)
  760. @pytest.mark.parametrize("lst", [[1, 2, 3], [1, 2]])
  761. def test_consistent_coerce_for_shapes(lst):
  762. # we want column names to NOT be propagated
  763. # just because the shape matches the input shape
  764. df = DataFrame(
  765. np.random.default_rng(2).standard_normal((4, 3)), columns=["A", "B", "C"]
  766. )
  767. result = df.apply(lambda x: lst, axis=1)
  768. expected = Series([lst for t in df.itertuples()])
  769. tm.assert_series_equal(result, expected)
  770. def test_consistent_names(int_frame_const_col):
  771. # if a Series is returned, we should use the resulting index names
  772. df = int_frame_const_col
  773. result = df.apply(
  774. lambda x: Series([1, 2, 3], index=["test", "other", "cols"]), axis=1
  775. )
  776. expected = int_frame_const_col.rename(
  777. columns={"A": "test", "B": "other", "C": "cols"}
  778. )
  779. tm.assert_frame_equal(result, expected)
  780. result = df.apply(lambda x: Series([1, 2], index=["test", "other"]), axis=1)
  781. expected = expected[["test", "other"]]
  782. tm.assert_frame_equal(result, expected)
  783. def test_result_type(int_frame_const_col):
  784. # result_type should be consistent no matter which
  785. # path we take in the code
  786. df = int_frame_const_col
  787. result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand")
  788. expected = df.copy()
  789. expected.columns = [0, 1, 2]
  790. tm.assert_frame_equal(result, expected)
  791. def test_result_type_shorter_list(int_frame_const_col):
  792. # result_type should be consistent no matter which
  793. # path we take in the code
  794. df = int_frame_const_col
  795. result = df.apply(lambda x: [1, 2], axis=1, result_type="expand")
  796. expected = df[["A", "B"]].copy()
  797. expected.columns = [0, 1]
  798. tm.assert_frame_equal(result, expected)
  799. def test_result_type_broadcast(int_frame_const_col, request, engine):
  800. # result_type should be consistent no matter which
  801. # path we take in the code
  802. if engine == "numba":
  803. mark = pytest.mark.xfail(reason="numba engine doesn't support list return")
  804. request.node.add_marker(mark)
  805. df = int_frame_const_col
  806. # broadcast result
  807. result = df.apply(
  808. lambda x: [1, 2, 3], axis=1, result_type="broadcast", engine=engine
  809. )
  810. expected = df.copy()
  811. tm.assert_frame_equal(result, expected)
  812. def test_result_type_broadcast_series_func(int_frame_const_col, engine, request):
  813. # result_type should be consistent no matter which
  814. # path we take in the code
  815. if engine == "numba":
  816. mark = pytest.mark.xfail(
  817. reason="numba Series constructor only support ndarrays not list data"
  818. )
  819. request.node.add_marker(mark)
  820. df = int_frame_const_col
  821. columns = ["other", "col", "names"]
  822. result = df.apply(
  823. lambda x: Series([1, 2, 3], index=columns),
  824. axis=1,
  825. result_type="broadcast",
  826. engine=engine,
  827. )
  828. expected = df.copy()
  829. tm.assert_frame_equal(result, expected)
  830. def test_result_type_series_result(int_frame_const_col, engine, request):
  831. # result_type should be consistent no matter which
  832. # path we take in the code
  833. if engine == "numba":
  834. mark = pytest.mark.xfail(
  835. reason="numba Series constructor only support ndarrays not list data"
  836. )
  837. request.node.add_marker(mark)
  838. df = int_frame_const_col
  839. # series result
  840. result = df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1, engine=engine)
  841. expected = df.copy()
  842. tm.assert_frame_equal(result, expected)
  843. def test_result_type_series_result_other_index(int_frame_const_col, engine, request):
  844. # result_type should be consistent no matter which
  845. # path we take in the code
  846. if engine == "numba":
  847. mark = pytest.mark.xfail(
  848. reason="no support in numba Series constructor for list of columns"
  849. )
  850. request.node.add_marker(mark)
  851. df = int_frame_const_col
  852. # series result with other index
  853. columns = ["other", "col", "names"]
  854. result = df.apply(lambda x: Series([1, 2, 3], index=columns), axis=1, engine=engine)
  855. expected = df.copy()
  856. expected.columns = columns
  857. tm.assert_frame_equal(result, expected)
  858. @pytest.mark.parametrize(
  859. "box",
  860. [lambda x: list(x), lambda x: tuple(x), lambda x: np.array(x, dtype="int64")],
  861. ids=["list", "tuple", "array"],
  862. )
  863. def test_consistency_for_boxed(box, int_frame_const_col):
  864. # passing an array or list should not affect the output shape
  865. df = int_frame_const_col
  866. result = df.apply(lambda x: box([1, 2]), axis=1)
  867. expected = Series([box([1, 2]) for t in df.itertuples()])
  868. tm.assert_series_equal(result, expected)
  869. result = df.apply(lambda x: box([1, 2]), axis=1, result_type="expand")
  870. expected = int_frame_const_col[["A", "B"]].rename(columns={"A": 0, "B": 1})
  871. tm.assert_frame_equal(result, expected)
  872. def test_agg_transform(axis, float_frame):
  873. other_axis = 1 if axis in {0, "index"} else 0
  874. with np.errstate(all="ignore"):
  875. f_abs = np.abs(float_frame)
  876. f_sqrt = np.sqrt(float_frame)
  877. # ufunc
  878. expected = f_sqrt.copy()
  879. result = float_frame.apply(np.sqrt, axis=axis)
  880. tm.assert_frame_equal(result, expected)
  881. # list-like
  882. result = float_frame.apply([np.sqrt], axis=axis)
  883. expected = f_sqrt.copy()
  884. if axis in {0, "index"}:
  885. expected.columns = MultiIndex.from_product([float_frame.columns, ["sqrt"]])
  886. else:
  887. expected.index = MultiIndex.from_product([float_frame.index, ["sqrt"]])
  888. tm.assert_frame_equal(result, expected)
  889. # multiple items in list
  890. # these are in the order as if we are applying both
  891. # functions per series and then concatting
  892. result = float_frame.apply([np.abs, np.sqrt], axis=axis)
  893. expected = zip_frames([f_abs, f_sqrt], axis=other_axis)
  894. if axis in {0, "index"}:
  895. expected.columns = MultiIndex.from_product(
  896. [float_frame.columns, ["absolute", "sqrt"]]
  897. )
  898. else:
  899. expected.index = MultiIndex.from_product(
  900. [float_frame.index, ["absolute", "sqrt"]]
  901. )
  902. tm.assert_frame_equal(result, expected)
  903. def test_demo():
  904. # demonstration tests
  905. df = DataFrame({"A": range(5), "B": 5})
  906. result = df.agg(["min", "max"])
  907. expected = DataFrame(
  908. {"A": [0, 4], "B": [5, 5]}, columns=["A", "B"], index=["min", "max"]
  909. )
  910. tm.assert_frame_equal(result, expected)
  911. def test_demo_dict_agg():
  912. # demonstration tests
  913. df = DataFrame({"A": range(5), "B": 5})
  914. result = df.agg({"A": ["min", "max"], "B": ["sum", "max"]})
  915. expected = DataFrame(
  916. {"A": [4.0, 0.0, np.nan], "B": [5.0, np.nan, 25.0]},
  917. columns=["A", "B"],
  918. index=["max", "min", "sum"],
  919. )
  920. tm.assert_frame_equal(result.reindex_like(expected), expected)
  921. def test_agg_with_name_as_column_name():
  922. # GH 36212 - Column name is "name"
  923. data = {"name": ["foo", "bar"]}
  924. df = DataFrame(data)
  925. # result's name should be None
  926. result = df.agg({"name": "count"})
  927. expected = Series({"name": 2})
  928. tm.assert_series_equal(result, expected)
  929. # Check if name is still preserved when aggregating series instead
  930. result = df["name"].agg({"name": "count"})
  931. expected = Series({"name": 2}, name="name")
  932. tm.assert_series_equal(result, expected)
  933. def test_agg_multiple_mixed():
  934. # GH 20909
  935. mdf = DataFrame(
  936. {
  937. "A": [1, 2, 3],
  938. "B": [1.0, 2.0, 3.0],
  939. "C": ["foo", "bar", "baz"],
  940. }
  941. )
  942. expected = DataFrame(
  943. {
  944. "A": [1, 6],
  945. "B": [1.0, 6.0],
  946. "C": ["bar", "foobarbaz"],
  947. },
  948. index=["min", "sum"],
  949. )
  950. # sorted index
  951. result = mdf.agg(["min", "sum"])
  952. tm.assert_frame_equal(result, expected)
  953. result = mdf[["C", "B", "A"]].agg(["sum", "min"])
  954. # GH40420: the result of .agg should have an index that is sorted
  955. # according to the arguments provided to agg.
  956. expected = expected[["C", "B", "A"]].reindex(["sum", "min"])
  957. tm.assert_frame_equal(result, expected)
  958. def test_agg_multiple_mixed_raises():
  959. # GH 20909
  960. mdf = DataFrame(
  961. {
  962. "A": [1, 2, 3],
  963. "B": [1.0, 2.0, 3.0],
  964. "C": ["foo", "bar", "baz"],
  965. "D": date_range("20130101", periods=3),
  966. }
  967. )
  968. # sorted index
  969. msg = "does not support reduction"
  970. with pytest.raises(TypeError, match=msg):
  971. mdf.agg(["min", "sum"])
  972. with pytest.raises(TypeError, match=msg):
  973. mdf[["D", "C", "B", "A"]].agg(["sum", "min"])
  974. def test_agg_reduce(axis, float_frame):
  975. other_axis = 1 if axis in {0, "index"} else 0
  976. name1, name2 = float_frame.axes[other_axis].unique()[:2].sort_values()
  977. # all reducers
  978. expected = pd.concat(
  979. [
  980. float_frame.mean(axis=axis),
  981. float_frame.max(axis=axis),
  982. float_frame.sum(axis=axis),
  983. ],
  984. axis=1,
  985. )
  986. expected.columns = ["mean", "max", "sum"]
  987. expected = expected.T if axis in {0, "index"} else expected
  988. result = float_frame.agg(["mean", "max", "sum"], axis=axis)
  989. tm.assert_frame_equal(result, expected)
  990. # dict input with scalars
  991. func = {name1: "mean", name2: "sum"}
  992. result = float_frame.agg(func, axis=axis)
  993. expected = Series(
  994. [
  995. float_frame.loc(other_axis)[name1].mean(),
  996. float_frame.loc(other_axis)[name2].sum(),
  997. ],
  998. index=[name1, name2],
  999. )
  1000. tm.assert_series_equal(result, expected)
  1001. # dict input with lists
  1002. func = {name1: ["mean"], name2: ["sum"]}
  1003. result = float_frame.agg(func, axis=axis)
  1004. expected = DataFrame(
  1005. {
  1006. name1: Series([float_frame.loc(other_axis)[name1].mean()], index=["mean"]),
  1007. name2: Series([float_frame.loc(other_axis)[name2].sum()], index=["sum"]),
  1008. }
  1009. )
  1010. expected = expected.T if axis in {1, "columns"} else expected
  1011. tm.assert_frame_equal(result, expected)
  1012. # dict input with lists with multiple
  1013. func = {name1: ["mean", "sum"], name2: ["sum", "max"]}
  1014. result = float_frame.agg(func, axis=axis)
  1015. expected = pd.concat(
  1016. {
  1017. name1: Series(
  1018. [
  1019. float_frame.loc(other_axis)[name1].mean(),
  1020. float_frame.loc(other_axis)[name1].sum(),
  1021. ],
  1022. index=["mean", "sum"],
  1023. ),
  1024. name2: Series(
  1025. [
  1026. float_frame.loc(other_axis)[name2].sum(),
  1027. float_frame.loc(other_axis)[name2].max(),
  1028. ],
  1029. index=["sum", "max"],
  1030. ),
  1031. },
  1032. axis=1,
  1033. )
  1034. expected = expected.T if axis in {1, "columns"} else expected
  1035. tm.assert_frame_equal(result, expected)
  1036. def test_nuiscance_columns():
  1037. # GH 15015
  1038. df = DataFrame(
  1039. {
  1040. "A": [1, 2, 3],
  1041. "B": [1.0, 2.0, 3.0],
  1042. "C": ["foo", "bar", "baz"],
  1043. "D": date_range("20130101", periods=3),
  1044. }
  1045. )
  1046. result = df.agg("min")
  1047. expected = Series([1, 1.0, "bar", Timestamp("20130101")], index=df.columns)
  1048. tm.assert_series_equal(result, expected)
  1049. result = df.agg(["min"])
  1050. expected = DataFrame(
  1051. [[1, 1.0, "bar", Timestamp("20130101").as_unit("ns")]],
  1052. index=["min"],
  1053. columns=df.columns,
  1054. )
  1055. tm.assert_frame_equal(result, expected)
  1056. msg = "does not support reduction"
  1057. with pytest.raises(TypeError, match=msg):
  1058. df.agg("sum")
  1059. result = df[["A", "B", "C"]].agg("sum")
  1060. expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"])
  1061. tm.assert_series_equal(result, expected)
  1062. msg = "does not support reduction"
  1063. with pytest.raises(TypeError, match=msg):
  1064. df.agg(["sum"])
  1065. @pytest.mark.parametrize("how", ["agg", "apply"])
  1066. def test_non_callable_aggregates(how):
  1067. # GH 16405
  1068. # 'size' is a property of frame/series
  1069. # validate that this is working
  1070. # GH 39116 - expand to apply
  1071. df = DataFrame(
  1072. {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]}
  1073. )
  1074. # Function aggregate
  1075. result = getattr(df, how)({"A": "count"})
  1076. expected = Series({"A": 2})
  1077. tm.assert_series_equal(result, expected)
  1078. # Non-function aggregate
  1079. result = getattr(df, how)({"A": "size"})
  1080. expected = Series({"A": 3})
  1081. tm.assert_series_equal(result, expected)
  1082. # Mix function and non-function aggs
  1083. result1 = getattr(df, how)(["count", "size"])
  1084. result2 = getattr(df, how)(
  1085. {"A": ["count", "size"], "B": ["count", "size"], "C": ["count", "size"]}
  1086. )
  1087. expected = DataFrame(
  1088. {
  1089. "A": {"count": 2, "size": 3},
  1090. "B": {"count": 2, "size": 3},
  1091. "C": {"count": 2, "size": 3},
  1092. }
  1093. )
  1094. tm.assert_frame_equal(result1, result2, check_like=True)
  1095. tm.assert_frame_equal(result2, expected, check_like=True)
  1096. # Just functional string arg is same as calling df.arg()
  1097. result = getattr(df, how)("count")
  1098. expected = df.count()
  1099. tm.assert_series_equal(result, expected)
  1100. @pytest.mark.parametrize("how", ["agg", "apply"])
  1101. def test_size_as_str(how, axis):
  1102. # GH 39934
  1103. df = DataFrame(
  1104. {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]}
  1105. )
  1106. # Just a string attribute arg same as calling df.arg
  1107. # on the columns
  1108. result = getattr(df, how)("size", axis=axis)
  1109. if axis in (0, "index"):
  1110. expected = Series(df.shape[0], index=df.columns)
  1111. else:
  1112. expected = Series(df.shape[1], index=df.index)
  1113. tm.assert_series_equal(result, expected)
  1114. def test_agg_listlike_result():
  1115. # GH-29587 user defined function returning list-likes
  1116. df = DataFrame({"A": [2, 2, 3], "B": [1.5, np.nan, 1.5], "C": ["foo", None, "bar"]})
  1117. def func(group_col):
  1118. return list(group_col.dropna().unique())
  1119. result = df.agg(func)
  1120. expected = Series([[2, 3], [1.5], ["foo", "bar"]], index=["A", "B", "C"])
  1121. tm.assert_series_equal(result, expected)
  1122. result = df.agg([func])
  1123. expected = expected.to_frame("func").T
  1124. tm.assert_frame_equal(result, expected)
  1125. @pytest.mark.parametrize("axis", [0, 1])
  1126. @pytest.mark.parametrize(
  1127. "args, kwargs",
  1128. [
  1129. ((1, 2, 3), {}),
  1130. ((8, 7, 15), {}),
  1131. ((1, 2), {}),
  1132. ((1,), {"b": 2}),
  1133. ((), {"a": 1, "b": 2}),
  1134. ((), {"a": 2, "b": 1}),
  1135. ((), {"a": 1, "b": 2, "c": 3}),
  1136. ],
  1137. )
  1138. def test_agg_args_kwargs(axis, args, kwargs):
  1139. def f(x, a, b, c=3):
  1140. return x.sum() + (a + b) / c
  1141. df = DataFrame([[1, 2], [3, 4]])
  1142. if axis == 0:
  1143. expected = Series([5.0, 7.0])
  1144. else:
  1145. expected = Series([4.0, 8.0])
  1146. result = df.agg(f, axis, *args, **kwargs)
  1147. tm.assert_series_equal(result, expected)
  1148. @pytest.mark.parametrize("num_cols", [2, 3, 5])
  1149. def test_frequency_is_original(num_cols, engine, request):
  1150. # GH 22150
  1151. if engine == "numba":
  1152. mark = pytest.mark.xfail(reason="numba engine only supports numeric indices")
  1153. request.node.add_marker(mark)
  1154. index = pd.DatetimeIndex(["1950-06-30", "1952-10-24", "1953-05-29"])
  1155. original = index.copy()
  1156. df = DataFrame(1, index=index, columns=range(num_cols))
  1157. df.apply(lambda x: x, engine=engine)
  1158. assert index.freq == original.freq
  1159. def test_apply_datetime_tz_issue(engine, request):
  1160. # GH 29052
  1161. if engine == "numba":
  1162. mark = pytest.mark.xfail(
  1163. reason="numba engine doesn't support non-numeric indexes"
  1164. )
  1165. request.node.add_marker(mark)
  1166. timestamps = [
  1167. Timestamp("2019-03-15 12:34:31.909000+0000", tz="UTC"),
  1168. Timestamp("2019-03-15 12:34:34.359000+0000", tz="UTC"),
  1169. Timestamp("2019-03-15 12:34:34.660000+0000", tz="UTC"),
  1170. ]
  1171. df = DataFrame(data=[0, 1, 2], index=timestamps)
  1172. result = df.apply(lambda x: x.name, axis=1, engine=engine)
  1173. expected = Series(index=timestamps, data=timestamps)
  1174. tm.assert_series_equal(result, expected)
  1175. @pytest.mark.parametrize("df", [DataFrame({"A": ["a", None], "B": ["c", "d"]})])
  1176. @pytest.mark.parametrize("method", ["min", "max", "sum"])
  1177. def test_mixed_column_raises(df, method, using_infer_string):
  1178. # GH 16832
  1179. if method == "sum":
  1180. msg = r'can only concatenate str \(not "int"\) to str|does not support'
  1181. else:
  1182. msg = "not supported between instances of 'str' and 'float'"
  1183. if not using_infer_string:
  1184. with pytest.raises(TypeError, match=msg):
  1185. getattr(df, method)()
  1186. else:
  1187. getattr(df, method)()
  1188. @pytest.mark.parametrize("col", [1, 1.0, True, "a", np.nan])
  1189. def test_apply_dtype(col):
  1190. # GH 31466
  1191. df = DataFrame([[1.0, col]], columns=["a", "b"])
  1192. result = df.apply(lambda x: x.dtype)
  1193. expected = df.dtypes
  1194. tm.assert_series_equal(result, expected)
  1195. def test_apply_mutating(using_array_manager, using_copy_on_write, warn_copy_on_write):
  1196. # GH#35462 case where applied func pins a new BlockManager to a row
  1197. df = DataFrame({"a": range(100), "b": range(100, 200)})
  1198. df_orig = df.copy()
  1199. def func(row):
  1200. mgr = row._mgr
  1201. row.loc["a"] += 1
  1202. assert row._mgr is not mgr
  1203. return row
  1204. expected = df.copy()
  1205. expected["a"] += 1
  1206. with tm.assert_cow_warning(warn_copy_on_write):
  1207. result = df.apply(func, axis=1)
  1208. tm.assert_frame_equal(result, expected)
  1209. if using_copy_on_write or using_array_manager:
  1210. # INFO(CoW) With copy on write, mutating a viewing row doesn't mutate the parent
  1211. # INFO(ArrayManager) With BlockManager, the row is a view and mutated in place,
  1212. # with ArrayManager the row is not a view, and thus not mutated in place
  1213. tm.assert_frame_equal(df, df_orig)
  1214. else:
  1215. tm.assert_frame_equal(df, result)
  1216. def test_apply_empty_list_reduce():
  1217. # GH#35683 get columns correct
  1218. df = DataFrame([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]], columns=["a", "b"])
  1219. result = df.apply(lambda x: [], result_type="reduce")
  1220. expected = Series({"a": [], "b": []}, dtype=object)
  1221. tm.assert_series_equal(result, expected)
  1222. def test_apply_no_suffix_index(engine, request):
  1223. # GH36189
  1224. if engine == "numba":
  1225. mark = pytest.mark.xfail(
  1226. reason="numba engine doesn't support list-likes/dict-like callables"
  1227. )
  1228. request.node.add_marker(mark)
  1229. pdf = DataFrame([[4, 9]] * 3, columns=["A", "B"])
  1230. result = pdf.apply(["sum", lambda x: x.sum(), lambda x: x.sum()], engine=engine)
  1231. expected = DataFrame(
  1232. {"A": [12, 12, 12], "B": [27, 27, 27]}, index=["sum", "<lambda>", "<lambda>"]
  1233. )
  1234. tm.assert_frame_equal(result, expected)
  1235. def test_apply_raw_returns_string(engine):
  1236. # https://github.com/pandas-dev/pandas/issues/35940
  1237. if engine == "numba":
  1238. pytest.skip("No object dtype support in numba")
  1239. df = DataFrame({"A": ["aa", "bbb"]})
  1240. result = df.apply(lambda x: x[0], engine=engine, axis=1, raw=True)
  1241. expected = Series(["aa", "bbb"])
  1242. tm.assert_series_equal(result, expected)
  1243. def test_aggregation_func_column_order():
  1244. # GH40420: the result of .agg should have an index that is sorted
  1245. # according to the arguments provided to agg.
  1246. df = DataFrame(
  1247. [
  1248. (1, 0, 0),
  1249. (2, 0, 0),
  1250. (3, 0, 0),
  1251. (4, 5, 4),
  1252. (5, 6, 6),
  1253. (6, 7, 7),
  1254. ],
  1255. columns=("att1", "att2", "att3"),
  1256. )
  1257. def sum_div2(s):
  1258. return s.sum() / 2
  1259. aggs = ["sum", sum_div2, "count", "min"]
  1260. result = df.agg(aggs)
  1261. expected = DataFrame(
  1262. {
  1263. "att1": [21.0, 10.5, 6.0, 1.0],
  1264. "att2": [18.0, 9.0, 6.0, 0.0],
  1265. "att3": [17.0, 8.5, 6.0, 0.0],
  1266. },
  1267. index=["sum", "sum_div2", "count", "min"],
  1268. )
  1269. tm.assert_frame_equal(result, expected)
  1270. def test_apply_getitem_axis_1(engine, request):
  1271. # GH 13427
  1272. if engine == "numba":
  1273. mark = pytest.mark.xfail(
  1274. reason="numba engine not supporting duplicate index values"
  1275. )
  1276. request.node.add_marker(mark)
  1277. df = DataFrame({"a": [0, 1, 2], "b": [1, 2, 3]})
  1278. result = df[["a", "a"]].apply(
  1279. lambda x: x.iloc[0] + x.iloc[1], axis=1, engine=engine
  1280. )
  1281. expected = Series([0, 2, 4])
  1282. tm.assert_series_equal(result, expected)
  1283. def test_nuisance_depr_passes_through_warnings():
  1284. # GH 43740
  1285. # DataFrame.agg with list-likes may emit warnings for both individual
  1286. # args and for entire columns, but we only want to emit once. We
  1287. # catch and suppress the warnings for individual args, but need to make
  1288. # sure if some other warnings were raised, they get passed through to
  1289. # the user.
  1290. def expected_warning(x):
  1291. warnings.warn("Hello, World!")
  1292. return x.sum()
  1293. df = DataFrame({"a": [1, 2, 3]})
  1294. with tm.assert_produces_warning(UserWarning, match="Hello, World!"):
  1295. df.agg([expected_warning])
  1296. def test_apply_type():
  1297. # GH 46719
  1298. df = DataFrame(
  1299. {"col1": [3, "string", float], "col2": [0.25, datetime(2020, 1, 1), np.nan]},
  1300. index=["a", "b", "c"],
  1301. )
  1302. # axis=0
  1303. result = df.apply(type, axis=0)
  1304. expected = Series({"col1": Series, "col2": Series})
  1305. tm.assert_series_equal(result, expected)
  1306. # axis=1
  1307. result = df.apply(type, axis=1)
  1308. expected = Series({"a": Series, "b": Series, "c": Series})
  1309. tm.assert_series_equal(result, expected)
  1310. def test_apply_on_empty_dataframe(engine):
  1311. # GH 39111
  1312. df = DataFrame({"a": [1, 2], "b": [3, 0]})
  1313. result = df.head(0).apply(lambda x: max(x["a"], x["b"]), axis=1, engine=engine)
  1314. expected = Series([], dtype=np.float64)
  1315. tm.assert_series_equal(result, expected)
  1316. def test_apply_return_list():
  1317. df = DataFrame({"a": [1, 2], "b": [2, 3]})
  1318. result = df.apply(lambda x: [x.values])
  1319. expected = DataFrame({"a": [[1, 2]], "b": [[2, 3]]})
  1320. tm.assert_frame_equal(result, expected)
  1321. @pytest.mark.parametrize(
  1322. "test, constant",
  1323. [
  1324. ({"a": [1, 2, 3], "b": [1, 1, 1]}, {"a": [1, 2, 3], "b": [1]}),
  1325. ({"a": [2, 2, 2], "b": [1, 1, 1]}, {"a": [2], "b": [1]}),
  1326. ],
  1327. )
  1328. def test_unique_agg_type_is_series(test, constant):
  1329. # GH#22558
  1330. df1 = DataFrame(test)
  1331. expected = Series(data=constant, index=["a", "b"], dtype="object")
  1332. aggregation = {"a": "unique", "b": "unique"}
  1333. result = df1.agg(aggregation)
  1334. tm.assert_series_equal(result, expected)
  1335. def test_any_apply_keyword_non_zero_axis_regression():
  1336. # https://github.com/pandas-dev/pandas/issues/48656
  1337. df = DataFrame({"A": [1, 2, 0], "B": [0, 2, 0], "C": [0, 0, 0]})
  1338. expected = Series([True, True, False])
  1339. tm.assert_series_equal(df.any(axis=1), expected)
  1340. result = df.apply("any", axis=1)
  1341. tm.assert_series_equal(result, expected)
  1342. result = df.apply("any", 1)
  1343. tm.assert_series_equal(result, expected)
  1344. def test_agg_mapping_func_deprecated():
  1345. # GH 53325
  1346. df = DataFrame({"x": [1, 2, 3]})
  1347. def foo1(x, a=1, c=0):
  1348. return x + a + c
  1349. def foo2(x, b=2, c=0):
  1350. return x + b + c
  1351. # single func already takes the vectorized path
  1352. result = df.agg(foo1, 0, 3, c=4)
  1353. expected = df + 7
  1354. tm.assert_frame_equal(result, expected)
  1355. msg = "using .+ in Series.agg cannot aggregate and"
  1356. with tm.assert_produces_warning(FutureWarning, match=msg):
  1357. result = df.agg([foo1, foo2], 0, 3, c=4)
  1358. expected = DataFrame(
  1359. [[8, 8], [9, 9], [10, 10]], columns=[["x", "x"], ["foo1", "foo2"]]
  1360. )
  1361. tm.assert_frame_equal(result, expected)
  1362. # TODO: the result below is wrong, should be fixed (GH53325)
  1363. with tm.assert_produces_warning(FutureWarning, match=msg):
  1364. result = df.agg({"x": foo1}, 0, 3, c=4)
  1365. expected = DataFrame([2, 3, 4], columns=["x"])
  1366. tm.assert_frame_equal(result, expected)
  1367. def test_agg_std():
  1368. df = DataFrame(np.arange(6).reshape(3, 2), columns=["A", "B"])
  1369. with tm.assert_produces_warning(FutureWarning, match="using DataFrame.std"):
  1370. result = df.agg(np.std)
  1371. expected = Series({"A": 2.0, "B": 2.0}, dtype=float)
  1372. tm.assert_series_equal(result, expected)
  1373. with tm.assert_produces_warning(FutureWarning, match="using Series.std"):
  1374. result = df.agg([np.std])
  1375. expected = DataFrame({"A": 2.0, "B": 2.0}, index=["std"])
  1376. tm.assert_frame_equal(result, expected)
  1377. def test_agg_dist_like_and_nonunique_columns():
  1378. # GH#51099
  1379. df = DataFrame(
  1380. {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]}
  1381. )
  1382. df.columns = ["A", "A", "C"]
  1383. result = df.agg({"A": "count"})
  1384. expected = df["A"].count()
  1385. tm.assert_series_equal(result, expected)