test_aggregate.py 54 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672
  1. """
  2. test .agg behavior / note that .apply is tested generally in test_groupby.py
  3. """
  4. import datetime
  5. import functools
  6. from functools import partial
  7. import re
  8. import numpy as np
  9. import pytest
  10. from pandas.errors import SpecificationError
  11. from pandas.core.dtypes.common import is_integer_dtype
  12. import pandas as pd
  13. from pandas import (
  14. DataFrame,
  15. Index,
  16. MultiIndex,
  17. Series,
  18. concat,
  19. to_datetime,
  20. )
  21. import pandas._testing as tm
  22. from pandas.core.groupby.grouper import Grouping
  23. def test_groupby_agg_no_extra_calls():
  24. # GH#31760
  25. df = DataFrame({"key": ["a", "b", "c", "c"], "value": [1, 2, 3, 4]})
  26. gb = df.groupby("key")["value"]
  27. def dummy_func(x):
  28. assert len(x) != 0
  29. return x.sum()
  30. gb.agg(dummy_func)
  31. def test_agg_regression1(tsframe):
  32. grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
  33. result = grouped.agg("mean")
  34. expected = grouped.mean()
  35. tm.assert_frame_equal(result, expected)
  36. def test_agg_must_agg(df):
  37. grouped = df.groupby("A")["C"]
  38. msg = "Must produce aggregated value"
  39. with pytest.raises(Exception, match=msg):
  40. grouped.agg(lambda x: x.describe())
  41. with pytest.raises(Exception, match=msg):
  42. grouped.agg(lambda x: x.index[:2])
  43. def test_agg_ser_multi_key(df):
  44. f = lambda x: x.sum()
  45. results = df.C.groupby([df.A, df.B]).aggregate(f)
  46. expected = df.groupby(["A", "B"]).sum()["C"]
  47. tm.assert_series_equal(results, expected)
  48. def test_groupby_aggregation_mixed_dtype():
  49. # GH 6212
  50. expected = DataFrame(
  51. {
  52. "v1": [5, 5, 7, np.nan, 3, 3, 4, 1],
  53. "v2": [55, 55, 77, np.nan, 33, 33, 44, 11],
  54. },
  55. index=MultiIndex.from_tuples(
  56. [
  57. (1, 95),
  58. (1, 99),
  59. (2, 95),
  60. (2, 99),
  61. ("big", "damp"),
  62. ("blue", "dry"),
  63. ("red", "red"),
  64. ("red", "wet"),
  65. ],
  66. names=["by1", "by2"],
  67. ),
  68. )
  69. df = DataFrame(
  70. {
  71. "v1": [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9],
  72. "v2": [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99],
  73. "by1": ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12],
  74. "by2": [
  75. "wet",
  76. "dry",
  77. 99,
  78. 95,
  79. np.nan,
  80. "damp",
  81. 95,
  82. 99,
  83. "red",
  84. 99,
  85. np.nan,
  86. np.nan,
  87. ],
  88. }
  89. )
  90. g = df.groupby(["by1", "by2"])
  91. result = g[["v1", "v2"]].mean()
  92. tm.assert_frame_equal(result, expected)
  93. def test_groupby_aggregation_multi_level_column():
  94. # GH 29772
  95. lst = [
  96. [True, True, True, False],
  97. [True, False, np.nan, False],
  98. [True, True, np.nan, False],
  99. [True, True, np.nan, False],
  100. ]
  101. df = DataFrame(
  102. data=lst,
  103. columns=MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 0), ("B", 1)]),
  104. )
  105. msg = "DataFrame.groupby with axis=1 is deprecated"
  106. with tm.assert_produces_warning(FutureWarning, match=msg):
  107. gb = df.groupby(level=1, axis=1)
  108. result = gb.sum(numeric_only=False)
  109. expected = DataFrame({0: [2.0, True, True, True], 1: [1, 0, 1, 1]})
  110. tm.assert_frame_equal(result, expected)
  111. def test_agg_apply_corner(ts, tsframe):
  112. # nothing to group, all NA
  113. grouped = ts.groupby(ts * np.nan, group_keys=False)
  114. assert ts.dtype == np.float64
  115. # groupby float64 values results in a float64 Index
  116. exp = Series([], dtype=np.float64, index=Index([], dtype=np.float64))
  117. tm.assert_series_equal(grouped.sum(), exp)
  118. tm.assert_series_equal(grouped.agg("sum"), exp)
  119. tm.assert_series_equal(grouped.apply("sum"), exp, check_index_type=False)
  120. # DataFrame
  121. grouped = tsframe.groupby(tsframe["A"] * np.nan, group_keys=False)
  122. exp_df = DataFrame(
  123. columns=tsframe.columns,
  124. dtype=float,
  125. index=Index([], name="A", dtype=np.float64),
  126. )
  127. tm.assert_frame_equal(grouped.sum(), exp_df)
  128. tm.assert_frame_equal(grouped.agg("sum"), exp_df)
  129. msg = "The behavior of DataFrame.sum with axis=None is deprecated"
  130. with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
  131. res = grouped.apply(np.sum)
  132. tm.assert_frame_equal(res, exp_df)
  133. def test_agg_grouping_is_list_tuple(ts):
  134. df = DataFrame(
  135. np.random.default_rng(2).standard_normal((30, 4)),
  136. columns=Index(list("ABCD"), dtype=object),
  137. index=pd.date_range("2000-01-01", periods=30, freq="B"),
  138. )
  139. grouped = df.groupby(lambda x: x.year)
  140. grouper = grouped._grouper.groupings[0].grouping_vector
  141. grouped._grouper.groupings[0] = Grouping(ts.index, list(grouper))
  142. result = grouped.agg("mean")
  143. expected = grouped.mean()
  144. tm.assert_frame_equal(result, expected)
  145. grouped._grouper.groupings[0] = Grouping(ts.index, tuple(grouper))
  146. result = grouped.agg("mean")
  147. expected = grouped.mean()
  148. tm.assert_frame_equal(result, expected)
  149. def test_agg_python_multiindex(multiindex_dataframe_random_data):
  150. grouped = multiindex_dataframe_random_data.groupby(["A", "B"])
  151. result = grouped.agg("mean")
  152. expected = grouped.mean()
  153. tm.assert_frame_equal(result, expected)
  154. @pytest.mark.parametrize(
  155. "groupbyfunc", [lambda x: x.weekday(), [lambda x: x.month, lambda x: x.weekday()]]
  156. )
  157. def test_aggregate_str_func(tsframe, groupbyfunc):
  158. grouped = tsframe.groupby(groupbyfunc)
  159. # single series
  160. result = grouped["A"].agg("std")
  161. expected = grouped["A"].std()
  162. tm.assert_series_equal(result, expected)
  163. # group frame by function name
  164. result = grouped.aggregate("var")
  165. expected = grouped.var()
  166. tm.assert_frame_equal(result, expected)
  167. # group frame by function dict
  168. result = grouped.agg({"A": "var", "B": "std", "C": "mean", "D": "sem"})
  169. expected = DataFrame(
  170. {
  171. "A": grouped["A"].var(),
  172. "B": grouped["B"].std(),
  173. "C": grouped["C"].mean(),
  174. "D": grouped["D"].sem(),
  175. }
  176. )
  177. tm.assert_frame_equal(result, expected)
  178. def test_std_masked_dtype(any_numeric_ea_dtype):
  179. # GH#35516
  180. df = DataFrame(
  181. {
  182. "a": [2, 1, 1, 1, 2, 2, 1],
  183. "b": Series([pd.NA, 1, 2, 1, 1, 1, 2], dtype="Float64"),
  184. }
  185. )
  186. result = df.groupby("a").std()
  187. expected = DataFrame(
  188. {"b": [0.57735, 0]}, index=Index([1, 2], name="a"), dtype="Float64"
  189. )
  190. tm.assert_frame_equal(result, expected)
  191. def test_agg_str_with_kwarg_axis_1_raises(df, reduction_func):
  192. gb = df.groupby(level=0)
  193. warn_msg = f"DataFrameGroupBy.{reduction_func} with axis=1 is deprecated"
  194. if reduction_func in ("idxmax", "idxmin"):
  195. error = TypeError
  196. msg = "'[<>]' not supported between instances of 'float' and 'str'"
  197. warn = FutureWarning
  198. else:
  199. error = ValueError
  200. msg = f"Operation {reduction_func} does not support axis=1"
  201. warn = None
  202. with pytest.raises(error, match=msg):
  203. with tm.assert_produces_warning(warn, match=warn_msg):
  204. gb.agg(reduction_func, axis=1)
  205. @pytest.mark.parametrize(
  206. "func, expected, dtype, result_dtype_dict",
  207. [
  208. ("sum", [5, 7, 9], "int64", {}),
  209. ("std", [4.5**0.5] * 3, int, {"i": float, "j": float, "k": float}),
  210. ("var", [4.5] * 3, int, {"i": float, "j": float, "k": float}),
  211. ("sum", [5, 7, 9], "Int64", {"j": "int64"}),
  212. ("std", [4.5**0.5] * 3, "Int64", {"i": float, "j": float, "k": float}),
  213. ("var", [4.5] * 3, "Int64", {"i": "float64", "j": "float64", "k": "float64"}),
  214. ],
  215. )
  216. def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype_dict):
  217. # GH#43209
  218. df = DataFrame(
  219. [[1, 2, 3, 4, 5, 6]] * 3,
  220. columns=MultiIndex.from_product([["a", "b"], ["i", "j", "k"]]),
  221. ).astype({("a", "j"): dtype, ("b", "j"): dtype})
  222. msg = "DataFrame.groupby with axis=1 is deprecated"
  223. with tm.assert_produces_warning(FutureWarning, match=msg):
  224. gb = df.groupby(level=1, axis=1)
  225. result = gb.agg(func)
  226. expected = DataFrame([expected] * 3, columns=["i", "j", "k"]).astype(
  227. result_dtype_dict
  228. )
  229. tm.assert_frame_equal(result, expected)
  230. @pytest.mark.parametrize(
  231. "func, expected_data, result_dtype_dict",
  232. [
  233. ("sum", [[2, 4], [10, 12], [18, 20]], {10: "int64", 20: "int64"}),
  234. # std should ideally return Int64 / Float64 #43330
  235. ("std", [[2**0.5] * 2] * 3, "float64"),
  236. ("var", [[2] * 2] * 3, {10: "float64", 20: "float64"}),
  237. ],
  238. )
  239. def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict):
  240. # GH#43209
  241. df = DataFrame(
  242. np.arange(12).reshape(3, 4),
  243. index=Index([0, 1, 0], name="y"),
  244. columns=Index([10, 20, 10, 20], name="x"),
  245. dtype="int64",
  246. ).astype({10: "Int64"})
  247. msg = "DataFrame.groupby with axis=1 is deprecated"
  248. with tm.assert_produces_warning(FutureWarning, match=msg):
  249. gb = df.groupby("x", axis=1)
  250. result = gb.agg(func)
  251. expected = DataFrame(
  252. data=expected_data,
  253. index=Index([0, 1, 0], name="y"),
  254. columns=Index([10, 20], name="x"),
  255. ).astype(result_dtype_dict)
  256. tm.assert_frame_equal(result, expected)
  257. def test_aggregate_item_by_item(df):
  258. grouped = df.groupby("A")
  259. aggfun_0 = lambda ser: ser.size
  260. result = grouped.agg(aggfun_0)
  261. foosum = (df.A == "foo").sum()
  262. barsum = (df.A == "bar").sum()
  263. K = len(result.columns)
  264. # GH5782
  265. exp = Series(np.array([foosum] * K), index=list("BCD"), name="foo")
  266. tm.assert_series_equal(result.xs("foo"), exp)
  267. exp = Series(np.array([barsum] * K), index=list("BCD"), name="bar")
  268. tm.assert_almost_equal(result.xs("bar"), exp)
  269. def aggfun_1(ser):
  270. return ser.size
  271. result = DataFrame().groupby(df.A).agg(aggfun_1)
  272. assert isinstance(result, DataFrame)
  273. assert len(result) == 0
  274. def test_wrap_agg_out(three_group):
  275. grouped = three_group.groupby(["A", "B"])
  276. def func(ser):
  277. if ser.dtype in (object, "string"):
  278. raise TypeError("Test error message")
  279. return ser.sum()
  280. with pytest.raises(TypeError, match="Test error message"):
  281. grouped.aggregate(func)
  282. result = grouped[["D", "E", "F"]].aggregate(func)
  283. exp_grouped = three_group.loc[:, ["A", "B", "D", "E", "F"]]
  284. expected = exp_grouped.groupby(["A", "B"]).aggregate(func)
  285. tm.assert_frame_equal(result, expected)
  286. def test_agg_multiple_functions_maintain_order(df):
  287. # GH #610
  288. funcs = [("mean", np.mean), ("max", np.max), ("min", np.min)]
  289. msg = "is currently using SeriesGroupBy.mean"
  290. with tm.assert_produces_warning(FutureWarning, match=msg):
  291. result = df.groupby("A")["C"].agg(funcs)
  292. exp_cols = Index(["mean", "max", "min"])
  293. tm.assert_index_equal(result.columns, exp_cols)
  294. def test_series_index_name(df):
  295. grouped = df.loc[:, ["C"]].groupby(df["A"])
  296. result = grouped.agg(lambda x: x.mean())
  297. assert result.index.name == "A"
  298. def test_agg_multiple_functions_same_name():
  299. # GH 30880
  300. df = DataFrame(
  301. np.random.default_rng(2).standard_normal((1000, 3)),
  302. index=pd.date_range("1/1/2012", freq="s", periods=1000),
  303. columns=["A", "B", "C"],
  304. )
  305. result = df.resample("3min").agg(
  306. {"A": [partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]}
  307. )
  308. expected_index = pd.date_range("1/1/2012", freq="3min", periods=6)
  309. expected_columns = MultiIndex.from_tuples([("A", "quantile"), ("A", "quantile")])
  310. expected_values = np.array(
  311. [df.resample("3min").A.quantile(q=q).values for q in [0.9999, 0.1111]]
  312. ).T
  313. expected = DataFrame(
  314. expected_values, columns=expected_columns, index=expected_index
  315. )
  316. tm.assert_frame_equal(result, expected)
  317. def test_agg_multiple_functions_same_name_with_ohlc_present():
  318. # GH 30880
  319. # ohlc expands dimensions, so different test to the above is required.
  320. df = DataFrame(
  321. np.random.default_rng(2).standard_normal((1000, 3)),
  322. index=pd.date_range("1/1/2012", freq="s", periods=1000, name="dti"),
  323. columns=Index(["A", "B", "C"], name="alpha"),
  324. )
  325. result = df.resample("3min").agg(
  326. {"A": ["ohlc", partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]}
  327. )
  328. expected_index = pd.date_range("1/1/2012", freq="3min", periods=6, name="dti")
  329. expected_columns = MultiIndex.from_tuples(
  330. [
  331. ("A", "ohlc", "open"),
  332. ("A", "ohlc", "high"),
  333. ("A", "ohlc", "low"),
  334. ("A", "ohlc", "close"),
  335. ("A", "quantile", "A"),
  336. ("A", "quantile", "A"),
  337. ],
  338. names=["alpha", None, None],
  339. )
  340. non_ohlc_expected_values = np.array(
  341. [df.resample("3min").A.quantile(q=q).values for q in [0.9999, 0.1111]]
  342. ).T
  343. expected_values = np.hstack(
  344. [df.resample("3min").A.ohlc(), non_ohlc_expected_values]
  345. )
  346. expected = DataFrame(
  347. expected_values, columns=expected_columns, index=expected_index
  348. )
  349. tm.assert_frame_equal(result, expected)
  350. def test_multiple_functions_tuples_and_non_tuples(df):
  351. # #1359
  352. # Columns B and C would cause partial failure
  353. df = df.drop(columns=["B", "C"])
  354. funcs = [("foo", "mean"), "std"]
  355. ex_funcs = [("foo", "mean"), ("std", "std")]
  356. result = df.groupby("A")["D"].agg(funcs)
  357. expected = df.groupby("A")["D"].agg(ex_funcs)
  358. tm.assert_frame_equal(result, expected)
  359. result = df.groupby("A").agg(funcs)
  360. expected = df.groupby("A").agg(ex_funcs)
  361. tm.assert_frame_equal(result, expected)
  362. def test_more_flexible_frame_multi_function(df):
  363. grouped = df.groupby("A")
  364. exmean = grouped.agg({"C": "mean", "D": "mean"})
  365. exstd = grouped.agg({"C": "std", "D": "std"})
  366. expected = concat([exmean, exstd], keys=["mean", "std"], axis=1)
  367. expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1)
  368. d = {"C": ["mean", "std"], "D": ["mean", "std"]}
  369. result = grouped.aggregate(d)
  370. tm.assert_frame_equal(result, expected)
  371. # be careful
  372. result = grouped.aggregate({"C": "mean", "D": ["mean", "std"]})
  373. expected = grouped.aggregate({"C": "mean", "D": ["mean", "std"]})
  374. tm.assert_frame_equal(result, expected)
  375. def numpymean(x):
  376. return np.mean(x)
  377. def numpystd(x):
  378. return np.std(x, ddof=1)
  379. # this uses column selection & renaming
  380. msg = r"nested renamer is not supported"
  381. with pytest.raises(SpecificationError, match=msg):
  382. d = {"C": "mean", "D": {"foo": "mean", "bar": "std"}}
  383. grouped.aggregate(d)
  384. # But without renaming, these functions are OK
  385. d = {"C": ["mean"], "D": [numpymean, numpystd]}
  386. grouped.aggregate(d)
  387. def test_multi_function_flexible_mix(df):
  388. # GH #1268
  389. grouped = df.groupby("A")
  390. # Expected
  391. d = {"C": {"foo": "mean", "bar": "std"}, "D": {"sum": "sum"}}
  392. # this uses column selection & renaming
  393. msg = r"nested renamer is not supported"
  394. with pytest.raises(SpecificationError, match=msg):
  395. grouped.aggregate(d)
  396. # Test 1
  397. d = {"C": {"foo": "mean", "bar": "std"}, "D": "sum"}
  398. # this uses column selection & renaming
  399. with pytest.raises(SpecificationError, match=msg):
  400. grouped.aggregate(d)
  401. # Test 2
  402. d = {"C": {"foo": "mean", "bar": "std"}, "D": "sum"}
  403. # this uses column selection & renaming
  404. with pytest.raises(SpecificationError, match=msg):
  405. grouped.aggregate(d)
  406. def test_groupby_agg_coercing_bools():
  407. # issue 14873
  408. dat = DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 2, 3], "c": [None, None, 1, 1]})
  409. gp = dat.groupby("a")
  410. index = Index([1, 2], name="a")
  411. result = gp["b"].aggregate(lambda x: (x != 0).all())
  412. expected = Series([False, True], index=index, name="b")
  413. tm.assert_series_equal(result, expected)
  414. result = gp["c"].aggregate(lambda x: x.isnull().all())
  415. expected = Series([True, False], index=index, name="c")
  416. tm.assert_series_equal(result, expected)
  417. def test_groupby_agg_dict_with_getitem():
  418. # issue 25471
  419. dat = DataFrame({"A": ["A", "A", "B", "B", "B"], "B": [1, 2, 1, 1, 2]})
  420. result = dat.groupby("A")[["B"]].agg({"B": "sum"})
  421. expected = DataFrame({"B": [3, 4]}, index=["A", "B"]).rename_axis("A", axis=0)
  422. tm.assert_frame_equal(result, expected)
  423. def test_groupby_agg_dict_dup_columns():
  424. # GH#55006
  425. df = DataFrame(
  426. [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]],
  427. columns=["a", "b", "c", "c"],
  428. )
  429. gb = df.groupby("a")
  430. result = gb.agg({"b": "sum"})
  431. expected = DataFrame({"b": [5, 4]}, index=Index([1, 2], name="a"))
  432. tm.assert_frame_equal(result, expected)
  433. @pytest.mark.parametrize(
  434. "op",
  435. [
  436. lambda x: x.sum(),
  437. lambda x: x.cumsum(),
  438. lambda x: x.transform("sum"),
  439. lambda x: x.transform("cumsum"),
  440. lambda x: x.agg("sum"),
  441. lambda x: x.agg("cumsum"),
  442. ],
  443. )
  444. def test_bool_agg_dtype(op):
  445. # GH 7001
  446. # Bool sum aggregations result in int
  447. df = DataFrame({"a": [1, 1], "b": [False, True]})
  448. s = df.set_index("a")["b"]
  449. result = op(df.groupby("a"))["b"].dtype
  450. assert is_integer_dtype(result)
  451. result = op(s.groupby("a")).dtype
  452. assert is_integer_dtype(result)
  453. @pytest.mark.parametrize(
  454. "keys, agg_index",
  455. [
  456. (["a"], Index([1], name="a")),
  457. (["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])),
  458. ],
  459. )
  460. @pytest.mark.parametrize(
  461. "input_dtype", ["bool", "int32", "int64", "float32", "float64"]
  462. )
  463. @pytest.mark.parametrize(
  464. "result_dtype", ["bool", "int32", "int64", "float32", "float64"]
  465. )
  466. @pytest.mark.parametrize("method", ["apply", "aggregate", "transform"])
  467. def test_callable_result_dtype_frame(
  468. keys, agg_index, input_dtype, result_dtype, method
  469. ):
  470. # GH 21240
  471. df = DataFrame({"a": [1], "b": [2], "c": [True]})
  472. df["c"] = df["c"].astype(input_dtype)
  473. op = getattr(df.groupby(keys)[["c"]], method)
  474. result = op(lambda x: x.astype(result_dtype).iloc[0])
  475. expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index
  476. expected = DataFrame({"c": [df["c"].iloc[0]]}, index=expected_index).astype(
  477. result_dtype
  478. )
  479. if method == "apply":
  480. expected.columns.names = [0]
  481. tm.assert_frame_equal(result, expected)
  482. @pytest.mark.parametrize(
  483. "keys, agg_index",
  484. [
  485. (["a"], Index([1], name="a")),
  486. (["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])),
  487. ],
  488. )
  489. @pytest.mark.parametrize("input", [True, 1, 1.0])
  490. @pytest.mark.parametrize("dtype", [bool, int, float])
  491. @pytest.mark.parametrize("method", ["apply", "aggregate", "transform"])
  492. def test_callable_result_dtype_series(keys, agg_index, input, dtype, method):
  493. # GH 21240
  494. df = DataFrame({"a": [1], "b": [2], "c": [input]})
  495. op = getattr(df.groupby(keys)["c"], method)
  496. result = op(lambda x: x.astype(dtype).iloc[0])
  497. expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index
  498. expected = Series([df["c"].iloc[0]], index=expected_index, name="c").astype(dtype)
  499. tm.assert_series_equal(result, expected)
  500. def test_order_aggregate_multiple_funcs():
  501. # GH 25692
  502. df = DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]})
  503. res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"])
  504. result = res.columns.levels[1]
  505. expected = Index(["sum", "max", "mean", "ohlc", "min"])
  506. tm.assert_index_equal(result, expected)
  507. def test_ohlc_ea_dtypes(any_numeric_ea_dtype):
  508. # GH#37493
  509. df = DataFrame(
  510. {"a": [1, 1, 2, 3, 4, 4], "b": [22, 11, pd.NA, 10, 20, pd.NA]},
  511. dtype=any_numeric_ea_dtype,
  512. )
  513. gb = df.groupby("a")
  514. result = gb.ohlc()
  515. expected = DataFrame(
  516. [[22, 22, 11, 11], [pd.NA] * 4, [10] * 4, [20] * 4],
  517. columns=MultiIndex.from_product([["b"], ["open", "high", "low", "close"]]),
  518. index=Index([1, 2, 3, 4], dtype=any_numeric_ea_dtype, name="a"),
  519. dtype=any_numeric_ea_dtype,
  520. )
  521. tm.assert_frame_equal(result, expected)
  522. gb2 = df.groupby("a", as_index=False)
  523. result2 = gb2.ohlc()
  524. expected2 = expected.reset_index()
  525. tm.assert_frame_equal(result2, expected2)
  526. @pytest.mark.parametrize("dtype", [np.int64, np.uint64])
  527. @pytest.mark.parametrize("how", ["first", "last", "min", "max", "mean", "median"])
  528. def test_uint64_type_handling(dtype, how):
  529. # GH 26310
  530. df = DataFrame({"x": 6903052872240755750, "y": [1, 2]})
  531. expected = df.groupby("y").agg({"x": how})
  532. df.x = df.x.astype(dtype)
  533. result = df.groupby("y").agg({"x": how})
  534. if how not in ("mean", "median"):
  535. # mean and median always result in floats
  536. result.x = result.x.astype(np.int64)
  537. tm.assert_frame_equal(result, expected, check_exact=True)
  538. def test_func_duplicates_raises():
  539. # GH28426
  540. msg = "Function names"
  541. df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
  542. with pytest.raises(SpecificationError, match=msg):
  543. df.groupby("A").agg(["min", "min"])
  544. @pytest.mark.parametrize(
  545. "index",
  546. [
  547. pd.CategoricalIndex(list("abc")),
  548. pd.interval_range(0, 3),
  549. pd.period_range("2020", periods=3, freq="D"),
  550. MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]),
  551. ],
  552. )
  553. def test_agg_index_has_complex_internals(index):
  554. # GH 31223
  555. df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
  556. result = df.groupby("group").agg({"value": Series.nunique})
  557. expected = DataFrame({"group": [1, 2], "value": [2, 1]}).set_index("group")
  558. tm.assert_frame_equal(result, expected)
  559. def test_agg_split_block():
  560. # https://github.com/pandas-dev/pandas/issues/31522
  561. df = DataFrame(
  562. {
  563. "key1": ["a", "a", "b", "b", "a"],
  564. "key2": ["one", "two", "one", "two", "one"],
  565. "key3": ["three", "three", "three", "six", "six"],
  566. }
  567. )
  568. result = df.groupby("key1").min()
  569. expected = DataFrame(
  570. {"key2": ["one", "one"], "key3": ["six", "six"]},
  571. index=Index(["a", "b"], name="key1"),
  572. )
  573. tm.assert_frame_equal(result, expected)
  574. def test_agg_split_object_part_datetime():
  575. # https://github.com/pandas-dev/pandas/pull/31616
  576. df = DataFrame(
  577. {
  578. "A": pd.date_range("2000", periods=4),
  579. "B": ["a", "b", "c", "d"],
  580. "C": [1, 2, 3, 4],
  581. "D": ["b", "c", "d", "e"],
  582. "E": pd.date_range("2000", periods=4),
  583. "F": [1, 2, 3, 4],
  584. }
  585. ).astype(object)
  586. result = df.groupby([0, 0, 0, 0]).min()
  587. expected = DataFrame(
  588. {
  589. "A": [pd.Timestamp("2000")],
  590. "B": ["a"],
  591. "C": [1],
  592. "D": ["b"],
  593. "E": [pd.Timestamp("2000")],
  594. "F": [1],
  595. },
  596. index=np.array([0]),
  597. dtype=object,
  598. )
  599. tm.assert_frame_equal(result, expected)
  600. class TestNamedAggregationSeries:
  601. def test_series_named_agg(self):
  602. df = Series([1, 2, 3, 4])
  603. gr = df.groupby([0, 0, 1, 1])
  604. result = gr.agg(a="sum", b="min")
  605. expected = DataFrame(
  606. {"a": [3, 7], "b": [1, 3]}, columns=["a", "b"], index=np.array([0, 1])
  607. )
  608. tm.assert_frame_equal(result, expected)
  609. result = gr.agg(b="min", a="sum")
  610. expected = expected[["b", "a"]]
  611. tm.assert_frame_equal(result, expected)
  612. def test_no_args_raises(self):
  613. gr = Series([1, 2]).groupby([0, 1])
  614. with pytest.raises(TypeError, match="Must provide"):
  615. gr.agg()
  616. # but we do allow this
  617. result = gr.agg([])
  618. expected = DataFrame(columns=[])
  619. tm.assert_frame_equal(result, expected)
  620. def test_series_named_agg_duplicates_no_raises(self):
  621. # GH28426
  622. gr = Series([1, 2, 3]).groupby([0, 0, 1])
  623. grouped = gr.agg(a="sum", b="sum")
  624. expected = DataFrame({"a": [3, 3], "b": [3, 3]}, index=np.array([0, 1]))
  625. tm.assert_frame_equal(expected, grouped)
  626. def test_mangled(self):
  627. gr = Series([1, 2, 3]).groupby([0, 0, 1])
  628. result = gr.agg(a=lambda x: 0, b=lambda x: 1)
  629. expected = DataFrame({"a": [0, 0], "b": [1, 1]}, index=np.array([0, 1]))
  630. tm.assert_frame_equal(result, expected)
  631. @pytest.mark.parametrize(
  632. "inp",
  633. [
  634. pd.NamedAgg(column="anything", aggfunc="min"),
  635. ("anything", "min"),
  636. ["anything", "min"],
  637. ],
  638. )
  639. def test_named_agg_nametuple(self, inp):
  640. # GH34422
  641. s = Series([1, 1, 2, 2, 3, 3, 4, 5])
  642. msg = f"func is expected but received {type(inp).__name__}"
  643. with pytest.raises(TypeError, match=msg):
  644. s.groupby(s.values).agg(a=inp)
  645. class TestNamedAggregationDataFrame:
  646. def test_agg_relabel(self):
  647. df = DataFrame(
  648. {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
  649. )
  650. result = df.groupby("group").agg(a_max=("A", "max"), b_max=("B", "max"))
  651. expected = DataFrame(
  652. {"a_max": [1, 3], "b_max": [6, 8]},
  653. index=Index(["a", "b"], name="group"),
  654. columns=["a_max", "b_max"],
  655. )
  656. tm.assert_frame_equal(result, expected)
  657. # order invariance
  658. p98 = functools.partial(np.percentile, q=98)
  659. result = df.groupby("group").agg(
  660. b_min=("B", "min"),
  661. a_min=("A", "min"),
  662. a_mean=("A", "mean"),
  663. a_max=("A", "max"),
  664. b_max=("B", "max"),
  665. a_98=("A", p98),
  666. )
  667. expected = DataFrame(
  668. {
  669. "b_min": [5, 7],
  670. "a_min": [0, 2],
  671. "a_mean": [0.5, 2.5],
  672. "a_max": [1, 3],
  673. "b_max": [6, 8],
  674. "a_98": [0.98, 2.98],
  675. },
  676. index=Index(["a", "b"], name="group"),
  677. columns=["b_min", "a_min", "a_mean", "a_max", "b_max", "a_98"],
  678. )
  679. tm.assert_frame_equal(result, expected)
  680. def test_agg_relabel_non_identifier(self):
  681. df = DataFrame(
  682. {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
  683. )
  684. result = df.groupby("group").agg(**{"my col": ("A", "max")})
  685. expected = DataFrame({"my col": [1, 3]}, index=Index(["a", "b"], name="group"))
  686. tm.assert_frame_equal(result, expected)
  687. def test_duplicate_no_raises(self):
  688. # GH 28426, if use same input function on same column,
  689. # no error should raise
  690. df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
  691. grouped = df.groupby("A").agg(a=("B", "min"), b=("B", "min"))
  692. expected = DataFrame({"a": [1, 3], "b": [1, 3]}, index=Index([0, 1], name="A"))
  693. tm.assert_frame_equal(grouped, expected)
  694. quant50 = functools.partial(np.percentile, q=50)
  695. quant70 = functools.partial(np.percentile, q=70)
  696. quant50.__name__ = "quant50"
  697. quant70.__name__ = "quant70"
  698. test = DataFrame({"col1": ["a", "a", "b", "b", "b"], "col2": [1, 2, 3, 4, 5]})
  699. grouped = test.groupby("col1").agg(
  700. quantile_50=("col2", quant50), quantile_70=("col2", quant70)
  701. )
  702. expected = DataFrame(
  703. {"quantile_50": [1.5, 4.0], "quantile_70": [1.7, 4.4]},
  704. index=Index(["a", "b"], name="col1"),
  705. )
  706. tm.assert_frame_equal(grouped, expected)
  707. def test_agg_relabel_with_level(self):
  708. df = DataFrame(
  709. {"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]},
  710. index=MultiIndex.from_product([["A", "B"], ["a", "b"]]),
  711. )
  712. result = df.groupby(level=0).agg(
  713. aa=("A", "max"), bb=("A", "min"), cc=("B", "mean")
  714. )
  715. expected = DataFrame(
  716. {"aa": [0, 1], "bb": [0, 1], "cc": [1.5, 3.5]}, index=["A", "B"]
  717. )
  718. tm.assert_frame_equal(result, expected)
  719. def test_agg_relabel_other_raises(self):
  720. df = DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]})
  721. grouped = df.groupby("A")
  722. match = "Must provide"
  723. with pytest.raises(TypeError, match=match):
  724. grouped.agg(foo=1)
  725. with pytest.raises(TypeError, match=match):
  726. grouped.agg()
  727. with pytest.raises(TypeError, match=match):
  728. grouped.agg(a=("B", "max"), b=(1, 2, 3))
  729. def test_missing_raises(self):
  730. df = DataFrame({"A": [0, 1], "B": [1, 2]})
  731. match = re.escape("Column(s) ['C'] do not exist")
  732. with pytest.raises(KeyError, match=match):
  733. df.groupby("A").agg(c=("C", "sum"))
  734. def test_agg_namedtuple(self):
  735. df = DataFrame({"A": [0, 1], "B": [1, 2]})
  736. result = df.groupby("A").agg(
  737. b=pd.NamedAgg("B", "sum"), c=pd.NamedAgg(column="B", aggfunc="count")
  738. )
  739. expected = df.groupby("A").agg(b=("B", "sum"), c=("B", "count"))
  740. tm.assert_frame_equal(result, expected)
  741. def test_mangled(self):
  742. df = DataFrame({"A": [0, 1], "B": [1, 2], "C": [3, 4]})
  743. result = df.groupby("A").agg(b=("B", lambda x: 0), c=("C", lambda x: 1))
  744. expected = DataFrame({"b": [0, 0], "c": [1, 1]}, index=Index([0, 1], name="A"))
  745. tm.assert_frame_equal(result, expected)
  746. @pytest.mark.parametrize(
  747. "agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3",
  748. [
  749. (
  750. (("y", "A"), "max"),
  751. (("y", "A"), np.mean),
  752. (("y", "B"), "mean"),
  753. [1, 3],
  754. [0.5, 2.5],
  755. [5.5, 7.5],
  756. ),
  757. (
  758. (("y", "A"), lambda x: max(x)),
  759. (("y", "A"), lambda x: 1),
  760. (("y", "B"), np.mean),
  761. [1, 3],
  762. [1, 1],
  763. [5.5, 7.5],
  764. ),
  765. (
  766. pd.NamedAgg(("y", "A"), "max"),
  767. pd.NamedAgg(("y", "B"), np.mean),
  768. pd.NamedAgg(("y", "A"), lambda x: 1),
  769. [1, 3],
  770. [5.5, 7.5],
  771. [1, 1],
  772. ),
  773. ],
  774. )
  775. def test_agg_relabel_multiindex_column(
  776. agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3
  777. ):
  778. # GH 29422, add tests for multiindex column cases
  779. df = DataFrame(
  780. {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
  781. )
  782. df.columns = MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")])
  783. idx = Index(["a", "b"], name=("x", "group"))
  784. result = df.groupby(("x", "group")).agg(a_max=(("y", "A"), "max"))
  785. expected = DataFrame({"a_max": [1, 3]}, index=idx)
  786. tm.assert_frame_equal(result, expected)
  787. msg = "is currently using SeriesGroupBy.mean"
  788. with tm.assert_produces_warning(FutureWarning, match=msg):
  789. result = df.groupby(("x", "group")).agg(
  790. col_1=agg_col1, col_2=agg_col2, col_3=agg_col3
  791. )
  792. expected = DataFrame(
  793. {"col_1": agg_result1, "col_2": agg_result2, "col_3": agg_result3}, index=idx
  794. )
  795. tm.assert_frame_equal(result, expected)
  796. def test_agg_relabel_multiindex_raises_not_exist():
  797. # GH 29422, add test for raises scenario when aggregate column does not exist
  798. df = DataFrame(
  799. {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
  800. )
  801. df.columns = MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")])
  802. with pytest.raises(KeyError, match="do not exist"):
  803. df.groupby(("x", "group")).agg(a=(("Y", "a"), "max"))
  804. def test_agg_relabel_multiindex_duplicates():
  805. # GH29422, add test for raises scenario when getting duplicates
  806. # GH28426, after this change, duplicates should also work if the relabelling is
  807. # different
  808. df = DataFrame(
  809. {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
  810. )
  811. df.columns = MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")])
  812. result = df.groupby(("x", "group")).agg(
  813. a=(("y", "A"), "min"), b=(("y", "A"), "min")
  814. )
  815. idx = Index(["a", "b"], name=("x", "group"))
  816. expected = DataFrame({"a": [0, 2], "b": [0, 2]}, index=idx)
  817. tm.assert_frame_equal(result, expected)
  818. @pytest.mark.parametrize("kwargs", [{"c": ["min"]}, {"b": [], "c": ["min"]}])
  819. def test_groupby_aggregate_empty_key(kwargs):
  820. # GH: 32580
  821. df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]})
  822. result = df.groupby("a").agg(kwargs)
  823. expected = DataFrame(
  824. [1, 4],
  825. index=Index([1, 2], dtype="int64", name="a"),
  826. columns=MultiIndex.from_tuples([["c", "min"]]),
  827. )
  828. tm.assert_frame_equal(result, expected)
  829. def test_groupby_aggregate_empty_key_empty_return():
  830. # GH: 32580 Check if everything works, when return is empty
  831. df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]})
  832. result = df.groupby("a").agg({"b": []})
  833. expected = DataFrame(columns=MultiIndex(levels=[["b"], []], codes=[[], []]))
  834. tm.assert_frame_equal(result, expected)
  835. def test_groupby_aggregate_empty_with_multiindex_frame():
  836. # GH 39178
  837. df = DataFrame(columns=["a", "b", "c"])
  838. result = df.groupby(["a", "b"], group_keys=False).agg(d=("c", list))
  839. expected = DataFrame(
  840. columns=["d"], index=MultiIndex([[], []], [[], []], names=["a", "b"])
  841. )
  842. tm.assert_frame_equal(result, expected)
  843. def test_grouby_agg_loses_results_with_as_index_false_relabel():
  844. # GH 32240: When the aggregate function relabels column names and
  845. # as_index=False is specified, the results are dropped.
  846. df = DataFrame(
  847. {"key": ["x", "y", "z", "x", "y", "z"], "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75]}
  848. )
  849. grouped = df.groupby("key", as_index=False)
  850. result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min"))
  851. expected = DataFrame({"key": ["x", "y", "z"], "min_val": [1.0, 0.8, 0.75]})
  852. tm.assert_frame_equal(result, expected)
  853. def test_grouby_agg_loses_results_with_as_index_false_relabel_multiindex():
  854. # GH 32240: When the aggregate function relabels column names and
  855. # as_index=False is specified, the results are dropped. Check if
  856. # multiindex is returned in the right order
  857. df = DataFrame(
  858. {
  859. "key": ["x", "y", "x", "y", "x", "x"],
  860. "key1": ["a", "b", "c", "b", "a", "c"],
  861. "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75],
  862. }
  863. )
  864. grouped = df.groupby(["key", "key1"], as_index=False)
  865. result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min"))
  866. expected = DataFrame(
  867. {"key": ["x", "x", "y"], "key1": ["a", "c", "b"], "min_val": [1.0, 0.75, 0.8]}
  868. )
  869. tm.assert_frame_equal(result, expected)
  870. @pytest.mark.parametrize(
  871. "func", [lambda s: s.mean(), lambda s: np.mean(s), lambda s: np.nanmean(s)]
  872. )
  873. def test_multiindex_custom_func(func):
  874. # GH 31777
  875. data = [[1, 4, 2], [5, 7, 1]]
  876. df = DataFrame(
  877. data,
  878. columns=MultiIndex.from_arrays(
  879. [[1, 1, 2], [3, 4, 3]], names=["Sisko", "Janeway"]
  880. ),
  881. )
  882. result = df.groupby(np.array([0, 1])).agg(func)
  883. expected_dict = {
  884. (1, 3): {0: 1.0, 1: 5.0},
  885. (1, 4): {0: 4.0, 1: 7.0},
  886. (2, 3): {0: 2.0, 1: 1.0},
  887. }
  888. expected = DataFrame(expected_dict, index=np.array([0, 1]), columns=df.columns)
  889. tm.assert_frame_equal(result, expected)
  890. def myfunc(s):
  891. return np.percentile(s, q=0.90)
  892. @pytest.mark.parametrize("func", [lambda s: np.percentile(s, q=0.90), myfunc])
  893. def test_lambda_named_agg(func):
  894. # see gh-28467
  895. animals = DataFrame(
  896. {
  897. "kind": ["cat", "dog", "cat", "dog"],
  898. "height": [9.1, 6.0, 9.5, 34.0],
  899. "weight": [7.9, 7.5, 9.9, 198.0],
  900. }
  901. )
  902. result = animals.groupby("kind").agg(
  903. mean_height=("height", "mean"), perc90=("height", func)
  904. )
  905. expected = DataFrame(
  906. [[9.3, 9.1036], [20.0, 6.252]],
  907. columns=["mean_height", "perc90"],
  908. index=Index(["cat", "dog"], name="kind"),
  909. )
  910. tm.assert_frame_equal(result, expected)
  911. def test_aggregate_mixed_types():
  912. # GH 16916
  913. df = DataFrame(
  914. data=np.array([0] * 9).reshape(3, 3), columns=list("XYZ"), index=list("abc")
  915. )
  916. df["grouping"] = ["group 1", "group 1", 2]
  917. result = df.groupby("grouping").aggregate(lambda x: x.tolist())
  918. expected_data = [[[0], [0], [0]], [[0, 0], [0, 0], [0, 0]]]
  919. expected = DataFrame(
  920. expected_data,
  921. index=Index([2, "group 1"], dtype="object", name="grouping"),
  922. columns=Index(["X", "Y", "Z"]),
  923. )
  924. tm.assert_frame_equal(result, expected)
  925. @pytest.mark.xfail(reason="Not implemented;see GH 31256")
  926. def test_aggregate_udf_na_extension_type():
  927. # https://github.com/pandas-dev/pandas/pull/31359
  928. # This is currently failing to cast back to Int64Dtype.
  929. # The presence of the NA causes two problems
  930. # 1. NA is not an instance of Int64Dtype.type (numpy.int64)
  931. # 2. The presence of an NA forces object type, so the non-NA values is
  932. # a Python int rather than a NumPy int64. Python ints aren't
  933. # instances of numpy.int64.
  934. def aggfunc(x):
  935. if all(x > 2):
  936. return 1
  937. else:
  938. return pd.NA
  939. df = DataFrame({"A": pd.array([1, 2, 3])})
  940. result = df.groupby([1, 1, 2]).agg(aggfunc)
  941. expected = DataFrame({"A": pd.array([1, pd.NA], dtype="Int64")}, index=[1, 2])
  942. tm.assert_frame_equal(result, expected)
  943. class TestLambdaMangling:
  944. def test_basic(self):
  945. df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
  946. result = df.groupby("A").agg({"B": [lambda x: 0, lambda x: 1]})
  947. expected = DataFrame(
  948. {("B", "<lambda_0>"): [0, 0], ("B", "<lambda_1>"): [1, 1]},
  949. index=Index([0, 1], name="A"),
  950. )
  951. tm.assert_frame_equal(result, expected)
  952. def test_mangle_series_groupby(self):
  953. gr = Series([1, 2, 3, 4]).groupby([0, 0, 1, 1])
  954. result = gr.agg([lambda x: 0, lambda x: 1])
  955. exp_data = {"<lambda_0>": [0, 0], "<lambda_1>": [1, 1]}
  956. expected = DataFrame(exp_data, index=np.array([0, 1]))
  957. tm.assert_frame_equal(result, expected)
  958. @pytest.mark.xfail(reason="GH-26611. kwargs for multi-agg.")
  959. def test_with_kwargs(self):
  960. f1 = lambda x, y, b=1: x.sum() + y + b
  961. f2 = lambda x, y, b=2: x.sum() + y * b
  962. result = Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0)
  963. expected = DataFrame({"<lambda_0>": [4], "<lambda_1>": [6]})
  964. tm.assert_frame_equal(result, expected)
  965. result = Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10)
  966. expected = DataFrame({"<lambda_0>": [13], "<lambda_1>": [30]})
  967. tm.assert_frame_equal(result, expected)
  968. def test_agg_with_one_lambda(self):
  969. # GH 25719, write tests for DataFrameGroupby.agg with only one lambda
  970. df = DataFrame(
  971. {
  972. "kind": ["cat", "dog", "cat", "dog"],
  973. "height": [9.1, 6.0, 9.5, 34.0],
  974. "weight": [7.9, 7.5, 9.9, 198.0],
  975. }
  976. )
  977. columns = ["height_sqr_min", "height_max", "weight_max"]
  978. expected = DataFrame(
  979. {
  980. "height_sqr_min": [82.81, 36.00],
  981. "height_max": [9.5, 34.0],
  982. "weight_max": [9.9, 198.0],
  983. },
  984. index=Index(["cat", "dog"], name="kind"),
  985. columns=columns,
  986. )
  987. # check pd.NameAgg case
  988. result1 = df.groupby(by="kind").agg(
  989. height_sqr_min=pd.NamedAgg(
  990. column="height", aggfunc=lambda x: np.min(x**2)
  991. ),
  992. height_max=pd.NamedAgg(column="height", aggfunc="max"),
  993. weight_max=pd.NamedAgg(column="weight", aggfunc="max"),
  994. )
  995. tm.assert_frame_equal(result1, expected)
  996. # check agg(key=(col, aggfunc)) case
  997. result2 = df.groupby(by="kind").agg(
  998. height_sqr_min=("height", lambda x: np.min(x**2)),
  999. height_max=("height", "max"),
  1000. weight_max=("weight", "max"),
  1001. )
  1002. tm.assert_frame_equal(result2, expected)
  1003. def test_agg_multiple_lambda(self):
  1004. # GH25719, test for DataFrameGroupby.agg with multiple lambdas
  1005. # with mixed aggfunc
  1006. df = DataFrame(
  1007. {
  1008. "kind": ["cat", "dog", "cat", "dog"],
  1009. "height": [9.1, 6.0, 9.5, 34.0],
  1010. "weight": [7.9, 7.5, 9.9, 198.0],
  1011. }
  1012. )
  1013. columns = [
  1014. "height_sqr_min",
  1015. "height_max",
  1016. "weight_max",
  1017. "height_max_2",
  1018. "weight_min",
  1019. ]
  1020. expected = DataFrame(
  1021. {
  1022. "height_sqr_min": [82.81, 36.00],
  1023. "height_max": [9.5, 34.0],
  1024. "weight_max": [9.9, 198.0],
  1025. "height_max_2": [9.5, 34.0],
  1026. "weight_min": [7.9, 7.5],
  1027. },
  1028. index=Index(["cat", "dog"], name="kind"),
  1029. columns=columns,
  1030. )
  1031. # check agg(key=(col, aggfunc)) case
  1032. result1 = df.groupby(by="kind").agg(
  1033. height_sqr_min=("height", lambda x: np.min(x**2)),
  1034. height_max=("height", "max"),
  1035. weight_max=("weight", "max"),
  1036. height_max_2=("height", lambda x: np.max(x)),
  1037. weight_min=("weight", lambda x: np.min(x)),
  1038. )
  1039. tm.assert_frame_equal(result1, expected)
  1040. # check pd.NamedAgg case
  1041. result2 = df.groupby(by="kind").agg(
  1042. height_sqr_min=pd.NamedAgg(
  1043. column="height", aggfunc=lambda x: np.min(x**2)
  1044. ),
  1045. height_max=pd.NamedAgg(column="height", aggfunc="max"),
  1046. weight_max=pd.NamedAgg(column="weight", aggfunc="max"),
  1047. height_max_2=pd.NamedAgg(column="height", aggfunc=lambda x: np.max(x)),
  1048. weight_min=pd.NamedAgg(column="weight", aggfunc=lambda x: np.min(x)),
  1049. )
  1050. tm.assert_frame_equal(result2, expected)
  1051. def test_groupby_get_by_index():
  1052. # GH 33439
  1053. df = DataFrame({"A": ["S", "W", "W"], "B": [1.0, 1.0, 2.0]})
  1054. res = df.groupby("A").agg({"B": lambda x: x.get(x.index[-1])})
  1055. expected = DataFrame({"A": ["S", "W"], "B": [1.0, 2.0]}).set_index("A")
  1056. tm.assert_frame_equal(res, expected)
  1057. @pytest.mark.parametrize(
  1058. "grp_col_dict, exp_data",
  1059. [
  1060. ({"nr": "min", "cat_ord": "min"}, {"nr": [1, 5], "cat_ord": ["a", "c"]}),
  1061. ({"cat_ord": "min"}, {"cat_ord": ["a", "c"]}),
  1062. ({"nr": "min"}, {"nr": [1, 5]}),
  1063. ],
  1064. )
  1065. def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data):
  1066. # test single aggregations on ordered categorical cols GHGH27800
  1067. # create the result dataframe
  1068. input_df = DataFrame(
  1069. {
  1070. "nr": [1, 2, 3, 4, 5, 6, 7, 8],
  1071. "cat_ord": list("aabbccdd"),
  1072. "cat": list("aaaabbbb"),
  1073. }
  1074. )
  1075. input_df = input_df.astype({"cat": "category", "cat_ord": "category"})
  1076. input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered()
  1077. result_df = input_df.groupby("cat", observed=False).agg(grp_col_dict)
  1078. # create expected dataframe
  1079. cat_index = pd.CategoricalIndex(
  1080. ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category"
  1081. )
  1082. expected_df = DataFrame(data=exp_data, index=cat_index)
  1083. if "cat_ord" in expected_df:
  1084. # ordered categorical columns should be preserved
  1085. dtype = input_df["cat_ord"].dtype
  1086. expected_df["cat_ord"] = expected_df["cat_ord"].astype(dtype)
  1087. tm.assert_frame_equal(result_df, expected_df)
  1088. @pytest.mark.parametrize(
  1089. "grp_col_dict, exp_data",
  1090. [
  1091. ({"nr": ["min", "max"], "cat_ord": "min"}, [(1, 4, "a"), (5, 8, "c")]),
  1092. ({"nr": "min", "cat_ord": ["min", "max"]}, [(1, "a", "b"), (5, "c", "d")]),
  1093. ({"cat_ord": ["min", "max"]}, [("a", "b"), ("c", "d")]),
  1094. ],
  1095. )
  1096. def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data):
  1097. # test combined aggregations on ordered categorical cols GH27800
  1098. # create the result dataframe
  1099. input_df = DataFrame(
  1100. {
  1101. "nr": [1, 2, 3, 4, 5, 6, 7, 8],
  1102. "cat_ord": list("aabbccdd"),
  1103. "cat": list("aaaabbbb"),
  1104. }
  1105. )
  1106. input_df = input_df.astype({"cat": "category", "cat_ord": "category"})
  1107. input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered()
  1108. result_df = input_df.groupby("cat", observed=False).agg(grp_col_dict)
  1109. # create expected dataframe
  1110. cat_index = pd.CategoricalIndex(
  1111. ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category"
  1112. )
  1113. # unpack the grp_col_dict to create the multi-index tuple
  1114. # this tuple will be used to create the expected dataframe index
  1115. multi_index_list = []
  1116. for k, v in grp_col_dict.items():
  1117. if isinstance(v, list):
  1118. multi_index_list.extend([k, value] for value in v)
  1119. else:
  1120. multi_index_list.append([k, v])
  1121. multi_index = MultiIndex.from_tuples(tuple(multi_index_list))
  1122. expected_df = DataFrame(data=exp_data, columns=multi_index, index=cat_index)
  1123. for col in expected_df.columns:
  1124. if isinstance(col, tuple) and "cat_ord" in col:
  1125. # ordered categorical should be preserved
  1126. expected_df[col] = expected_df[col].astype(input_df["cat_ord"].dtype)
  1127. tm.assert_frame_equal(result_df, expected_df)
  1128. def test_nonagg_agg():
  1129. # GH 35490 - Single/Multiple agg of non-agg function give same results
  1130. # TODO: agg should raise for functions that don't aggregate
  1131. df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 2, 1]})
  1132. g = df.groupby("a")
  1133. result = g.agg(["cumsum"])
  1134. result.columns = result.columns.droplevel(-1)
  1135. expected = g.agg("cumsum")
  1136. tm.assert_frame_equal(result, expected)
  1137. def test_aggregate_datetime_objects():
  1138. # https://github.com/pandas-dev/pandas/issues/36003
  1139. # ensure we don't raise an error but keep object dtype for out-of-bounds
  1140. # datetimes
  1141. df = DataFrame(
  1142. {
  1143. "A": ["X", "Y"],
  1144. "B": [
  1145. datetime.datetime(2005, 1, 1, 10, 30, 23, 540000),
  1146. datetime.datetime(3005, 1, 1, 10, 30, 23, 540000),
  1147. ],
  1148. }
  1149. )
  1150. result = df.groupby("A").B.max()
  1151. expected = df.set_index("A")["B"]
  1152. tm.assert_series_equal(result, expected)
  1153. def test_groupby_index_object_dtype():
  1154. # GH 40014
  1155. df = DataFrame({"c0": ["x", "x", "x"], "c1": ["x", "x", "y"], "p": [0, 1, 2]})
  1156. df.index = df.index.astype("O")
  1157. grouped = df.groupby(["c0", "c1"])
  1158. res = grouped.p.agg(lambda x: all(x > 0))
  1159. # Check that providing a user-defined function in agg()
  1160. # produces the correct index shape when using an object-typed index.
  1161. expected_index = MultiIndex.from_tuples(
  1162. [("x", "x"), ("x", "y")], names=("c0", "c1")
  1163. )
  1164. expected = Series([False, True], index=expected_index, name="p")
  1165. tm.assert_series_equal(res, expected)
  1166. def test_timeseries_groupby_agg():
  1167. # GH#43290
  1168. def func(ser):
  1169. if ser.isna().all():
  1170. return None
  1171. return np.sum(ser)
  1172. df = DataFrame([1.0], index=[pd.Timestamp("2018-01-16 00:00:00+00:00")])
  1173. res = df.groupby(lambda x: 1).agg(func)
  1174. expected = DataFrame([[1.0]], index=[1])
  1175. tm.assert_frame_equal(res, expected)
  1176. def test_groupby_agg_precision(any_real_numeric_dtype):
  1177. if any_real_numeric_dtype in tm.ALL_INT_NUMPY_DTYPES:
  1178. max_value = np.iinfo(any_real_numeric_dtype).max
  1179. if any_real_numeric_dtype in tm.FLOAT_NUMPY_DTYPES:
  1180. max_value = np.finfo(any_real_numeric_dtype).max
  1181. if any_real_numeric_dtype in tm.FLOAT_EA_DTYPES:
  1182. max_value = np.finfo(any_real_numeric_dtype.lower()).max
  1183. if any_real_numeric_dtype in tm.ALL_INT_EA_DTYPES:
  1184. max_value = np.iinfo(any_real_numeric_dtype.lower()).max
  1185. df = DataFrame(
  1186. {
  1187. "key1": ["a"],
  1188. "key2": ["b"],
  1189. "key3": pd.array([max_value], dtype=any_real_numeric_dtype),
  1190. }
  1191. )
  1192. arrays = [["a"], ["b"]]
  1193. index = MultiIndex.from_arrays(arrays, names=("key1", "key2"))
  1194. expected = DataFrame(
  1195. {"key3": pd.array([max_value], dtype=any_real_numeric_dtype)}, index=index
  1196. )
  1197. result = df.groupby(["key1", "key2"]).agg(lambda x: x)
  1198. tm.assert_frame_equal(result, expected)
  1199. def test_groupby_aggregate_directory(reduction_func):
  1200. # GH#32793
  1201. if reduction_func in ["corrwith", "nth"]:
  1202. return None
  1203. obj = DataFrame([[0, 1], [0, np.nan]])
  1204. result_reduced_series = obj.groupby(0).agg(reduction_func)
  1205. result_reduced_frame = obj.groupby(0).agg({1: reduction_func})
  1206. if reduction_func in ["size", "ngroup"]:
  1207. # names are different: None / 1
  1208. tm.assert_series_equal(
  1209. result_reduced_series, result_reduced_frame[1], check_names=False
  1210. )
  1211. else:
  1212. tm.assert_frame_equal(result_reduced_series, result_reduced_frame)
  1213. tm.assert_series_equal(
  1214. result_reduced_series.dtypes, result_reduced_frame.dtypes
  1215. )
  1216. def test_group_mean_timedelta_nat():
  1217. # GH43132
  1218. data = Series(["1 day", "3 days", "NaT"], dtype="timedelta64[ns]")
  1219. expected = Series(["2 days"], dtype="timedelta64[ns]", index=np.array([0]))
  1220. result = data.groupby([0, 0, 0]).mean()
  1221. tm.assert_series_equal(result, expected)
  1222. @pytest.mark.parametrize(
  1223. "input_data, expected_output",
  1224. [
  1225. ( # no timezone
  1226. ["2021-01-01T00:00", "NaT", "2021-01-01T02:00"],
  1227. ["2021-01-01T01:00"],
  1228. ),
  1229. ( # timezone
  1230. ["2021-01-01T00:00-0100", "NaT", "2021-01-01T02:00-0100"],
  1231. ["2021-01-01T01:00-0100"],
  1232. ),
  1233. ],
  1234. )
  1235. def test_group_mean_datetime64_nat(input_data, expected_output):
  1236. # GH43132
  1237. data = to_datetime(Series(input_data))
  1238. expected = to_datetime(Series(expected_output, index=np.array([0])))
  1239. result = data.groupby([0, 0, 0]).mean()
  1240. tm.assert_series_equal(result, expected)
  1241. @pytest.mark.parametrize(
  1242. "func, output", [("mean", [8 + 18j, 10 + 22j]), ("sum", [40 + 90j, 50 + 110j])]
  1243. )
  1244. def test_groupby_complex(func, output):
  1245. # GH#43701
  1246. data = Series(np.arange(20).reshape(10, 2).dot([1, 2j]))
  1247. result = data.groupby(data.index % 2).agg(func)
  1248. expected = Series(output)
  1249. tm.assert_series_equal(result, expected)
  1250. @pytest.mark.parametrize("func", ["min", "max", "var"])
  1251. def test_groupby_complex_raises(func):
  1252. # GH#43701
  1253. data = Series(np.arange(20).reshape(10, 2).dot([1, 2j]))
  1254. msg = "No matching signature found"
  1255. with pytest.raises(TypeError, match=msg):
  1256. data.groupby(data.index % 2).agg(func)
  1257. @pytest.mark.parametrize(
  1258. "func", [["min"], ["mean", "max"], {"b": "sum"}, {"b": "prod", "c": "median"}]
  1259. )
  1260. def test_multi_axis_1_raises(func):
  1261. # GH#46995
  1262. df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5], "c": [6, 7, 8]})
  1263. msg = "DataFrame.groupby with axis=1 is deprecated"
  1264. with tm.assert_produces_warning(FutureWarning, match=msg):
  1265. gb = df.groupby("a", axis=1)
  1266. with pytest.raises(NotImplementedError, match="axis other than 0 is not supported"):
  1267. gb.agg(func)
  1268. @pytest.mark.parametrize(
  1269. "test, constant",
  1270. [
  1271. ([[20, "A"], [20, "B"], [10, "C"]], {0: [10, 20], 1: ["C", ["A", "B"]]}),
  1272. ([[20, "A"], [20, "B"], [30, "C"]], {0: [20, 30], 1: [["A", "B"], "C"]}),
  1273. ([["a", 1], ["a", 1], ["b", 2], ["b", 3]], {0: ["a", "b"], 1: [1, [2, 3]]}),
  1274. pytest.param(
  1275. [["a", 1], ["a", 2], ["b", 3], ["b", 3]],
  1276. {0: ["a", "b"], 1: [[1, 2], 3]},
  1277. marks=pytest.mark.xfail,
  1278. ),
  1279. ],
  1280. )
  1281. def test_agg_of_mode_list(test, constant):
  1282. # GH#25581
  1283. df1 = DataFrame(test)
  1284. result = df1.groupby(0).agg(Series.mode)
  1285. # Mode usually only returns 1 value, but can return a list in the case of a tie.
  1286. expected = DataFrame(constant)
  1287. expected = expected.set_index(0)
  1288. tm.assert_frame_equal(result, expected)
  1289. def test_dataframe_groupy_agg_list_like_func_with_args():
  1290. # GH#50624
  1291. df = DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})
  1292. gb = df.groupby("y")
  1293. def foo1(x, a=1, c=0):
  1294. return x.sum() + a + c
  1295. def foo2(x, b=2, c=0):
  1296. return x.sum() + b + c
  1297. msg = r"foo1\(\) got an unexpected keyword argument 'b'"
  1298. with pytest.raises(TypeError, match=msg):
  1299. gb.agg([foo1, foo2], 3, b=3, c=4)
  1300. result = gb.agg([foo1, foo2], 3, c=4)
  1301. expected = DataFrame(
  1302. [[8, 8], [9, 9], [10, 10]],
  1303. index=Index(["a", "b", "c"], name="y"),
  1304. columns=MultiIndex.from_tuples([("x", "foo1"), ("x", "foo2")]),
  1305. )
  1306. tm.assert_frame_equal(result, expected)
  1307. def test_series_groupy_agg_list_like_func_with_args():
  1308. # GH#50624
  1309. s = Series([1, 2, 3])
  1310. sgb = s.groupby(s)
  1311. def foo1(x, a=1, c=0):
  1312. return x.sum() + a + c
  1313. def foo2(x, b=2, c=0):
  1314. return x.sum() + b + c
  1315. msg = r"foo1\(\) got an unexpected keyword argument 'b'"
  1316. with pytest.raises(TypeError, match=msg):
  1317. sgb.agg([foo1, foo2], 3, b=3, c=4)
  1318. result = sgb.agg([foo1, foo2], 3, c=4)
  1319. expected = DataFrame(
  1320. [[8, 8], [9, 9], [10, 10]], index=Index([1, 2, 3]), columns=["foo1", "foo2"]
  1321. )
  1322. tm.assert_frame_equal(result, expected)
  1323. def test_agg_groupings_selection():
  1324. # GH#51186 - a selected grouping should be in the output of agg
  1325. df = DataFrame({"a": [1, 1, 2], "b": [3, 3, 4], "c": [5, 6, 7]})
  1326. gb = df.groupby(["a", "b"])
  1327. selected_gb = gb[["b", "c"]]
  1328. result = selected_gb.agg(lambda x: x.sum())
  1329. index = MultiIndex(
  1330. levels=[[1, 2], [3, 4]], codes=[[0, 1], [0, 1]], names=["a", "b"]
  1331. )
  1332. expected = DataFrame({"b": [6, 4], "c": [11, 7]}, index=index)
  1333. tm.assert_frame_equal(result, expected)
  1334. def test_agg_multiple_with_as_index_false_subset_to_a_single_column():
  1335. # GH#50724
  1336. df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
  1337. gb = df.groupby("a", as_index=False)["b"]
  1338. result = gb.agg(["sum", "mean"])
  1339. expected = DataFrame({"a": [1, 2], "sum": [7, 5], "mean": [3.5, 5.0]})
  1340. tm.assert_frame_equal(result, expected)
  1341. def test_agg_with_as_index_false_with_list():
  1342. # GH#52849
  1343. df = DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]})
  1344. gb = df.groupby(by=["a1", "a2"], as_index=False)
  1345. result = gb.agg(["sum"])
  1346. expected = DataFrame(
  1347. data=[[0, 2, 4], [0, 3, 5], [1, 3, 6]],
  1348. columns=MultiIndex.from_tuples([("a1", ""), ("a2", ""), ("b", "sum")]),
  1349. )
  1350. tm.assert_frame_equal(result, expected)
  1351. def test_groupby_agg_extension_timedelta_cumsum_with_named_aggregation():
  1352. # GH#41720
  1353. expected = DataFrame(
  1354. {
  1355. "td": {
  1356. 0: pd.Timedelta("0 days 01:00:00"),
  1357. 1: pd.Timedelta("0 days 01:15:00"),
  1358. 2: pd.Timedelta("0 days 01:15:00"),
  1359. }
  1360. }
  1361. )
  1362. df = DataFrame(
  1363. {
  1364. "td": Series(
  1365. ["0 days 01:00:00", "0 days 00:15:00", "0 days 01:15:00"],
  1366. dtype="timedelta64[ns]",
  1367. ),
  1368. "grps": ["a", "a", "b"],
  1369. }
  1370. )
  1371. gb = df.groupby("grps")
  1372. result = gb.agg(td=("td", "cumsum"))
  1373. tm.assert_frame_equal(result, expected)
  1374. def test_groupby_aggregation_empty_group():
  1375. # https://github.com/pandas-dev/pandas/issues/18869
  1376. def func(x):
  1377. if len(x) == 0:
  1378. raise ValueError("length must not be 0")
  1379. return len(x)
  1380. df = DataFrame(
  1381. {"A": pd.Categorical(["a", "a"], categories=["a", "b", "c"]), "B": [1, 1]}
  1382. )
  1383. msg = "length must not be 0"
  1384. with pytest.raises(ValueError, match=msg):
  1385. df.groupby("A", observed=False).agg(func)