test_apply.py 53 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605
  1. from datetime import (
  2. date,
  3. datetime,
  4. )
  5. import numpy as np
  6. import pytest
  7. from pandas._config import using_string_dtype
  8. import pandas as pd
  9. from pandas import (
  10. DataFrame,
  11. Index,
  12. MultiIndex,
  13. Series,
  14. bdate_range,
  15. )
  16. import pandas._testing as tm
  17. from pandas.tests.groupby import get_groupby_method_args
  18. def test_apply_func_that_appends_group_to_list_without_copy():
  19. # GH: 17718
  20. df = DataFrame(1, index=list(range(10)) * 10, columns=[0]).reset_index()
  21. groups = []
  22. def store(group):
  23. groups.append(group)
  24. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  25. with tm.assert_produces_warning(FutureWarning, match=msg):
  26. df.groupby("index").apply(store)
  27. expected_value = DataFrame(
  28. {"index": [0] * 10, 0: [1] * 10}, index=pd.RangeIndex(0, 100, 10)
  29. )
  30. tm.assert_frame_equal(groups[0], expected_value)
  31. def test_apply_index_date(using_infer_string):
  32. # GH 5788
  33. ts = [
  34. "2011-05-16 00:00",
  35. "2011-05-16 01:00",
  36. "2011-05-16 02:00",
  37. "2011-05-16 03:00",
  38. "2011-05-17 02:00",
  39. "2011-05-17 03:00",
  40. "2011-05-17 04:00",
  41. "2011-05-17 05:00",
  42. "2011-05-18 02:00",
  43. "2011-05-18 03:00",
  44. "2011-05-18 04:00",
  45. "2011-05-18 05:00",
  46. ]
  47. df = DataFrame(
  48. {
  49. "value": [
  50. 1.40893,
  51. 1.40760,
  52. 1.40750,
  53. 1.40649,
  54. 1.40893,
  55. 1.40760,
  56. 1.40750,
  57. 1.40649,
  58. 1.40893,
  59. 1.40760,
  60. 1.40750,
  61. 1.40649,
  62. ],
  63. },
  64. index=Index(pd.to_datetime(ts), name="date_time"),
  65. )
  66. expected = df.groupby(df.index.date).idxmax()
  67. result = df.groupby(df.index.date).apply(lambda x: x.idxmax())
  68. tm.assert_frame_equal(result, expected)
  69. def test_apply_index_date_object():
  70. # GH 5789
  71. # don't auto coerce dates
  72. ts = [
  73. "2011-05-16 00:00",
  74. "2011-05-16 01:00",
  75. "2011-05-16 02:00",
  76. "2011-05-16 03:00",
  77. "2011-05-17 02:00",
  78. "2011-05-17 03:00",
  79. "2011-05-17 04:00",
  80. "2011-05-17 05:00",
  81. "2011-05-18 02:00",
  82. "2011-05-18 03:00",
  83. "2011-05-18 04:00",
  84. "2011-05-18 05:00",
  85. ]
  86. df = DataFrame([row.split() for row in ts], columns=["date", "time"])
  87. df["value"] = [
  88. 1.40893,
  89. 1.40760,
  90. 1.40750,
  91. 1.40649,
  92. 1.40893,
  93. 1.40760,
  94. 1.40750,
  95. 1.40649,
  96. 1.40893,
  97. 1.40760,
  98. 1.40750,
  99. 1.40649,
  100. ]
  101. exp_idx = Index(["2011-05-16", "2011-05-17", "2011-05-18"], name="date")
  102. expected = Series(["00:00", "02:00", "02:00"], index=exp_idx)
  103. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  104. with tm.assert_produces_warning(FutureWarning, match=msg):
  105. result = df.groupby("date", group_keys=False).apply(
  106. lambda x: x["time"][x["value"].idxmax()]
  107. )
  108. tm.assert_series_equal(result, expected)
  109. def test_apply_trivial(using_infer_string):
  110. # GH 20066
  111. # trivial apply: ignore input and return a constant dataframe.
  112. df = DataFrame(
  113. {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
  114. columns=["key", "data"],
  115. )
  116. dtype = "str" if using_infer_string else "object"
  117. expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", dtype])
  118. msg = "DataFrame.groupby with axis=1 is deprecated"
  119. with tm.assert_produces_warning(FutureWarning, match=msg):
  120. gb = df.groupby([str(x) for x in df.dtypes], axis=1)
  121. result = gb.apply(lambda x: df.iloc[1:])
  122. tm.assert_frame_equal(result, expected)
  123. def test_apply_trivial_fail(using_infer_string):
  124. # GH 20066
  125. df = DataFrame(
  126. {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
  127. columns=["key", "data"],
  128. )
  129. dtype = "str" if using_infer_string else "object"
  130. expected = pd.concat([df, df], axis=1, keys=["float64", dtype])
  131. msg = "DataFrame.groupby with axis=1 is deprecated"
  132. with tm.assert_produces_warning(FutureWarning, match=msg):
  133. gb = df.groupby([str(x) for x in df.dtypes], axis=1, group_keys=True)
  134. result = gb.apply(lambda x: df)
  135. tm.assert_frame_equal(result, expected)
  136. @pytest.mark.parametrize(
  137. "df, group_names",
  138. [
  139. (DataFrame({"a": [1, 1, 1, 2, 3], "b": ["a", "a", "a", "b", "c"]}), [1, 2, 3]),
  140. (DataFrame({"a": [0, 0, 1, 1], "b": [0, 1, 0, 1]}), [0, 1]),
  141. (DataFrame({"a": [1]}), [1]),
  142. (DataFrame({"a": [1, 1, 1, 2, 2, 1, 1, 2], "b": range(8)}), [1, 2]),
  143. (DataFrame({"a": [1, 2, 3, 1, 2, 3], "two": [4, 5, 6, 7, 8, 9]}), [1, 2, 3]),
  144. (
  145. DataFrame(
  146. {
  147. "a": list("aaabbbcccc"),
  148. "B": [3, 4, 3, 6, 5, 2, 1, 9, 5, 4],
  149. "C": [4, 0, 2, 2, 2, 7, 8, 6, 2, 8],
  150. }
  151. ),
  152. ["a", "b", "c"],
  153. ),
  154. (DataFrame([[1, 2, 3], [2, 2, 3]], columns=["a", "b", "c"]), [1, 2]),
  155. ],
  156. ids=[
  157. "GH2936",
  158. "GH7739 & GH10519",
  159. "GH10519",
  160. "GH2656",
  161. "GH12155",
  162. "GH20084",
  163. "GH21417",
  164. ],
  165. )
  166. def test_group_apply_once_per_group(df, group_names):
  167. # GH2936, GH7739, GH10519, GH2656, GH12155, GH20084, GH21417
  168. # This test should ensure that a function is only evaluated
  169. # once per group. Previously the function has been evaluated twice
  170. # on the first group to check if the Cython index slider is safe to use
  171. # This test ensures that the side effect (append to list) is only triggered
  172. # once per group
  173. names = []
  174. # cannot parameterize over the functions since they need external
  175. # `names` to detect side effects
  176. def f_copy(group):
  177. # this takes the fast apply path
  178. names.append(group.name)
  179. return group.copy()
  180. def f_nocopy(group):
  181. # this takes the slow apply path
  182. names.append(group.name)
  183. return group
  184. def f_scalar(group):
  185. # GH7739, GH2656
  186. names.append(group.name)
  187. return 0
  188. def f_none(group):
  189. # GH10519, GH12155, GH21417
  190. names.append(group.name)
  191. def f_constant_df(group):
  192. # GH2936, GH20084
  193. names.append(group.name)
  194. return DataFrame({"a": [1], "b": [1]})
  195. for func in [f_copy, f_nocopy, f_scalar, f_none, f_constant_df]:
  196. del names[:]
  197. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  198. with tm.assert_produces_warning(FutureWarning, match=msg):
  199. df.groupby("a", group_keys=False).apply(func)
  200. assert names == group_names
  201. def test_group_apply_once_per_group2(capsys):
  202. # GH: 31111
  203. # groupby-apply need to execute len(set(group_by_columns)) times
  204. expected = 2 # Number of times `apply` should call a function for the current test
  205. df = DataFrame(
  206. {
  207. "group_by_column": [0, 0, 0, 0, 1, 1, 1, 1],
  208. "test_column": ["0", "2", "4", "6", "8", "10", "12", "14"],
  209. },
  210. index=["0", "2", "4", "6", "8", "10", "12", "14"],
  211. )
  212. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  213. with tm.assert_produces_warning(FutureWarning, match=msg):
  214. df.groupby("group_by_column", group_keys=False).apply(
  215. lambda df: print("function_called")
  216. )
  217. result = capsys.readouterr().out.count("function_called")
  218. # If `groupby` behaves unexpectedly, this test will break
  219. assert result == expected
  220. def test_apply_fast_slow_identical():
  221. # GH 31613
  222. df = DataFrame({"A": [0, 0, 1], "b": range(3)})
  223. # For simple index structures we check for fast/slow apply using
  224. # an identity check on in/output
  225. def slow(group):
  226. return group
  227. def fast(group):
  228. return group.copy()
  229. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  230. with tm.assert_produces_warning(FutureWarning, match=msg):
  231. fast_df = df.groupby("A", group_keys=False).apply(fast)
  232. with tm.assert_produces_warning(FutureWarning, match=msg):
  233. slow_df = df.groupby("A", group_keys=False).apply(slow)
  234. tm.assert_frame_equal(fast_df, slow_df)
  235. @pytest.mark.parametrize(
  236. "func",
  237. [
  238. lambda x: x,
  239. lambda x: x[:],
  240. lambda x: x.copy(deep=False),
  241. lambda x: x.copy(deep=True),
  242. ],
  243. )
  244. def test_groupby_apply_identity_maybecopy_index_identical(func):
  245. # GH 14927
  246. # Whether the function returns a copy of the input data or not should not
  247. # have an impact on the index structure of the result since this is not
  248. # transparent to the user
  249. df = DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
  250. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  251. with tm.assert_produces_warning(FutureWarning, match=msg):
  252. result = df.groupby("g", group_keys=False).apply(func)
  253. tm.assert_frame_equal(result, df)
  254. def test_apply_with_mixed_dtype():
  255. # GH3480, apply with mixed dtype on axis=1 breaks in 0.11
  256. df = DataFrame(
  257. {
  258. "foo1": np.random.default_rng(2).standard_normal(6),
  259. "foo2": ["one", "two", "two", "three", "one", "two"],
  260. }
  261. )
  262. result = df.apply(lambda x: x, axis=1).dtypes
  263. expected = df.dtypes
  264. tm.assert_series_equal(result, expected)
  265. # GH 3610 incorrect dtype conversion with as_index=False
  266. df = DataFrame({"c1": [1, 2, 6, 6, 8]})
  267. df["c2"] = df.c1 / 2.0
  268. result1 = df.groupby("c2").mean().reset_index().c2
  269. result2 = df.groupby("c2", as_index=False).mean().c2
  270. tm.assert_series_equal(result1, result2)
  271. def test_groupby_as_index_apply():
  272. # GH #4648 and #3417
  273. df = DataFrame(
  274. {
  275. "item_id": ["b", "b", "a", "c", "a", "b"],
  276. "user_id": [1, 2, 1, 1, 3, 1],
  277. "time": range(6),
  278. }
  279. )
  280. g_as = df.groupby("user_id", as_index=True)
  281. g_not_as = df.groupby("user_id", as_index=False)
  282. res_as = g_as.head(2).index
  283. res_not_as = g_not_as.head(2).index
  284. exp = Index([0, 1, 2, 4])
  285. tm.assert_index_equal(res_as, exp)
  286. tm.assert_index_equal(res_not_as, exp)
  287. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  288. with tm.assert_produces_warning(FutureWarning, match=msg):
  289. res_as_apply = g_as.apply(lambda x: x.head(2)).index
  290. with tm.assert_produces_warning(FutureWarning, match=msg):
  291. res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index
  292. # apply doesn't maintain the original ordering
  293. # changed in GH5610 as the as_index=False returns a MI here
  294. exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), (2, 4)])
  295. tp = [(1, 0), (1, 2), (2, 1), (3, 4)]
  296. exp_as_apply = MultiIndex.from_tuples(tp, names=["user_id", None])
  297. tm.assert_index_equal(res_as_apply, exp_as_apply)
  298. tm.assert_index_equal(res_not_as_apply, exp_not_as_apply)
  299. ind = Index(list("abcde"))
  300. df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
  301. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  302. with tm.assert_produces_warning(FutureWarning, match=msg):
  303. res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index
  304. tm.assert_index_equal(res, ind)
  305. def test_apply_concat_preserve_names(three_group):
  306. grouped = three_group.groupby(["A", "B"])
  307. def desc(group):
  308. result = group.describe()
  309. result.index.name = "stat"
  310. return result
  311. def desc2(group):
  312. result = group.describe()
  313. result.index.name = "stat"
  314. result = result[: len(group)]
  315. # weirdo
  316. return result
  317. def desc3(group):
  318. result = group.describe()
  319. # names are different
  320. result.index.name = f"stat_{len(group):d}"
  321. result = result[: len(group)]
  322. # weirdo
  323. return result
  324. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  325. with tm.assert_produces_warning(FutureWarning, match=msg):
  326. result = grouped.apply(desc)
  327. assert result.index.names == ("A", "B", "stat")
  328. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  329. with tm.assert_produces_warning(FutureWarning, match=msg):
  330. result2 = grouped.apply(desc2)
  331. assert result2.index.names == ("A", "B", "stat")
  332. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  333. with tm.assert_produces_warning(FutureWarning, match=msg):
  334. result3 = grouped.apply(desc3)
  335. assert result3.index.names == ("A", "B", None)
  336. def test_apply_series_to_frame():
  337. def f(piece):
  338. with np.errstate(invalid="ignore"):
  339. logged = np.log(piece)
  340. return DataFrame(
  341. {"value": piece, "demeaned": piece - piece.mean(), "logged": logged}
  342. )
  343. dr = bdate_range("1/1/2000", periods=100)
  344. ts = Series(np.random.default_rng(2).standard_normal(100), index=dr)
  345. grouped = ts.groupby(lambda x: x.month, group_keys=False)
  346. result = grouped.apply(f)
  347. assert isinstance(result, DataFrame)
  348. assert not hasattr(result, "name") # GH49907
  349. tm.assert_index_equal(result.index, ts.index)
  350. def test_apply_series_yield_constant(df):
  351. result = df.groupby(["A", "B"])["C"].apply(len)
  352. assert result.index.names[:2] == ("A", "B")
  353. def test_apply_frame_yield_constant(df):
  354. # GH13568
  355. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  356. with tm.assert_produces_warning(FutureWarning, match=msg):
  357. result = df.groupby(["A", "B"]).apply(len)
  358. assert isinstance(result, Series)
  359. assert result.name is None
  360. result = df.groupby(["A", "B"])[["C", "D"]].apply(len)
  361. assert isinstance(result, Series)
  362. assert result.name is None
  363. def test_apply_frame_to_series(df):
  364. grouped = df.groupby(["A", "B"])
  365. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  366. with tm.assert_produces_warning(FutureWarning, match=msg):
  367. result = grouped.apply(len)
  368. expected = grouped.count()["C"]
  369. tm.assert_index_equal(result.index, expected.index)
  370. tm.assert_numpy_array_equal(result.values, expected.values)
  371. def test_apply_frame_not_as_index_column_name(df):
  372. # GH 35964 - path within _wrap_applied_output not hit by a test
  373. grouped = df.groupby(["A", "B"], as_index=False)
  374. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  375. with tm.assert_produces_warning(FutureWarning, match=msg):
  376. result = grouped.apply(len)
  377. expected = grouped.count().rename(columns={"C": np.nan}).drop(columns="D")
  378. # TODO(GH#34306): Use assert_frame_equal when column name is not np.nan
  379. tm.assert_index_equal(result.index, expected.index)
  380. tm.assert_numpy_array_equal(result.values, expected.values)
  381. def test_apply_frame_concat_series():
  382. def trans(group):
  383. return group.groupby("B")["C"].sum().sort_values().iloc[:2]
  384. def trans2(group):
  385. grouped = group.groupby(df.reindex(group.index)["B"])
  386. return grouped.sum().sort_values().iloc[:2]
  387. df = DataFrame(
  388. {
  389. "A": np.random.default_rng(2).integers(0, 5, 1000),
  390. "B": np.random.default_rng(2).integers(0, 5, 1000),
  391. "C": np.random.default_rng(2).standard_normal(1000),
  392. }
  393. )
  394. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  395. with tm.assert_produces_warning(FutureWarning, match=msg):
  396. result = df.groupby("A").apply(trans)
  397. exp = df.groupby("A")["C"].apply(trans2)
  398. tm.assert_series_equal(result, exp, check_names=False)
  399. assert result.name == "C"
  400. def test_apply_transform(ts):
  401. grouped = ts.groupby(lambda x: x.month, group_keys=False)
  402. result = grouped.apply(lambda x: x * 2)
  403. expected = grouped.transform(lambda x: x * 2)
  404. tm.assert_series_equal(result, expected)
  405. def test_apply_multikey_corner(tsframe):
  406. grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
  407. def f(group):
  408. return group.sort_values("A")[-5:]
  409. result = grouped.apply(f)
  410. for key, group in grouped:
  411. tm.assert_frame_equal(result.loc[key], f(group))
  412. @pytest.mark.parametrize("group_keys", [True, False])
  413. def test_apply_chunk_view(group_keys):
  414. # Low level tinkering could be unsafe, make sure not
  415. df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})
  416. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  417. with tm.assert_produces_warning(FutureWarning, match=msg):
  418. result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2])
  419. expected = df.take([0, 1, 3, 4, 6, 7])
  420. if group_keys:
  421. expected.index = MultiIndex.from_arrays(
  422. [[1, 1, 2, 2, 3, 3], expected.index], names=["key", None]
  423. )
  424. tm.assert_frame_equal(result, expected)
  425. def test_apply_no_name_column_conflict():
  426. df = DataFrame(
  427. {
  428. "name": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2],
  429. "name2": [0, 0, 0, 1, 1, 1, 0, 0, 1, 1],
  430. "value": range(9, -1, -1),
  431. }
  432. )
  433. # it works! #2605
  434. grouped = df.groupby(["name", "name2"])
  435. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  436. with tm.assert_produces_warning(FutureWarning, match=msg):
  437. grouped.apply(lambda x: x.sort_values("value", inplace=True))
  438. def test_apply_typecast_fail():
  439. df = DataFrame(
  440. {
  441. "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0],
  442. "c": np.tile(["a", "b", "c"], 2),
  443. "v": np.arange(1.0, 7.0),
  444. }
  445. )
  446. def f(group):
  447. v = group["v"]
  448. group["v2"] = (v - v.min()) / (v.max() - v.min())
  449. return group
  450. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  451. with tm.assert_produces_warning(FutureWarning, match=msg):
  452. result = df.groupby("d", group_keys=False).apply(f)
  453. expected = df.copy()
  454. expected["v2"] = np.tile([0.0, 0.5, 1], 2)
  455. tm.assert_frame_equal(result, expected)
  456. def test_apply_multiindex_fail():
  457. index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]])
  458. df = DataFrame(
  459. {
  460. "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0],
  461. "c": np.tile(["a", "b", "c"], 2),
  462. "v": np.arange(1.0, 7.0),
  463. },
  464. index=index,
  465. )
  466. def f(group):
  467. v = group["v"]
  468. group["v2"] = (v - v.min()) / (v.max() - v.min())
  469. return group
  470. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  471. with tm.assert_produces_warning(FutureWarning, match=msg):
  472. result = df.groupby("d", group_keys=False).apply(f)
  473. expected = df.copy()
  474. expected["v2"] = np.tile([0.0, 0.5, 1], 2)
  475. tm.assert_frame_equal(result, expected)
  476. def test_apply_corner(tsframe):
  477. result = tsframe.groupby(lambda x: x.year, group_keys=False).apply(lambda x: x * 2)
  478. expected = tsframe * 2
  479. tm.assert_frame_equal(result, expected)
  480. def test_apply_without_copy():
  481. # GH 5545
  482. # returning a non-copy in an applied function fails
  483. data = DataFrame(
  484. {
  485. "id_field": [100, 100, 200, 300],
  486. "category": ["a", "b", "c", "c"],
  487. "value": [1, 2, 3, 4],
  488. }
  489. )
  490. def filt1(x):
  491. if x.shape[0] == 1:
  492. return x.copy()
  493. else:
  494. return x[x.category == "c"]
  495. def filt2(x):
  496. if x.shape[0] == 1:
  497. return x
  498. else:
  499. return x[x.category == "c"]
  500. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  501. with tm.assert_produces_warning(FutureWarning, match=msg):
  502. expected = data.groupby("id_field").apply(filt1)
  503. with tm.assert_produces_warning(FutureWarning, match=msg):
  504. result = data.groupby("id_field").apply(filt2)
  505. tm.assert_frame_equal(result, expected)
  506. @pytest.mark.parametrize("test_series", [True, False])
  507. def test_apply_with_duplicated_non_sorted_axis(test_series):
  508. # GH 30667
  509. df = DataFrame(
  510. [["x", "p"], ["x", "p"], ["x", "o"]], columns=["X", "Y"], index=[1, 2, 2]
  511. )
  512. if test_series:
  513. ser = df.set_index("Y")["X"]
  514. result = ser.groupby(level=0, group_keys=False).apply(lambda x: x)
  515. # not expecting the order to remain the same for duplicated axis
  516. result = result.sort_index()
  517. expected = ser.sort_index()
  518. tm.assert_series_equal(result, expected)
  519. else:
  520. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  521. with tm.assert_produces_warning(FutureWarning, match=msg):
  522. result = df.groupby("Y", group_keys=False).apply(lambda x: x)
  523. # not expecting the order to remain the same for duplicated axis
  524. result = result.sort_values("Y")
  525. expected = df.sort_values("Y")
  526. tm.assert_frame_equal(result, expected)
  527. def test_apply_reindex_values():
  528. # GH: 26209
  529. # reindexing from a single column of a groupby object with duplicate indices caused
  530. # a ValueError (cannot reindex from duplicate axis) in 0.24.2, the problem was
  531. # solved in #30679
  532. values = [1, 2, 3, 4]
  533. indices = [1, 1, 2, 2]
  534. df = DataFrame({"group": ["Group1", "Group2"] * 2, "value": values}, index=indices)
  535. expected = Series(values, index=indices, name="value")
  536. def reindex_helper(x):
  537. return x.reindex(np.arange(x.index.min(), x.index.max() + 1))
  538. # the following group by raised a ValueError
  539. result = df.groupby("group", group_keys=False).value.apply(reindex_helper)
  540. tm.assert_series_equal(expected, result)
  541. def test_apply_corner_cases():
  542. # #535, can't use sliding iterator
  543. N = 1000
  544. labels = np.random.default_rng(2).integers(0, 100, size=N)
  545. df = DataFrame(
  546. {
  547. "key": labels,
  548. "value1": np.random.default_rng(2).standard_normal(N),
  549. "value2": ["foo", "bar", "baz", "qux"] * (N // 4),
  550. }
  551. )
  552. grouped = df.groupby("key", group_keys=False)
  553. def f(g):
  554. g["value3"] = g["value1"] * 2
  555. return g
  556. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  557. with tm.assert_produces_warning(FutureWarning, match=msg):
  558. result = grouped.apply(f)
  559. assert "value3" in result
  560. def test_apply_numeric_coercion_when_datetime():
  561. # In the past, group-by/apply operations have been over-eager
  562. # in converting dtypes to numeric, in the presence of datetime
  563. # columns. Various GH issues were filed, the reproductions
  564. # for which are here.
  565. # GH 15670
  566. df = DataFrame(
  567. {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]}
  568. )
  569. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  570. with tm.assert_produces_warning(FutureWarning, match=msg):
  571. expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0])
  572. df.Date = pd.to_datetime(df.Date)
  573. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  574. with tm.assert_produces_warning(FutureWarning, match=msg):
  575. result = df.groupby(["Number"]).apply(lambda x: x.iloc[0])
  576. tm.assert_series_equal(result["Str"], expected["Str"])
  577. # GH 15421
  578. df = DataFrame(
  579. {"A": [10, 20, 30], "B": ["foo", "3", "4"], "T": [pd.Timestamp("12:31:22")] * 3}
  580. )
  581. def get_B(g):
  582. return g.iloc[0][["B"]]
  583. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  584. with tm.assert_produces_warning(FutureWarning, match=msg):
  585. result = df.groupby("A").apply(get_B)["B"]
  586. expected = df.B
  587. expected.index = df.A
  588. tm.assert_series_equal(result, expected)
  589. # GH 14423
  590. def predictions(tool):
  591. out = Series(index=["p1", "p2", "useTime"], dtype=object)
  592. if "step1" in list(tool.State):
  593. out["p1"] = str(tool[tool.State == "step1"].Machine.values[0])
  594. if "step2" in list(tool.State):
  595. out["p2"] = str(tool[tool.State == "step2"].Machine.values[0])
  596. out["useTime"] = str(tool[tool.State == "step2"].oTime.values[0])
  597. return out
  598. df1 = DataFrame(
  599. {
  600. "Key": ["B", "B", "A", "A"],
  601. "State": ["step1", "step2", "step1", "step2"],
  602. "oTime": ["", "2016-09-19 05:24:33", "", "2016-09-19 23:59:04"],
  603. "Machine": ["23", "36L", "36R", "36R"],
  604. }
  605. )
  606. df2 = df1.copy()
  607. df2.oTime = pd.to_datetime(df2.oTime)
  608. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  609. with tm.assert_produces_warning(FutureWarning, match=msg):
  610. expected = df1.groupby("Key").apply(predictions).p1
  611. with tm.assert_produces_warning(FutureWarning, match=msg):
  612. result = df2.groupby("Key").apply(predictions).p1
  613. tm.assert_series_equal(expected, result)
  614. def test_apply_aggregating_timedelta_and_datetime():
  615. # Regression test for GH 15562
  616. # The following groupby caused ValueErrors and IndexErrors pre 0.20.0
  617. df = DataFrame(
  618. {
  619. "clientid": ["A", "B", "C"],
  620. "datetime": [np.datetime64("2017-02-01 00:00:00")] * 3,
  621. }
  622. )
  623. df["time_delta_zero"] = df.datetime - df.datetime
  624. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  625. with tm.assert_produces_warning(FutureWarning, match=msg):
  626. result = df.groupby("clientid").apply(
  627. lambda ddf: Series(
  628. {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()}
  629. )
  630. )
  631. expected = DataFrame(
  632. {
  633. "clientid": ["A", "B", "C"],
  634. "clientid_age": [np.timedelta64(0, "D")] * 3,
  635. "date": [np.datetime64("2017-02-01 00:00:00")] * 3,
  636. }
  637. ).set_index("clientid")
  638. tm.assert_frame_equal(result, expected)
  639. def test_apply_groupby_datetimeindex():
  640. # GH 26182
  641. # groupby apply failed on dataframe with DatetimeIndex
  642. data = [["A", 10], ["B", 20], ["B", 30], ["C", 40], ["C", 50]]
  643. df = DataFrame(
  644. data, columns=["Name", "Value"], index=pd.date_range("2020-09-01", "2020-09-05")
  645. )
  646. result = df.groupby("Name").sum()
  647. expected = DataFrame({"Name": ["A", "B", "C"], "Value": [10, 50, 90]})
  648. expected.set_index("Name", inplace=True)
  649. tm.assert_frame_equal(result, expected)
  650. def test_time_field_bug():
  651. # Test a fix for the following error related to GH issue 11324 When
  652. # non-key fields in a group-by dataframe contained time-based fields
  653. # that were not returned by the apply function, an exception would be
  654. # raised.
  655. df = DataFrame({"a": 1, "b": [datetime.now() for nn in range(10)]})
  656. def func_with_no_date(batch):
  657. return Series({"c": 2})
  658. def func_with_date(batch):
  659. return Series({"b": datetime(2015, 1, 1), "c": 2})
  660. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  661. with tm.assert_produces_warning(FutureWarning, match=msg):
  662. dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date)
  663. dfg_no_conversion_expected = DataFrame({"c": 2}, index=[1])
  664. dfg_no_conversion_expected.index.name = "a"
  665. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  666. with tm.assert_produces_warning(FutureWarning, match=msg):
  667. dfg_conversion = df.groupby(by=["a"]).apply(func_with_date)
  668. dfg_conversion_expected = DataFrame(
  669. {"b": pd.Timestamp(2015, 1, 1).as_unit("ns"), "c": 2}, index=[1]
  670. )
  671. dfg_conversion_expected.index.name = "a"
  672. tm.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected)
  673. tm.assert_frame_equal(dfg_conversion, dfg_conversion_expected)
  674. def test_gb_apply_list_of_unequal_len_arrays():
  675. # GH1738
  676. df = DataFrame(
  677. {
  678. "group1": ["a", "a", "a", "b", "b", "b", "a", "a", "a", "b", "b", "b"],
  679. "group2": ["c", "c", "d", "d", "d", "e", "c", "c", "d", "d", "d", "e"],
  680. "weight": [1.1, 2, 3, 4, 5, 6, 2, 4, 6, 8, 1, 2],
  681. "value": [7.1, 8, 9, 10, 11, 12, 8, 7, 6, 5, 4, 3],
  682. }
  683. )
  684. df = df.set_index(["group1", "group2"])
  685. df_grouped = df.groupby(level=["group1", "group2"], sort=True)
  686. def noddy(value, weight):
  687. out = np.array(value * weight).repeat(3)
  688. return out
  689. # the kernel function returns arrays of unequal length
  690. # pandas sniffs the first one, sees it's an array and not
  691. # a list, and assumed the rest are of equal length
  692. # and so tries a vstack
  693. # don't die
  694. df_grouped.apply(lambda x: noddy(x.value, x.weight))
  695. def test_groupby_apply_all_none():
  696. # Tests to make sure no errors if apply function returns all None
  697. # values. Issue 9684.
  698. test_df = DataFrame({"groups": [0, 0, 1, 1], "random_vars": [8, 7, 4, 5]})
  699. def test_func(x):
  700. pass
  701. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  702. with tm.assert_produces_warning(FutureWarning, match=msg):
  703. result = test_df.groupby("groups").apply(test_func)
  704. expected = DataFrame()
  705. tm.assert_frame_equal(result, expected)
  706. def test_groupby_apply_none_first():
  707. # GH 12824. Tests if apply returns None first.
  708. test_df1 = DataFrame({"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]})
  709. test_df2 = DataFrame({"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]})
  710. def test_func(x):
  711. if x.shape[0] < 2:
  712. return None
  713. return x.iloc[[0, -1]]
  714. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  715. with tm.assert_produces_warning(FutureWarning, match=msg):
  716. result1 = test_df1.groupby("groups").apply(test_func)
  717. with tm.assert_produces_warning(FutureWarning, match=msg):
  718. result2 = test_df2.groupby("groups").apply(test_func)
  719. index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None])
  720. index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None])
  721. expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1)
  722. expected2 = DataFrame({"groups": [2, 2], "vars": [1, 3]}, index=index2)
  723. tm.assert_frame_equal(result1, expected1)
  724. tm.assert_frame_equal(result2, expected2)
  725. def test_groupby_apply_return_empty_chunk():
  726. # GH 22221: apply filter which returns some empty groups
  727. df = DataFrame({"value": [0, 1], "group": ["filled", "empty"]})
  728. groups = df.groupby("group")
  729. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  730. with tm.assert_produces_warning(FutureWarning, match=msg):
  731. result = groups.apply(lambda group: group[group.value != 1]["value"])
  732. expected = Series(
  733. [0],
  734. name="value",
  735. index=MultiIndex.from_product(
  736. [["empty", "filled"], [0]], names=["group", None]
  737. ).drop("empty"),
  738. )
  739. tm.assert_series_equal(result, expected)
  740. def test_apply_with_mixed_types():
  741. # gh-20949
  742. df = DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]})
  743. g = df.groupby("A", group_keys=False)
  744. result = g.transform(lambda x: x / x.sum())
  745. expected = DataFrame({"B": [1 / 3.0, 2 / 3.0, 1], "C": [0.4, 0.6, 1.0]})
  746. tm.assert_frame_equal(result, expected)
  747. result = g.apply(lambda x: x / x.sum())
  748. tm.assert_frame_equal(result, expected)
  749. def test_func_returns_object():
  750. # GH 28652
  751. df = DataFrame({"a": [1, 2]}, index=Index([1, 2]))
  752. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  753. with tm.assert_produces_warning(FutureWarning, match=msg):
  754. result = df.groupby("a").apply(lambda g: g.index)
  755. expected = Series([Index([1]), Index([2])], index=Index([1, 2], name="a"))
  756. tm.assert_series_equal(result, expected)
  757. @pytest.mark.parametrize(
  758. "group_column_dtlike",
  759. [datetime.today(), datetime.today().date(), datetime.today().time()],
  760. )
  761. def test_apply_datetime_issue(group_column_dtlike):
  762. # GH-28247
  763. # groupby-apply throws an error if one of the columns in the DataFrame
  764. # is a datetime object and the column labels are different from
  765. # standard int values in range(len(num_columns))
  766. df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]})
  767. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  768. with tm.assert_produces_warning(FutureWarning, match=msg):
  769. result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42]))
  770. expected = DataFrame(["spam"], Index(["foo"], dtype="str", name="a"), columns=[42])
  771. tm.assert_frame_equal(result, expected)
  772. def test_apply_series_return_dataframe_groups():
  773. # GH 10078
  774. tdf = DataFrame(
  775. {
  776. "day": {
  777. 0: pd.Timestamp("2015-02-24 00:00:00"),
  778. 1: pd.Timestamp("2015-02-24 00:00:00"),
  779. 2: pd.Timestamp("2015-02-24 00:00:00"),
  780. 3: pd.Timestamp("2015-02-24 00:00:00"),
  781. 4: pd.Timestamp("2015-02-24 00:00:00"),
  782. },
  783. "userAgent": {
  784. 0: "some UA string",
  785. 1: "some UA string",
  786. 2: "some UA string",
  787. 3: "another UA string",
  788. 4: "some UA string",
  789. },
  790. "userId": {
  791. 0: "17661101",
  792. 1: "17661101",
  793. 2: "17661101",
  794. 3: "17661101",
  795. 4: "17661101",
  796. },
  797. }
  798. )
  799. def most_common_values(df):
  800. return Series({c: s.value_counts().index[0] for c, s in df.items()})
  801. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  802. with tm.assert_produces_warning(FutureWarning, match=msg):
  803. result = tdf.groupby("day").apply(most_common_values)["userId"]
  804. expected = Series(
  805. ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId"
  806. )
  807. tm.assert_series_equal(result, expected)
  808. @pytest.mark.parametrize("category", [False, True])
  809. def test_apply_multi_level_name(category):
  810. # https://github.com/pandas-dev/pandas/issues/31068
  811. b = [1, 2] * 5
  812. if category:
  813. b = pd.Categorical(b, categories=[1, 2, 3])
  814. expected_index = pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3], name="B")
  815. expected_values = [20, 25, 0]
  816. else:
  817. expected_index = Index([1, 2], name="B")
  818. expected_values = [20, 25]
  819. expected = DataFrame(
  820. {"C": expected_values, "D": expected_values}, index=expected_index
  821. )
  822. df = DataFrame(
  823. {"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))}
  824. ).set_index(["A", "B"])
  825. result = df.groupby("B", observed=False).apply(lambda x: x.sum())
  826. tm.assert_frame_equal(result, expected)
  827. assert df.index.names == ["A", "B"]
  828. def test_groupby_apply_datetime_result_dtypes(using_infer_string):
  829. # GH 14849
  830. data = DataFrame.from_records(
  831. [
  832. (pd.Timestamp(2016, 1, 1), "red", "dark", 1, "8"),
  833. (pd.Timestamp(2015, 1, 1), "green", "stormy", 2, "9"),
  834. (pd.Timestamp(2014, 1, 1), "blue", "bright", 3, "10"),
  835. (pd.Timestamp(2013, 1, 1), "blue", "calm", 4, "potato"),
  836. ],
  837. columns=["observation", "color", "mood", "intensity", "score"],
  838. )
  839. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  840. with tm.assert_produces_warning(FutureWarning, match=msg):
  841. result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes
  842. dtype = pd.StringDtype(na_value=np.nan) if using_infer_string else object
  843. expected = Series(
  844. [np.dtype("datetime64[ns]"), dtype, dtype, np.int64, dtype],
  845. index=["observation", "color", "mood", "intensity", "score"],
  846. )
  847. tm.assert_series_equal(result, expected)
  848. @pytest.mark.parametrize(
  849. "index",
  850. [
  851. pd.CategoricalIndex(list("abc")),
  852. pd.interval_range(0, 3),
  853. pd.period_range("2020", periods=3, freq="D"),
  854. MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]),
  855. ],
  856. )
  857. def test_apply_index_has_complex_internals(index):
  858. # GH 31248
  859. df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
  860. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  861. with tm.assert_produces_warning(FutureWarning, match=msg):
  862. result = df.groupby("group", group_keys=False).apply(lambda x: x)
  863. tm.assert_frame_equal(result, df)
  864. @pytest.mark.parametrize(
  865. "function, expected_values",
  866. [
  867. (lambda x: x.index.to_list(), [[0, 1], [2, 3]]),
  868. (lambda x: set(x.index.to_list()), [{0, 1}, {2, 3}]),
  869. (lambda x: tuple(x.index.to_list()), [(0, 1), (2, 3)]),
  870. (
  871. lambda x: dict(enumerate(x.index.to_list())),
  872. [{0: 0, 1: 1}, {0: 2, 1: 3}],
  873. ),
  874. (
  875. lambda x: [{n: i} for (n, i) in enumerate(x.index.to_list())],
  876. [[{0: 0}, {1: 1}], [{0: 2}, {1: 3}]],
  877. ),
  878. ],
  879. )
  880. def test_apply_function_returns_non_pandas_non_scalar(function, expected_values):
  881. # GH 31441
  882. df = DataFrame(["A", "A", "B", "B"], columns=["groups"])
  883. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  884. with tm.assert_produces_warning(FutureWarning, match=msg):
  885. result = df.groupby("groups").apply(function)
  886. expected = Series(expected_values, index=Index(["A", "B"], name="groups"))
  887. tm.assert_series_equal(result, expected)
  888. def test_apply_function_returns_numpy_array():
  889. # GH 31605
  890. def fct(group):
  891. return group["B"].values.flatten()
  892. df = DataFrame({"A": ["a", "a", "b", "none"], "B": [1, 2, 3, np.nan]})
  893. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  894. with tm.assert_produces_warning(FutureWarning, match=msg):
  895. result = df.groupby("A").apply(fct)
  896. expected = Series(
  897. [[1.0, 2.0], [3.0], [np.nan]], index=Index(["a", "b", "none"], name="A")
  898. )
  899. tm.assert_series_equal(result, expected)
  900. @pytest.mark.parametrize("function", [lambda gr: gr.index, lambda gr: gr.index + 1 - 1])
  901. def test_apply_function_index_return(function):
  902. # GH: 22541
  903. df = DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"])
  904. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  905. with tm.assert_produces_warning(FutureWarning, match=msg):
  906. result = df.groupby("id").apply(function)
  907. expected = Series(
  908. [Index([0, 4, 7, 9]), Index([1, 2, 3, 5]), Index([6, 8])],
  909. index=Index([1, 2, 3], name="id"),
  910. )
  911. tm.assert_series_equal(result, expected)
  912. def test_apply_function_with_indexing_return_column():
  913. # GH#7002, GH#41480, GH#49256
  914. df = DataFrame(
  915. {
  916. "foo1": ["one", "two", "two", "three", "one", "two"],
  917. "foo2": [1, 2, 4, 4, 5, 6],
  918. }
  919. )
  920. result = df.groupby("foo1", as_index=False).apply(lambda x: x.mean())
  921. expected = DataFrame(
  922. {
  923. "foo1": ["one", "three", "two"],
  924. "foo2": [3.0, 4.0, 4.0],
  925. }
  926. )
  927. tm.assert_frame_equal(result, expected)
  928. @pytest.mark.parametrize(
  929. "udf",
  930. [(lambda x: x.copy()), (lambda x: x.copy().rename(lambda y: y + 1))],
  931. )
  932. @pytest.mark.parametrize("group_keys", [True, False])
  933. def test_apply_result_type(group_keys, udf):
  934. # https://github.com/pandas-dev/pandas/issues/34809
  935. # We'd like to control whether the group keys end up in the index
  936. # regardless of whether the UDF happens to be a transform.
  937. df = DataFrame({"A": ["a", "b"], "B": [1, 2]})
  938. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  939. with tm.assert_produces_warning(FutureWarning, match=msg):
  940. df_result = df.groupby("A", group_keys=group_keys).apply(udf)
  941. series_result = df.B.groupby(df.A, group_keys=group_keys).apply(udf)
  942. if group_keys:
  943. assert df_result.index.nlevels == 2
  944. assert series_result.index.nlevels == 2
  945. else:
  946. assert df_result.index.nlevels == 1
  947. assert series_result.index.nlevels == 1
  948. def test_result_order_group_keys_false():
  949. # GH 34998
  950. # apply result order should not depend on whether index is the same or just equal
  951. df = DataFrame({"A": [2, 1, 2], "B": [1, 2, 3]})
  952. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  953. with tm.assert_produces_warning(FutureWarning, match=msg):
  954. result = df.groupby("A", group_keys=False).apply(lambda x: x)
  955. with tm.assert_produces_warning(FutureWarning, match=msg):
  956. expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy())
  957. tm.assert_frame_equal(result, expected)
  958. def test_apply_with_timezones_aware():
  959. # GH: 27212
  960. dates = ["2001-01-01"] * 2 + ["2001-01-02"] * 2 + ["2001-01-03"] * 2
  961. index_no_tz = pd.DatetimeIndex(dates)
  962. index_tz = pd.DatetimeIndex(dates, tz="UTC")
  963. df1 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_no_tz})
  964. df2 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_tz})
  965. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  966. with tm.assert_produces_warning(FutureWarning, match=msg):
  967. result1 = df1.groupby("x", group_keys=False).apply(
  968. lambda df: df[["x", "y"]].copy()
  969. )
  970. with tm.assert_produces_warning(FutureWarning, match=msg):
  971. result2 = df2.groupby("x", group_keys=False).apply(
  972. lambda df: df[["x", "y"]].copy()
  973. )
  974. tm.assert_frame_equal(result1, result2)
  975. def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func):
  976. # GH #34656
  977. # GH #34271
  978. df = DataFrame(
  979. {
  980. "a": [99, 99, 99, 88, 88, 88],
  981. "b": [1, 2, 3, 4, 5, 6],
  982. "c": [10, 20, 30, 40, 50, 60],
  983. }
  984. )
  985. expected = DataFrame(
  986. {"b": [15, 6], "c": [150, 60]},
  987. index=Index([88, 99], name="a"),
  988. )
  989. # Check output when no other methods are called before .apply()
  990. grp = df.groupby(by="a")
  991. msg = "The behavior of DataFrame.sum with axis=None is deprecated"
  992. with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
  993. result = grp.apply(sum, include_groups=False)
  994. tm.assert_frame_equal(result, expected)
  995. # Check output when another method is called before .apply()
  996. grp = df.groupby(by="a")
  997. args = get_groupby_method_args(reduction_func, df)
  998. _ = getattr(grp, reduction_func)(*args)
  999. with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
  1000. result = grp.apply(sum, include_groups=False)
  1001. tm.assert_frame_equal(result, expected)
  1002. def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp():
  1003. # GH 29617
  1004. df = DataFrame(
  1005. {
  1006. "A": ["a", "a", "a", "b"],
  1007. "B": [
  1008. date(2020, 1, 10),
  1009. date(2020, 1, 10),
  1010. date(2020, 2, 10),
  1011. date(2020, 2, 10),
  1012. ],
  1013. "C": [1, 2, 3, 4],
  1014. },
  1015. index=Index([100, 101, 102, 103], name="idx"),
  1016. )
  1017. grp = df.groupby(["A", "B"])
  1018. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  1019. with tm.assert_produces_warning(FutureWarning, match=msg):
  1020. result = grp.apply(lambda x: x.head(1))
  1021. expected = df.iloc[[0, 2, 3]]
  1022. expected = expected.reset_index()
  1023. expected.index = MultiIndex.from_frame(expected[["A", "B", "idx"]])
  1024. expected = expected.drop(columns="idx")
  1025. tm.assert_frame_equal(result, expected)
  1026. for val in result.index.levels[1]:
  1027. assert type(val) is date
  1028. def test_apply_by_cols_equals_apply_by_rows_transposed():
  1029. # GH 16646
  1030. # Operating on the columns, or transposing and operating on the rows
  1031. # should give the same result. There was previously a bug where the
  1032. # by_rows operation would work fine, but by_cols would throw a ValueError
  1033. df = DataFrame(
  1034. np.random.default_rng(2).random([6, 4]),
  1035. columns=MultiIndex.from_product([["A", "B"], [1, 2]]),
  1036. )
  1037. msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
  1038. with tm.assert_produces_warning(FutureWarning, match=msg):
  1039. gb = df.T.groupby(axis=0, level=0)
  1040. by_rows = gb.apply(lambda x: x.droplevel(axis=0, level=0))
  1041. msg = "DataFrame.groupby with axis=1 is deprecated"
  1042. with tm.assert_produces_warning(FutureWarning, match=msg):
  1043. gb2 = df.groupby(axis=1, level=0)
  1044. by_cols = gb2.apply(lambda x: x.droplevel(axis=1, level=0))
  1045. tm.assert_frame_equal(by_cols, by_rows.T)
  1046. tm.assert_frame_equal(by_cols, df)
  1047. @pytest.mark.parametrize("dropna", [True, False])
  1048. def test_apply_dropna_with_indexed_same(dropna):
  1049. # GH 38227
  1050. # GH#43205
  1051. df = DataFrame(
  1052. {
  1053. "col": [1, 2, 3, 4, 5],
  1054. "group": ["a", np.nan, np.nan, "b", "b"],
  1055. },
  1056. index=list("xxyxz"),
  1057. )
  1058. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  1059. with tm.assert_produces_warning(FutureWarning, match=msg):
  1060. result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x)
  1061. expected = df.dropna() if dropna else df.iloc[[0, 3, 1, 2, 4]]
  1062. tm.assert_frame_equal(result, expected)
  1063. @pytest.mark.parametrize(
  1064. "as_index, expected",
  1065. [
  1066. pytest.param(
  1067. False,
  1068. DataFrame(
  1069. [[1, 1, 1], [2, 2, 1]], columns=Index(["a", "b", None], dtype=object)
  1070. ),
  1071. marks=pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)"),
  1072. ),
  1073. [
  1074. True,
  1075. Series(
  1076. [1, 1], index=MultiIndex.from_tuples([(1, 1), (2, 2)], names=["a", "b"])
  1077. ),
  1078. ],
  1079. ],
  1080. )
  1081. def test_apply_as_index_constant_lambda(as_index, expected):
  1082. # GH 13217
  1083. df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 1, 2, 2], "c": [1, 1, 1, 1]})
  1084. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  1085. with tm.assert_produces_warning(FutureWarning, match=msg):
  1086. result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1)
  1087. tm.assert_equal(result, expected)
  1088. def test_sort_index_groups():
  1089. # GH 20420
  1090. df = DataFrame(
  1091. {"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 0], "C": [1, 1, 1, 2, 2]},
  1092. index=range(5),
  1093. )
  1094. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  1095. with tm.assert_produces_warning(FutureWarning, match=msg):
  1096. result = df.groupby("C").apply(lambda x: x.A.sort_index())
  1097. expected = Series(
  1098. range(1, 6),
  1099. index=MultiIndex.from_tuples(
  1100. [(1, 0), (1, 1), (1, 2), (2, 3), (2, 4)], names=["C", None]
  1101. ),
  1102. name="A",
  1103. )
  1104. tm.assert_series_equal(result, expected)
  1105. def test_positional_slice_groups_datetimelike():
  1106. # GH 21651
  1107. expected = DataFrame(
  1108. {
  1109. "date": pd.date_range("2010-01-01", freq="12h", periods=5),
  1110. "vals": range(5),
  1111. "let": list("abcde"),
  1112. }
  1113. )
  1114. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  1115. with tm.assert_produces_warning(FutureWarning, match=msg):
  1116. result = expected.groupby(
  1117. [expected.let, expected.date.dt.date], group_keys=False
  1118. ).apply(lambda x: x.iloc[0:])
  1119. tm.assert_frame_equal(result, expected)
  1120. def test_groupby_apply_shape_cache_safety():
  1121. # GH#42702 this fails if we cache_readonly Block.shape
  1122. df = DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3], "C": [4, 6, 5]})
  1123. gb = df.groupby("A")
  1124. result = gb[["B", "C"]].apply(lambda x: x.astype(float).max() - x.min())
  1125. expected = DataFrame(
  1126. {"B": [1.0, 0.0], "C": [2.0, 0.0]}, index=Index(["a", "b"], name="A")
  1127. )
  1128. tm.assert_frame_equal(result, expected)
  1129. def test_groupby_apply_to_series_name():
  1130. # GH52444
  1131. df = DataFrame.from_dict(
  1132. {
  1133. "a": ["a", "b", "a", "b"],
  1134. "b1": ["aa", "ac", "ac", "ad"],
  1135. "b2": ["aa", "aa", "aa", "ac"],
  1136. }
  1137. )
  1138. grp = df.groupby("a")[["b1", "b2"]]
  1139. result = grp.apply(lambda x: x.unstack().value_counts())
  1140. expected_idx = MultiIndex.from_arrays(
  1141. arrays=[["a", "a", "b", "b", "b"], ["aa", "ac", "ac", "ad", "aa"]],
  1142. names=["a", None],
  1143. )
  1144. expected = Series([3, 1, 2, 1, 1], index=expected_idx, name="count")
  1145. tm.assert_series_equal(result, expected)
  1146. @pytest.mark.parametrize("dropna", [True, False])
  1147. def test_apply_na(dropna):
  1148. # GH#28984
  1149. df = DataFrame(
  1150. {"grp": [1, 1, 2, 2], "y": [1, 0, 2, 5], "z": [1, 2, np.nan, np.nan]}
  1151. )
  1152. dfgrp = df.groupby("grp", dropna=dropna)
  1153. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  1154. with tm.assert_produces_warning(FutureWarning, match=msg):
  1155. result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z"))
  1156. with tm.assert_produces_warning(FutureWarning, match=msg):
  1157. expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1))
  1158. tm.assert_frame_equal(result, expected)
  1159. def test_apply_empty_string_nan_coerce_bug():
  1160. # GH#24903
  1161. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  1162. with tm.assert_produces_warning(FutureWarning, match=msg):
  1163. result = (
  1164. DataFrame(
  1165. {
  1166. "a": [1, 1, 2, 2],
  1167. "b": ["", "", "", ""],
  1168. "c": pd.to_datetime([1, 2, 3, 4], unit="s"),
  1169. }
  1170. )
  1171. .groupby(["a", "b"])
  1172. .apply(lambda df: df.iloc[-1])
  1173. )
  1174. expected = DataFrame(
  1175. [[1, "", pd.to_datetime(2, unit="s")], [2, "", pd.to_datetime(4, unit="s")]],
  1176. columns=["a", "b", "c"],
  1177. index=MultiIndex.from_tuples([(1, ""), (2, "")], names=["a", "b"]),
  1178. )
  1179. tm.assert_frame_equal(result, expected)
  1180. @pytest.mark.parametrize("index_values", [[1, 2, 3], [1.0, 2.0, 3.0]])
  1181. def test_apply_index_key_error_bug(index_values):
  1182. # GH 44310
  1183. result = DataFrame(
  1184. {
  1185. "a": ["aa", "a2", "a3"],
  1186. "b": [1, 2, 3],
  1187. },
  1188. index=Index(index_values),
  1189. )
  1190. expected = DataFrame(
  1191. {
  1192. "b_mean": [2.0, 3.0, 1.0],
  1193. },
  1194. index=Index(["a2", "a3", "aa"], name="a"),
  1195. )
  1196. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  1197. with tm.assert_produces_warning(FutureWarning, match=msg):
  1198. result = result.groupby("a").apply(
  1199. lambda df: Series([df["b"].mean()], index=["b_mean"])
  1200. )
  1201. tm.assert_frame_equal(result, expected)
  1202. @pytest.mark.parametrize(
  1203. "arg,idx",
  1204. [
  1205. [
  1206. [
  1207. 1,
  1208. 2,
  1209. 3,
  1210. ],
  1211. [
  1212. 0.1,
  1213. 0.3,
  1214. 0.2,
  1215. ],
  1216. ],
  1217. [
  1218. [
  1219. 1,
  1220. 2,
  1221. 3,
  1222. ],
  1223. [
  1224. 0.1,
  1225. 0.2,
  1226. 0.3,
  1227. ],
  1228. ],
  1229. [
  1230. [
  1231. 1,
  1232. 4,
  1233. 3,
  1234. ],
  1235. [
  1236. 0.1,
  1237. 0.4,
  1238. 0.2,
  1239. ],
  1240. ],
  1241. ],
  1242. )
  1243. def test_apply_nonmonotonic_float_index(arg, idx):
  1244. # GH 34455
  1245. expected = DataFrame({"col": arg}, index=idx)
  1246. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  1247. with tm.assert_produces_warning(FutureWarning, match=msg):
  1248. result = expected.groupby("col", group_keys=False).apply(lambda x: x)
  1249. tm.assert_frame_equal(result, expected)
  1250. @pytest.mark.parametrize("args, kwargs", [([True], {}), ([], {"numeric_only": True})])
  1251. def test_apply_str_with_args(df, args, kwargs):
  1252. # GH#46479
  1253. gb = df.groupby("A")
  1254. result = gb.apply("sum", *args, **kwargs)
  1255. expected = gb.sum(numeric_only=True)
  1256. tm.assert_frame_equal(result, expected)
  1257. @pytest.mark.parametrize("name", ["some_name", None])
  1258. def test_result_name_when_one_group(name):
  1259. # GH 46369
  1260. ser = Series([1, 2], name=name)
  1261. result = ser.groupby(["a", "a"], group_keys=False).apply(lambda x: x)
  1262. expected = Series([1, 2], name=name)
  1263. tm.assert_series_equal(result, expected)
  1264. @pytest.mark.parametrize(
  1265. "method, op",
  1266. [
  1267. ("apply", lambda gb: gb.values[-1]),
  1268. ("apply", lambda gb: gb["b"].iloc[0]),
  1269. ("agg", "skew"),
  1270. ("agg", "prod"),
  1271. ("agg", "sum"),
  1272. ],
  1273. )
  1274. def test_empty_df(method, op):
  1275. # GH 47985
  1276. empty_df = DataFrame({"a": [], "b": []})
  1277. gb = empty_df.groupby("a", group_keys=True)
  1278. group = getattr(gb, "b")
  1279. result = getattr(group, method)(op)
  1280. expected = Series(
  1281. [], name="b", dtype="float64", index=Index([], dtype="float64", name="a")
  1282. )
  1283. tm.assert_series_equal(result, expected)
  1284. @pytest.mark.parametrize("include_groups", [True, False])
  1285. def test_include_groups(include_groups):
  1286. # GH#7155
  1287. df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
  1288. gb = df.groupby("a")
  1289. warn = FutureWarning if include_groups else None
  1290. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  1291. with tm.assert_produces_warning(warn, match=msg):
  1292. result = gb.apply(lambda x: x.sum(), include_groups=include_groups)
  1293. expected = DataFrame({"a": [2, 2], "b": [7, 5]}, index=Index([1, 2], name="a"))
  1294. if not include_groups:
  1295. expected = expected[["b"]]
  1296. tm.assert_frame_equal(result, expected)
  1297. @pytest.mark.parametrize("f", [max, min, sum])
  1298. @pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]]) # Single key # Multi-key
  1299. def test_builtins_apply(keys, f):
  1300. # see gh-8155
  1301. rs = np.random.default_rng(2)
  1302. df = DataFrame(rs.integers(1, 7, (10, 2)), columns=["jim", "joe"])
  1303. df["jolie"] = rs.standard_normal(10)
  1304. gb = df.groupby(keys)
  1305. fname = f.__name__
  1306. warn = None if f is not sum else FutureWarning
  1307. msg = "The behavior of DataFrame.sum with axis=None is deprecated"
  1308. with tm.assert_produces_warning(
  1309. warn, match=msg, check_stacklevel=False, raise_on_extra_warnings=False
  1310. ):
  1311. # Also warns on deprecation GH#53425
  1312. result = gb.apply(f)
  1313. ngroups = len(df.drop_duplicates(subset=keys))
  1314. assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))"
  1315. assert result.shape == (ngroups, 3), assert_msg
  1316. npfunc = lambda x: getattr(np, fname)(x, axis=0) # numpy's equivalent function
  1317. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  1318. with tm.assert_produces_warning(FutureWarning, match=msg):
  1319. expected = gb.apply(npfunc)
  1320. tm.assert_frame_equal(result, expected)
  1321. with tm.assert_produces_warning(FutureWarning, match=msg):
  1322. expected2 = gb.apply(lambda x: npfunc(x))
  1323. tm.assert_frame_equal(result, expected2)
  1324. if f != sum:
  1325. expected = gb.agg(fname).reset_index()
  1326. expected.set_index(keys, inplace=True, drop=False)
  1327. tm.assert_frame_equal(result, expected, check_dtype=False)
  1328. tm.assert_series_equal(getattr(result, fname)(axis=0), getattr(df, fname)(axis=0))