# test_reductions.py (39 KB)
import builtins
import datetime as dt
from string import ascii_lowercase

import numpy as np
import pytest

from pandas._libs.tslibs import iNaT

from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.missing import na_value_for_dtype

import pandas as pd
from pandas import (
    DataFrame,
    MultiIndex,
    Series,
    Timestamp,
    date_range,
    isna,
)
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args
from pandas.util import _test_decorators as td
  21. @pytest.mark.parametrize("agg_func", ["any", "all"])
  22. @pytest.mark.parametrize(
  23. "vals",
  24. [
  25. ["foo", "bar", "baz"],
  26. ["foo", "", ""],
  27. ["", "", ""],
  28. [1, 2, 3],
  29. [1, 0, 0],
  30. [0, 0, 0],
  31. [1.0, 2.0, 3.0],
  32. [1.0, 0.0, 0.0],
  33. [0.0, 0.0, 0.0],
  34. [True, True, True],
  35. [True, False, False],
  36. [False, False, False],
  37. [np.nan, np.nan, np.nan],
  38. ],
  39. )
  40. def test_groupby_bool_aggs(skipna, agg_func, vals):
  41. df = DataFrame({"key": ["a"] * 3 + ["b"] * 3, "val": vals * 2})
  42. # Figure out expectation using Python builtin
  43. exp = getattr(builtins, agg_func)(vals)
  44. # edge case for missing data with skipna and 'any'
  45. if skipna and all(isna(vals)) and agg_func == "any":
  46. exp = False
  47. expected = DataFrame(
  48. [exp] * 2, columns=["val"], index=pd.Index(["a", "b"], name="key")
  49. )
  50. result = getattr(df.groupby("key"), agg_func)(skipna=skipna)
  51. tm.assert_frame_equal(result, expected)
  52. def test_any():
  53. df = DataFrame(
  54. [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
  55. columns=["A", "B", "C"],
  56. )
  57. expected = DataFrame(
  58. [[True, True], [False, True]], columns=["B", "C"], index=[1, 3]
  59. )
  60. expected.index.name = "A"
  61. result = df.groupby("A").any()
  62. tm.assert_frame_equal(result, expected)
  63. @pytest.mark.parametrize("bool_agg_func", ["any", "all"])
  64. def test_bool_aggs_dup_column_labels(bool_agg_func):
  65. # GH#21668
  66. df = DataFrame([[True, True]], columns=["a", "a"])
  67. grp_by = df.groupby([0])
  68. result = getattr(grp_by, bool_agg_func)()
  69. expected = df.set_axis(np.array([0]))
  70. tm.assert_frame_equal(result, expected)
  71. @pytest.mark.parametrize("bool_agg_func", ["any", "all"])
  72. @pytest.mark.parametrize(
  73. "data",
  74. [
  75. [False, False, False],
  76. [True, True, True],
  77. [pd.NA, pd.NA, pd.NA],
  78. [False, pd.NA, False],
  79. [True, pd.NA, True],
  80. [True, pd.NA, False],
  81. ],
  82. )
  83. def test_masked_kleene_logic(bool_agg_func, skipna, data):
  84. # GH#37506
  85. ser = Series(data, dtype="boolean")
  86. # The result should match aggregating on the whole series. Correctness
  87. # there is verified in test_reductions.py::test_any_all_boolean_kleene_logic
  88. expected_data = getattr(ser, bool_agg_func)(skipna=skipna)
  89. expected = Series(expected_data, index=np.array([0]), dtype="boolean")
  90. result = ser.groupby([0, 0, 0]).agg(bool_agg_func, skipna=skipna)
  91. tm.assert_series_equal(result, expected)
  92. @pytest.mark.parametrize(
  93. "dtype1,dtype2,exp_col1,exp_col2",
  94. [
  95. (
  96. "float",
  97. "Float64",
  98. np.array([True], dtype=bool),
  99. pd.array([pd.NA], dtype="boolean"),
  100. ),
  101. (
  102. "Int64",
  103. "float",
  104. pd.array([pd.NA], dtype="boolean"),
  105. np.array([True], dtype=bool),
  106. ),
  107. (
  108. "Int64",
  109. "Int64",
  110. pd.array([pd.NA], dtype="boolean"),
  111. pd.array([pd.NA], dtype="boolean"),
  112. ),
  113. (
  114. "Float64",
  115. "boolean",
  116. pd.array([pd.NA], dtype="boolean"),
  117. pd.array([pd.NA], dtype="boolean"),
  118. ),
  119. ],
  120. )
  121. def test_masked_mixed_types(dtype1, dtype2, exp_col1, exp_col2):
  122. # GH#37506
  123. data = [1.0, np.nan]
  124. df = DataFrame(
  125. {"col1": pd.array(data, dtype=dtype1), "col2": pd.array(data, dtype=dtype2)}
  126. )
  127. result = df.groupby([1, 1]).agg("all", skipna=False)
  128. expected = DataFrame({"col1": exp_col1, "col2": exp_col2}, index=np.array([1]))
  129. tm.assert_frame_equal(result, expected)
  130. @pytest.mark.parametrize("bool_agg_func", ["any", "all"])
  131. @pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
  132. def test_masked_bool_aggs_skipna(bool_agg_func, dtype, skipna, frame_or_series):
  133. # GH#40585
  134. obj = frame_or_series([pd.NA, 1], dtype=dtype)
  135. expected_res = True
  136. if not skipna and bool_agg_func == "all":
  137. expected_res = pd.NA
  138. expected = frame_or_series([expected_res], index=np.array([1]), dtype="boolean")
  139. result = obj.groupby([1, 1]).agg(bool_agg_func, skipna=skipna)
  140. tm.assert_equal(result, expected)
  141. @pytest.mark.parametrize(
  142. "bool_agg_func,data,expected_res",
  143. [
  144. ("any", [pd.NA, np.nan], False),
  145. ("any", [pd.NA, 1, np.nan], True),
  146. ("all", [pd.NA, pd.NaT], True),
  147. ("all", [pd.NA, False, pd.NaT], False),
  148. ],
  149. )
  150. def test_object_type_missing_vals(bool_agg_func, data, expected_res, frame_or_series):
  151. # GH#37501
  152. obj = frame_or_series(data, dtype=object)
  153. result = obj.groupby([1] * len(data)).agg(bool_agg_func)
  154. expected = frame_or_series([expected_res], index=np.array([1]), dtype="bool")
  155. tm.assert_equal(result, expected)
  156. @pytest.mark.parametrize("bool_agg_func", ["any", "all"])
  157. def test_object_NA_raises_with_skipna_false(bool_agg_func):
  158. # GH#37501
  159. ser = Series([pd.NA], dtype=object)
  160. with pytest.raises(TypeError, match="boolean value of NA is ambiguous"):
  161. ser.groupby([1]).agg(bool_agg_func, skipna=False)
  162. @pytest.mark.parametrize("bool_agg_func", ["any", "all"])
  163. def test_empty(frame_or_series, bool_agg_func):
  164. # GH 45231
  165. kwargs = {"columns": ["a"]} if frame_or_series is DataFrame else {"name": "a"}
  166. obj = frame_or_series(**kwargs, dtype=object)
  167. result = getattr(obj.groupby(obj.index), bool_agg_func)()
  168. expected = frame_or_series(**kwargs, dtype=bool)
  169. tm.assert_equal(result, expected)
  170. @pytest.mark.parametrize("how", ["idxmin", "idxmax"])
  171. def test_idxmin_idxmax_extremes(how, any_real_numpy_dtype):
  172. # GH#57040
  173. if any_real_numpy_dtype is int or any_real_numpy_dtype is float:
  174. # No need to test
  175. return
  176. info = np.iinfo if "int" in any_real_numpy_dtype else np.finfo
  177. min_value = info(any_real_numpy_dtype).min
  178. max_value = info(any_real_numpy_dtype).max
  179. df = DataFrame(
  180. {"a": [2, 1, 1, 2], "b": [min_value, max_value, max_value, min_value]},
  181. dtype=any_real_numpy_dtype,
  182. )
  183. gb = df.groupby("a")
  184. result = getattr(gb, how)()
  185. expected = DataFrame(
  186. {"b": [1, 0]}, index=pd.Index([1, 2], name="a", dtype=any_real_numpy_dtype)
  187. )
  188. tm.assert_frame_equal(result, expected)
  189. @pytest.mark.parametrize("how", ["idxmin", "idxmax"])
  190. def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype):
  191. # GH#57040
  192. min_value = np.finfo(float_numpy_dtype).min
  193. max_value = np.finfo(float_numpy_dtype).max
  194. df = DataFrame(
  195. {
  196. "a": Series(np.repeat(range(1, 6), repeats=2), dtype="intp"),
  197. "b": Series(
  198. [
  199. np.nan,
  200. min_value,
  201. np.nan,
  202. max_value,
  203. min_value,
  204. np.nan,
  205. max_value,
  206. np.nan,
  207. np.nan,
  208. np.nan,
  209. ],
  210. dtype=float_numpy_dtype,
  211. ),
  212. },
  213. )
  214. gb = df.groupby("a")
  215. warn = None if skipna else FutureWarning
  216. msg = f"The behavior of DataFrameGroupBy.{how} with all-NA values"
  217. with tm.assert_produces_warning(warn, match=msg):
  218. result = getattr(gb, how)(skipna=skipna)
  219. if skipna:
  220. values = [1, 3, 4, 6, np.nan]
  221. else:
  222. values = np.nan
  223. expected = DataFrame(
  224. {"b": values}, index=pd.Index(range(1, 6), name="a", dtype="intp")
  225. )
  226. tm.assert_frame_equal(result, expected)
  227. @pytest.mark.parametrize(
  228. "func, values",
  229. [
  230. ("idxmin", {"c_int": [0, 2], "c_float": [1, 3], "c_date": [1, 2]}),
  231. ("idxmax", {"c_int": [1, 3], "c_float": [0, 2], "c_date": [0, 3]}),
  232. ],
  233. )
  234. @pytest.mark.parametrize("numeric_only", [True, False])
  235. def test_idxmin_idxmax_returns_int_types(func, values, numeric_only):
  236. # GH 25444
  237. df = DataFrame(
  238. {
  239. "name": ["A", "A", "B", "B"],
  240. "c_int": [1, 2, 3, 4],
  241. "c_float": [4.02, 3.03, 2.04, 1.05],
  242. "c_date": ["2019", "2018", "2016", "2017"],
  243. }
  244. )
  245. df["c_date"] = pd.to_datetime(df["c_date"])
  246. df["c_date_tz"] = df["c_date"].dt.tz_localize("US/Pacific")
  247. df["c_timedelta"] = df["c_date"] - df["c_date"].iloc[0]
  248. df["c_period"] = df["c_date"].dt.to_period("W")
  249. df["c_Integer"] = df["c_int"].astype("Int64")
  250. df["c_Floating"] = df["c_float"].astype("Float64")
  251. result = getattr(df.groupby("name"), func)(numeric_only=numeric_only)
  252. expected = DataFrame(values, index=pd.Index(["A", "B"], name="name"))
  253. if numeric_only:
  254. expected = expected.drop(columns=["c_date"])
  255. else:
  256. expected["c_date_tz"] = expected["c_date"]
  257. expected["c_timedelta"] = expected["c_date"]
  258. expected["c_period"] = expected["c_date"]
  259. expected["c_Integer"] = expected["c_int"]
  260. expected["c_Floating"] = expected["c_float"]
  261. tm.assert_frame_equal(result, expected)
  262. @pytest.mark.parametrize(
  263. "data",
  264. [
  265. (
  266. Timestamp("2011-01-15 12:50:28.502376"),
  267. Timestamp("2011-01-20 12:50:28.593448"),
  268. ),
  269. (24650000000000001, 24650000000000002),
  270. ],
  271. )
  272. @pytest.mark.parametrize("method", ["count", "min", "max", "first", "last"])
  273. def test_groupby_non_arithmetic_agg_int_like_precision(method, data):
  274. # GH#6620, GH#9311
  275. df = DataFrame({"a": [1, 1], "b": data})
  276. grouped = df.groupby("a")
  277. result = getattr(grouped, method)()
  278. if method == "count":
  279. expected_value = 2
  280. elif method == "first":
  281. expected_value = data[0]
  282. elif method == "last":
  283. expected_value = data[1]
  284. else:
  285. expected_value = getattr(df["b"], method)()
  286. expected = DataFrame({"b": [expected_value]}, index=pd.Index([1], name="a"))
  287. tm.assert_frame_equal(result, expected)
  288. @pytest.mark.parametrize("how", ["first", "last"])
  289. def test_first_last_skipna(any_real_nullable_dtype, sort, skipna, how):
  290. # GH#57019
  291. na_value = na_value_for_dtype(pandas_dtype(any_real_nullable_dtype))
  292. df = DataFrame(
  293. {
  294. "a": [2, 1, 1, 2, 3, 3],
  295. "b": [na_value, 3.0, na_value, 4.0, np.nan, np.nan],
  296. "c": [na_value, 3.0, na_value, 4.0, np.nan, np.nan],
  297. },
  298. dtype=any_real_nullable_dtype,
  299. )
  300. gb = df.groupby("a", sort=sort)
  301. method = getattr(gb, how)
  302. result = method(skipna=skipna)
  303. ilocs = {
  304. ("first", True): [3, 1, 4],
  305. ("first", False): [0, 1, 4],
  306. ("last", True): [3, 1, 5],
  307. ("last", False): [3, 2, 5],
  308. }[how, skipna]
  309. expected = df.iloc[ilocs].set_index("a")
  310. if sort:
  311. expected = expected.sort_index()
  312. tm.assert_frame_equal(result, expected)
  313. def test_idxmin_idxmax_axis1():
  314. df = DataFrame(
  315. np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"]
  316. )
  317. df["A"] = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4]
  318. gb = df.groupby("A")
  319. warn_msg = "DataFrameGroupBy.idxmax with axis=1 is deprecated"
  320. with tm.assert_produces_warning(FutureWarning, match=warn_msg):
  321. res = gb.idxmax(axis=1)
  322. alt = df.iloc[:, 1:].idxmax(axis=1)
  323. indexer = res.index.get_level_values(1)
  324. tm.assert_series_equal(alt[indexer], res.droplevel("A"))
  325. df["E"] = date_range("2016-01-01", periods=10)
  326. gb2 = df.groupby("A")
  327. msg = "'>' not supported between instances of 'Timestamp' and 'float'"
  328. with pytest.raises(TypeError, match=msg):
  329. with tm.assert_produces_warning(FutureWarning, match=warn_msg):
  330. gb2.idxmax(axis=1)
  331. def test_groupby_mean_no_overflow():
  332. # Regression test for (#22487)
  333. df = DataFrame(
  334. {
  335. "user": ["A", "A", "A", "A", "A"],
  336. "connections": [4970, 4749, 4719, 4704, 18446744073699999744],
  337. }
  338. )
  339. assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840
  340. def test_mean_on_timedelta():
  341. # GH 17382
  342. df = DataFrame({"time": pd.to_timedelta(range(10)), "cat": ["A", "B"] * 5})
  343. result = df.groupby("cat")["time"].mean()
  344. expected = Series(
  345. pd.to_timedelta([4, 5]), name="time", index=pd.Index(["A", "B"], name="cat")
  346. )
  347. tm.assert_series_equal(result, expected)
  348. def test_cython_median():
  349. arr = np.random.default_rng(2).standard_normal(1000)
  350. arr[::2] = np.nan
  351. df = DataFrame(arr)
  352. labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float)
  353. labels[::17] = np.nan
  354. result = df.groupby(labels).median()
  355. msg = "using DataFrameGroupBy.median"
  356. with tm.assert_produces_warning(FutureWarning, match=msg):
  357. exp = df.groupby(labels).agg(np.nanmedian)
  358. tm.assert_frame_equal(result, exp)
  359. df = DataFrame(np.random.default_rng(2).standard_normal((1000, 5)))
  360. msg = "using DataFrameGroupBy.median"
  361. with tm.assert_produces_warning(FutureWarning, match=msg):
  362. rs = df.groupby(labels).agg(np.median)
  363. xp = df.groupby(labels).median()
  364. tm.assert_frame_equal(rs, xp)
  365. def test_median_empty_bins(observed):
  366. df = DataFrame(np.random.default_rng(2).integers(0, 44, 500))
  367. grps = range(0, 55, 5)
  368. bins = pd.cut(df[0], grps)
  369. result = df.groupby(bins, observed=observed).median()
  370. expected = df.groupby(bins, observed=observed).agg(lambda x: x.median())
  371. tm.assert_frame_equal(result, expected)
  372. def test_max_min_non_numeric():
  373. # #2700
  374. aa = DataFrame({"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]})
  375. result = aa.groupby("nn").max()
  376. assert "ss" in result
  377. result = aa.groupby("nn").max(numeric_only=False)
  378. assert "ss" in result
  379. result = aa.groupby("nn").min()
  380. assert "ss" in result
  381. result = aa.groupby("nn").min(numeric_only=False)
  382. assert "ss" in result
  383. def test_max_min_object_multiple_columns(using_array_manager, using_infer_string):
  384. # GH#41111 case where the aggregation is valid for some columns but not
  385. # others; we split object blocks column-wise, consistent with
  386. # DataFrame._reduce
  387. df = DataFrame(
  388. {
  389. "A": [1, 1, 2, 2, 3],
  390. "B": [1, "foo", 2, "bar", False],
  391. "C": ["a", "b", "c", "d", "e"],
  392. }
  393. )
  394. df._consolidate_inplace() # should already be consolidate, but double-check
  395. if not using_array_manager:
  396. assert len(df._mgr.blocks) == 3 if using_infer_string else 2
  397. gb = df.groupby("A")
  398. result = gb[["C"]].max()
  399. # "max" is valid for column "C" but not for "B"
  400. ei = pd.Index([1, 2, 3], name="A")
  401. expected = DataFrame({"C": ["b", "d", "e"]}, index=ei)
  402. tm.assert_frame_equal(result, expected)
  403. result = gb[["C"]].min()
  404. # "min" is valid for column "C" but not for "B"
  405. ei = pd.Index([1, 2, 3], name="A")
  406. expected = DataFrame({"C": ["a", "c", "e"]}, index=ei)
  407. tm.assert_frame_equal(result, expected)
  408. def test_min_date_with_nans():
  409. # GH26321
  410. dates = pd.to_datetime(
  411. Series(["2019-05-09", "2019-05-09", "2019-05-09"]), format="%Y-%m-%d"
  412. ).dt.date
  413. df = DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates})
  414. result = df.groupby("b", as_index=False)["c"].min()["c"]
  415. expected = pd.to_datetime(
  416. Series(["2019-05-09", "2019-05-09"], name="c"), format="%Y-%m-%d"
  417. ).dt.date
  418. tm.assert_series_equal(result, expected)
  419. result = df.groupby("b")["c"].min()
  420. expected.index.name = "b"
  421. tm.assert_series_equal(result, expected)
  422. def test_max_inat():
  423. # GH#40767 dont interpret iNaT as NaN
  424. ser = Series([1, iNaT])
  425. key = np.array([1, 1], dtype=np.int64)
  426. gb = ser.groupby(key)
  427. result = gb.max(min_count=2)
  428. expected = Series({1: 1}, dtype=np.int64)
  429. tm.assert_series_equal(result, expected, check_exact=True)
  430. result = gb.min(min_count=2)
  431. expected = Series({1: iNaT}, dtype=np.int64)
  432. tm.assert_series_equal(result, expected, check_exact=True)
  433. # not enough entries -> gets masked to NaN
  434. result = gb.min(min_count=3)
  435. expected = Series({1: np.nan})
  436. tm.assert_series_equal(result, expected, check_exact=True)
  437. def test_max_inat_not_all_na():
  438. # GH#40767 dont interpret iNaT as NaN
  439. # make sure we dont round iNaT+1 to iNaT
  440. ser = Series([1, iNaT, 2, iNaT + 1])
  441. gb = ser.groupby([1, 2, 3, 3])
  442. result = gb.min(min_count=2)
  443. # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. is lossy
  444. expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1})
  445. expected.index = expected.index.astype(int)
  446. tm.assert_series_equal(result, expected, check_exact=True)
  447. @pytest.mark.parametrize("func", ["min", "max"])
  448. def test_groupby_aggregate_period_column(func):
  449. # GH 31471
  450. groups = [1, 2]
  451. periods = pd.period_range("2020", periods=2, freq="Y")
  452. df = DataFrame({"a": groups, "b": periods})
  453. result = getattr(df.groupby("a")["b"], func)()
  454. idx = pd.Index([1, 2], name="a")
  455. expected = Series(periods, index=idx, name="b")
  456. tm.assert_series_equal(result, expected)
  457. @pytest.mark.parametrize("func", ["min", "max"])
  458. def test_groupby_aggregate_period_frame(func):
  459. # GH 31471
  460. groups = [1, 2]
  461. periods = pd.period_range("2020", periods=2, freq="Y")
  462. df = DataFrame({"a": groups, "b": periods})
  463. result = getattr(df.groupby("a"), func)()
  464. idx = pd.Index([1, 2], name="a")
  465. expected = DataFrame({"b": periods}, index=idx)
  466. tm.assert_frame_equal(result, expected)
  467. def test_aggregate_numeric_object_dtype():
  468. # https://github.com/pandas-dev/pandas/issues/39329
  469. # simplified case: multiple object columns where one is all-NaN
  470. # -> gets split as the all-NaN is inferred as float
  471. df = DataFrame(
  472. {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4},
  473. ).astype(object)
  474. result = df.groupby("key").min()
  475. expected = (
  476. DataFrame(
  477. {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]},
  478. )
  479. .set_index("key")
  480. .astype(object)
  481. )
  482. tm.assert_frame_equal(result, expected)
  483. # same but with numbers
  484. df = DataFrame(
  485. {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)},
  486. ).astype(object)
  487. result = df.groupby("key").min()
  488. expected = (
  489. DataFrame({"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]})
  490. .set_index("key")
  491. .astype(object)
  492. )
  493. tm.assert_frame_equal(result, expected)
  494. @pytest.mark.parametrize("func", ["min", "max"])
  495. def test_aggregate_categorical_lost_index(func: str):
  496. # GH: 28641 groupby drops index, when grouping over categorical column with min/max
  497. ds = Series(["b"], dtype="category").cat.as_ordered()
  498. df = DataFrame({"A": [1997], "B": ds})
  499. result = df.groupby("A").agg({"B": func})
  500. expected = DataFrame({"B": ["b"]}, index=pd.Index([1997], name="A"))
  501. # ordered categorical dtype should be preserved
  502. expected["B"] = expected["B"].astype(ds.dtype)
  503. tm.assert_frame_equal(result, expected)
  504. @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Float64", "Float32", "boolean"])
  505. def test_groupby_min_max_nullable(dtype):
  506. if dtype == "Int64":
  507. # GH#41743 avoid precision loss
  508. ts = 1618556707013635762
  509. elif dtype == "boolean":
  510. ts = 0
  511. else:
  512. ts = 4.0
  513. df = DataFrame({"id": [2, 2], "ts": [ts, ts + 1]})
  514. df["ts"] = df["ts"].astype(dtype)
  515. gb = df.groupby("id")
  516. result = gb.min()
  517. expected = df.iloc[:1].set_index("id")
  518. tm.assert_frame_equal(result, expected)
  519. res_max = gb.max()
  520. expected_max = df.iloc[1:].set_index("id")
  521. tm.assert_frame_equal(res_max, expected_max)
  522. result2 = gb.min(min_count=3)
  523. expected2 = DataFrame({"ts": [pd.NA]}, index=expected.index, dtype=dtype)
  524. tm.assert_frame_equal(result2, expected2)
  525. res_max2 = gb.max(min_count=3)
  526. tm.assert_frame_equal(res_max2, expected2)
  527. # Case with NA values
  528. df2 = DataFrame({"id": [2, 2, 2], "ts": [ts, pd.NA, ts + 1]})
  529. df2["ts"] = df2["ts"].astype(dtype)
  530. gb2 = df2.groupby("id")
  531. result3 = gb2.min()
  532. tm.assert_frame_equal(result3, expected)
  533. res_max3 = gb2.max()
  534. tm.assert_frame_equal(res_max3, expected_max)
  535. result4 = gb2.min(min_count=100)
  536. tm.assert_frame_equal(result4, expected2)
  537. res_max4 = gb2.max(min_count=100)
  538. tm.assert_frame_equal(res_max4, expected2)
  539. def test_min_max_nullable_uint64_empty_group():
  540. # don't raise NotImplementedError from libgroupby
  541. cat = pd.Categorical([0] * 10, categories=[0, 1])
  542. df = DataFrame({"A": cat, "B": pd.array(np.arange(10, dtype=np.uint64))})
  543. gb = df.groupby("A", observed=False)
  544. res = gb.min()
  545. idx = pd.CategoricalIndex([0, 1], dtype=cat.dtype, name="A")
  546. expected = DataFrame({"B": pd.array([0, pd.NA], dtype="UInt64")}, index=idx)
  547. tm.assert_frame_equal(res, expected)
  548. res = gb.max()
  549. expected.iloc[0, 0] = 9
  550. tm.assert_frame_equal(res, expected)
  551. @pytest.mark.parametrize("func", ["first", "last", "min", "max"])
  552. def test_groupby_min_max_categorical(func):
  553. # GH: 52151
  554. df = DataFrame(
  555. {
  556. "col1": pd.Categorical(["A"], categories=list("AB"), ordered=True),
  557. "col2": pd.Categorical([1], categories=[1, 2], ordered=True),
  558. "value": 0.1,
  559. }
  560. )
  561. result = getattr(df.groupby("col1", observed=False), func)()
  562. idx = pd.CategoricalIndex(data=["A", "B"], name="col1", ordered=True)
  563. expected = DataFrame(
  564. {
  565. "col2": pd.Categorical([1, None], categories=[1, 2], ordered=True),
  566. "value": [0.1, None],
  567. },
  568. index=idx,
  569. )
  570. tm.assert_frame_equal(result, expected)
  571. @pytest.mark.parametrize("func", ["min", "max"])
  572. def test_min_empty_string_dtype(func, string_dtype_no_object):
  573. # GH#55619
  574. dtype = string_dtype_no_object
  575. df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0]
  576. result = getattr(df.groupby("a"), func)()
  577. expected = DataFrame(
  578. columns=["b", "c"], dtype=dtype, index=pd.Index([], dtype=dtype, name="a")
  579. )
  580. tm.assert_frame_equal(result, expected)
  581. @pytest.mark.parametrize("min_count", [0, 1])
  582. @pytest.mark.parametrize("test_series", [True, False])
  583. def test_string_dtype_all_na(
  584. string_dtype_no_object, reduction_func, min_count, test_series
  585. ):
  586. # https://github.com/pandas-dev/pandas/issues/60985
  587. if reduction_func == "corrwith":
  588. # corrwith is deprecated.
  589. return
  590. dtype = string_dtype_no_object
  591. if reduction_func in [
  592. "any",
  593. "all",
  594. "idxmin",
  595. "idxmax",
  596. "mean",
  597. "median",
  598. "std",
  599. "var",
  600. ]:
  601. kwargs = {}
  602. elif reduction_func in ["kurt"]:
  603. kwargs = {"min_count": min_count}
  604. elif reduction_func in ["count", "nunique", "quantile", "sem", "size"]:
  605. kwargs = {}
  606. else:
  607. kwargs = {"min_count": min_count}
  608. expected_dtype, expected_value = dtype, pd.NA
  609. if reduction_func in ["all", "any"]:
  610. expected_dtype = "bool"
  611. # TODO: For skipna=False, bool(pd.NA) raises; should groupby?
  612. expected_value = False if reduction_func == "any" else True
  613. elif reduction_func in ["count", "nunique", "size"]:
  614. # TODO: Should be more consistent - return Int64 when dtype.na_value is pd.NA?
  615. if (
  616. test_series
  617. and reduction_func == "size"
  618. and dtype.storage == "pyarrow"
  619. and dtype.na_value is pd.NA
  620. ):
  621. expected_dtype = "Int64"
  622. else:
  623. expected_dtype = "int64"
  624. expected_value = 1 if reduction_func == "size" else 0
  625. elif reduction_func in ["idxmin", "idxmax"]:
  626. expected_dtype, expected_value = "float64", np.nan
  627. elif min_count > 0:
  628. expected_value = pd.NA
  629. elif reduction_func == "sum":
  630. # https://github.com/pandas-dev/pandas/pull/60936
  631. expected_value = ""
  632. df = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype)
  633. obj = df["b"] if test_series else df
  634. args = get_groupby_method_args(reduction_func, obj)
  635. gb = obj.groupby(df["a"])
  636. method = getattr(gb, reduction_func)
  637. if reduction_func in [
  638. "mean",
  639. "median",
  640. "kurt",
  641. "prod",
  642. "quantile",
  643. "sem",
  644. "skew",
  645. "std",
  646. "var",
  647. ]:
  648. msg = f"dtype '{dtype}' does not support operation '{reduction_func}'"
  649. with pytest.raises(TypeError, match=msg):
  650. method(*args, **kwargs)
  651. return
  652. result = method(*args, **kwargs)
  653. index = pd.Index(["x"], name="a", dtype=dtype)
  654. if test_series or reduction_func == "size":
  655. name = None if not test_series and reduction_func == "size" else "b"
  656. expected = Series(expected_value, index=index, dtype=expected_dtype, name=name)
  657. else:
  658. expected = DataFrame({"b": expected_value}, index=index, dtype=expected_dtype)
  659. tm.assert_equal(result, expected)
  660. @pytest.mark.parametrize("min_count", [0, 1])
  661. def test_string_dtype_empty_sum(string_dtype_no_object, min_count):
  662. # https://github.com/pandas-dev/pandas/issues/60229
  663. dtype = string_dtype_no_object
  664. df = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype)
  665. gb = df.groupby("a")
  666. result = gb.sum(min_count=min_count)
  667. value = "" if min_count == 0 else pd.NA
  668. expected = DataFrame(
  669. {"b": value}, index=pd.Index(["x"], name="a", dtype=dtype), dtype=dtype
  670. )
  671. tm.assert_frame_equal(result, expected)
  672. def test_max_nan_bug():
  673. df = DataFrame(
  674. {
  675. "Unnamed: 0": ["-04-23", "-05-06", "-05-07"],
  676. "Date": [
  677. "2013-04-23 00:00:00",
  678. "2013-05-06 00:00:00",
  679. "2013-05-07 00:00:00",
  680. ],
  681. "app": Series([np.nan, np.nan, "OE"]),
  682. "File": ["log080001.log", "log.log", "xlsx"],
  683. }
  684. )
  685. gb = df.groupby("Date")
  686. r = gb[["File"]].max()
  687. e = gb["File"].max().to_frame()
  688. tm.assert_frame_equal(r, e)
  689. assert not r["File"].isna().any()
  @pytest.mark.slow
  @pytest.mark.parametrize("sort", [False, True])
  @pytest.mark.parametrize("dropna", [False, True])
  @pytest.mark.parametrize("as_index", [True, False])
  @pytest.mark.parametrize("with_nan", [True, False])
  @pytest.mark.parametrize("keys", [["joe"], ["joe", "jim"]])
  def test_series_groupby_nunique(sort, dropna, as_index, with_nan, keys):
      # SeriesGroupBy.nunique is compared against applying Series.nunique to
      # each group; the grouped frame itself must come out unmodified.
      n_rows = 100
      n_values = 10
      days = date_range("2015-08-23", periods=10)

      columns = {
          "jim": np.random.default_rng(2).choice(list(ascii_lowercase), n_rows),
          "joe": np.random.default_rng(2).choice(days, n_rows),
          "julie": np.random.default_rng(2).integers(0, n_values, n_rows),
      }
      df = DataFrame(columns)

      if with_nan:
          # Explicit cast to avoid implicit cast when None is written below
          df = df.astype({"julie": float})
          df.loc[1::17, "jim"] = None
          df.loc[3::37, "joe"] = None
          for start in (7, 8, 9):
              df.loc[start::19, "julie"] = None
      original_df = df.copy()

      groupby_kwargs = {"as_index": as_index, "sort": sort}
      left = df.groupby(keys, **groupby_kwargs)["julie"].nunique(dropna=dropna)
      right = df.groupby(keys, **groupby_kwargs)["julie"].apply(
          Series.nunique, dropna=dropna
      )

      if as_index:
          tm.assert_series_equal(left, right, check_names=False)
      else:
          right = right.reset_index(drop=True)
          tm.assert_frame_equal(left, right, check_names=False)
      tm.assert_frame_equal(df, original_df)
  def test_nunique():
      df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")})
      counts = {"B": [1, 2, 1], "C": [1, 1, 2]}
      group_index = pd.Index(list("abc"), name="A")

      # as_index=False keeps the group key as a regular column
      expected = DataFrame({"A": list("abc"), **counts})
      result = df.groupby("A", as_index=False).nunique()
      tm.assert_frame_equal(result, expected)

      # as_index
      expected = DataFrame(counts, index=group_index)
      result = df.groupby("A").nunique()
      tm.assert_frame_equal(result, expected)

      # with na
      with_na = df.replace({"x": None})
      result = with_na.groupby("A").nunique(dropna=False)
      tm.assert_frame_equal(result, expected)

      # dropna
      expected = DataFrame({"B": [1] * 3, "C": [1] * 3}, index=group_index)
      result = df.replace({"x": None}).groupby("A").nunique()
      tm.assert_frame_equal(result, expected)
  def test_nunique_with_object():
      # GH 11077
      rows = [
          [100, 1, "Alice"],
          [200, 2, "Bob"],
          [300, 3, "Charlie"],
          [-400, 4, "Dan"],
          [500, 5, "Edith"],
      ]
      data = DataFrame(rows, columns=["amount", "id", "name"])

      result = data.groupby(["id", "amount"])["name"].nunique()

      # Every (id, amount) pair occurs once, so each group has one unique name.
      expected_index = MultiIndex.from_arrays([data["id"], data["amount"]])
      expected = Series([1] * 5, name="name", index=expected_index)
      tm.assert_series_equal(result, expected)
  def test_nunique_with_empty_series():
      # GH 12553
      # An empty object Series grouped by its index is expected to give an
      # empty int64 Series that keeps the name.
      empty = Series(name="name", dtype=object)
      counts = empty.groupby(level=0).nunique()
      tm.assert_series_equal(counts, Series(name="name", dtype="int64"))
  def test_nunique_with_timegrouper():
      # GH 13453
      timestamps = [
          Timestamp("2016-06-28 09:35:35"),
          Timestamp("2016-06-28 16:09:30"),
          Timestamp("2016-06-28 16:46:28"),
      ]
      test = DataFrame({"time": timestamps, "data": ["1", "2", "3"]})
      test = test.set_index("time")

      def hourly_data(frame):
          # A fresh Grouper is built for every groupby call, as in a plain
          # one-liner per call.
          return frame.groupby(pd.Grouper(freq="h"))["data"]

      result = hourly_data(test).nunique()
      expected = hourly_data(test).apply(Series.nunique)
      tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize(
      "key, data, dropna, expected",
      [
          pytest.param(
              ["x", "x", "x"],
              [Timestamp("2019-01-01"), pd.NaT, Timestamp("2019-01-01")],
              True,
              Series([1], index=pd.Index(["x"], name="key"), name="data"),
          ),
          pytest.param(
              ["x", "x", "x"],
              [dt.date(2019, 1, 1), pd.NaT, dt.date(2019, 1, 1)],
              True,
              Series([1], index=pd.Index(["x"], name="key"), name="data"),
          ),
          pytest.param(
              ["x", "x", "x", "y", "y"],
              [
                  dt.date(2019, 1, 1),
                  pd.NaT,
                  dt.date(2019, 1, 1),
                  pd.NaT,
                  dt.date(2019, 1, 1),
              ],
              False,
              Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"),
          ),
          pytest.param(
              ["x", "x", "x", "x", "y"],
              [
                  dt.date(2019, 1, 1),
                  pd.NaT,
                  dt.date(2019, 1, 1),
                  pd.NaT,
                  dt.date(2019, 1, 1),
              ],
              False,
              Series([2, 1], index=pd.Index(["x", "y"], name="key"), name="data"),
          ),
      ],
  )
  def test_nunique_with_NaT(key, data, dropna, expected):
      # GH 27951
      frame = DataFrame({"key": key, "data": data})
      grouped = frame.groupby(["key"])["data"]
      tm.assert_series_equal(grouped.nunique(dropna=dropna), expected)
  def test_nunique_preserves_column_level_names():
      # GH 23222
      named_columns = pd.Index(["A"], name="level_0")
      test = DataFrame([1, 2, 2], columns=named_columns)

      result = test.groupby([0, 0, 0]).nunique()

      # The expected frame reuses the input's columns, name included.
      expected = DataFrame([2], index=np.array([0]), columns=test.columns)
      tm.assert_frame_equal(result, expected)
  def test_nunique_transform_with_datetime():
      # GH 35109 - transform with nunique on datetimes results in integers
      dates = date_range("2008-12-31", "2009-01-02")
      df = DataFrame(dates, columns=["date"])

      grouped_dates = df.groupby([0, 0, 1])["date"]
      result = grouped_dates.transform("nunique")

      tm.assert_series_equal(result, Series([2, 2, 1], name="date"))
  840. def test_empty_categorical(observed):
  841. # GH#21334
  842. cat = Series([1]).astype("category")
  843. ser = cat[:0]
  844. gb = ser.groupby(ser, observed=observed)
  845. result = gb.nunique()
  846. if observed:
  847. expected = Series([], index=cat[:0], dtype="int64")
  848. else:
  849. expected = Series([0], index=cat, dtype="int64")
  850. tm.assert_series_equal(result, expected)
  851. def test_intercept_builtin_sum():
  852. s = Series([1.0, 2.0, np.nan, 3.0])
  853. grouped = s.groupby([0, 1, 2, 2])
  854. msg = "using SeriesGroupBy.sum"
  855. with tm.assert_produces_warning(FutureWarning, match=msg):
  856. # GH#53425
  857. result = grouped.agg(builtins.sum)
  858. msg = "using np.sum"
  859. with tm.assert_produces_warning(FutureWarning, match=msg):
  860. # GH#53425
  861. result2 = grouped.apply(builtins.sum)
  862. expected = grouped.sum()
  863. tm.assert_series_equal(result, expected)
  864. tm.assert_series_equal(result2, expected)
  @pytest.mark.parametrize("min_count", [0, 10])
  def test_groupby_sum_mincount_boolean(min_count):
      flags = pd.array(
          [True, True, np.nan, np.nan, False, False, True], dtype="boolean"
      )
      df = DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": flags})

      result = df.groupby("A").sum(min_count=min_count)

      # min_count=0 expects plain per-group sums; any other min_count expects
      # every group to be NA.
      sums = [3, 0, 0] if min_count == 0 else [pd.NA] * 3
      expected = DataFrame(
          {"B": pd.array(sums, dtype="Int64")},
          index=pd.Index([1, 2, 3], name="A"),
      )
      tm.assert_frame_equal(result, expected)
  def test_groupby_sum_below_mincount_nullable_integer():
      # https://github.com/pandas-dev/pandas/issues/32861
      df = DataFrame({col: [0, 1, 2] for col in ("a", "b", "c")}, dtype="Int64")
      grouped = df.groupby("a")
      idx = pd.Index([0, 1, 2], name="a", dtype="Int64")
      all_na = [pd.NA] * 3  # each group has one row, fewer than min_count=2

      # Series path
      result = grouped["b"].sum(min_count=2)
      expected = Series(all_na, dtype="Int64", index=idx, name="b")
      tm.assert_series_equal(result, expected)

      # DataFrame path
      result = grouped.sum(min_count=2)
      expected = DataFrame(
          {"b": list(all_na), "c": list(all_na)}, dtype="Int64", index=idx
      )
      tm.assert_frame_equal(result, expected)
  def test_groupby_sum_timedelta_with_nat():
      # GH#42659
      deltas = [pd.Timedelta("1d"), pd.Timedelta("2d"), pd.Timedelta("3d"), pd.NaT]
      df = DataFrame({"a": [1, 1, 2, 2], "b": deltas})
      td3 = pd.Timedelta(days=3)
      group_index = pd.Index([1, 2], name="a")

      gb = df.groupby("a")

      # Frame-level sum: both groups are expected to total three days.
      expected_frame = DataFrame({"b": [td3, td3]}, index=group_index)
      tm.assert_frame_equal(gb.sum(), expected_frame)

      # Series-level sum is checked against the same column.
      tm.assert_series_equal(gb["b"].sum(), expected_frame["b"])

      # With min_count=2, the second group is expected to be NaT.
      res = gb["b"].sum(min_count=2)
      expected = Series(
          [td3, pd.NaT], dtype="m8[ns]", name="b", index=expected_frame.index
      )
      tm.assert_series_equal(res, expected)
  @pytest.mark.parametrize(
      "dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"]
  )
  @pytest.mark.parametrize(
      "method,data",
      [
          ("first", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}),
          ("last", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}),
          ("min", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}),
          ("max", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}),
          ("count", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 2}], "out_type": "int64"}),
      ],
  )
  def test_groupby_non_arithmetic_agg_types(dtype, method, data):
      # GH9311, GH6620
      #
      # ``data`` describes the expected outcome for ``method``:
      #   "df"       - records of the expected frame (required)
      #   "out_type" - dtype of the expected "b" column (defaults to ``dtype``)
      #   "args"     - positional arguments for the groupby method (default none)
      #
      # pytest hands the very same ``data`` dict to every ``dtype`` variant, so
      # it is only read here and never written to.
      df = DataFrame(
          [{"a": 1, "b": 1}, {"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 2, "b": 4}]
      )
      df["b"] = df.b.astype(dtype)

      args = data.get("args", ())
      out_type = data.get("out_type", dtype)

      df_out = DataFrame(data["df"])
      df_out["b"] = df_out.b.astype(out_type)
      df_out = df_out.set_index("a")

      result = getattr(df.groupby("a"), method)(*args)
      tm.assert_frame_equal(result, df_out)
  def scipy_sem(*args, **kwargs):
      # Forward everything to scipy.stats.sem with ddof fixed at 1.
      # scipy is imported inside the function rather than at module level.
      import scipy.stats

      return scipy.stats.sem(*args, ddof=1, **kwargs)
  @pytest.mark.parametrize(
      "op,targop",
      [
          ("mean", np.mean),
          ("median", np.median),
          ("std", np.std),
          ("var", np.var),
          ("sum", np.sum),
          ("prod", np.prod),
          ("min", np.min),
          ("max", np.max),
          ("first", lambda x: x.iloc[0]),
          ("last", lambda x: x.iloc[-1]),
          ("count", np.size),
          pytest.param("sem", scipy_sem, marks=td.skip_if_no("scipy")),
      ],
  )
  def test_ops_general(op, targop):
      # The named groupby method is compared with ``agg`` given the matching
      # callable.
      values = np.random.default_rng(2).standard_normal(1000)
      labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float)
      df = DataFrame(values)

      result = getattr(df.groupby(labels), op)()

      # No warning is expected for these ops; all others expect a FutureWarning.
      if op in ("first", "last", "count", "sem"):
          warn = None
      else:
          warn = FutureWarning
      with tm.assert_produces_warning(warn, match=f"using DataFrameGroupBy.{op}"):
          expected = df.groupby(labels).agg(targop)

      tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize(
      "values",
      [
          {
              "a": [1, 1, 1, 2, 2, 2, 3, 3, 3],
              "b": [1, pd.NA, 2, 1, pd.NA, 2, 1, pd.NA, 2],
          },
          {"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 1, 2, 1, 2]},
      ],
  )
  @pytest.mark.parametrize("function", ["mean", "median", "var"])
  def test_apply_to_nullable_integer_returns_float(values, function):
      # https://github.com/pandas-dev/pandas/issues/32219
      if function == "var":
          output = 0.5
      else:
          output = 1.5
      expected = DataFrame(
          {"b": np.full(3, output, dtype=float)},
          index=pd.Index([1, 2, 3], name="a", dtype="Int64"),
      ).astype("Float64")

      groups = DataFrame(values, dtype="Int64").groupby("a")

      # Direct method call
      direct = getattr(groups, function)()
      tm.assert_frame_equal(direct, expected)

      # agg with the method name
      by_name = groups.agg(function)
      tm.assert_frame_equal(by_name, expected)

      # agg with a list: the expected columns become a (column, function) pair
      by_list = groups.agg([function])
      expected.columns = MultiIndex.from_tuples([("b", function)])
      tm.assert_frame_equal(by_list, expected)
  1000. @pytest.mark.parametrize(
  1001. "op",
  1002. [
  1003. "sum",
  1004. "prod",
  1005. "min",
  1006. "max",
  1007. "median",
  1008. "mean",
  1009. "skew",
  1010. "std",
  1011. "var",
  1012. "sem",
  1013. ],
  1014. )
  1015. @pytest.mark.parametrize("axis", [0, 1])
  1016. @pytest.mark.parametrize("skipna", [True, False])
  1017. @pytest.mark.parametrize("sort", [True, False])
  1018. def test_regression_allowlist_methods(op, axis, skipna, sort):
  1019. # GH6944
  1020. # GH 17537
  1021. # explicitly test the allowlist methods
  1022. raw_frame = DataFrame([0])
  1023. if axis == 0:
  1024. frame = raw_frame
  1025. msg = "The 'axis' keyword in DataFrame.groupby is deprecated and will be"
  1026. else:
  1027. frame = raw_frame.T
  1028. msg = "DataFrame.groupby with axis=1 is deprecated"
  1029. with tm.assert_produces_warning(FutureWarning, match=msg):
  1030. grouped = frame.groupby(level=0, axis=axis, sort=sort)
  1031. if op == "skew":
  1032. # skew has skipna
  1033. result = getattr(grouped, op)(skipna=skipna)
  1034. expected = frame.groupby(level=0).apply(
  1035. lambda h: getattr(h, op)(axis=axis, skipna=skipna)
  1036. )
  1037. if sort:
  1038. expected = expected.sort_index(axis=axis)
  1039. tm.assert_frame_equal(result, expected)
  1040. else:
  1041. result = getattr(grouped, op)()
  1042. expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)(axis=axis))
  1043. if sort:
  1044. expected = expected.sort_index(axis=axis)
  1045. tm.assert_frame_equal(result, expected)
  def test_groupby_prod_with_int64_dtype():
      # GH#46573
      # All rows share group key 1; only the "B" factors differ.
      b_factors = [11, 41, 17, 37, 7, 29, 31, 2, 3, 43, 5, 47, 19, 88]
      rows = [[1, factor] for factor in b_factors]
      df = DataFrame(rows, columns=["A", "B"], dtype="int64")

      result = df.groupby(["A"]).prod().reset_index()

      expected = DataFrame({"A": [1], "B": [180970905912331920]}, dtype="int64")
      tm.assert_frame_equal(result, expected)