test_grouping.py 45 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238
"""
Tests for determining what is being grouped, and for retrieving groups.
"""
  4. from datetime import (
  5. date,
  6. timedelta,
  7. )
  8. import numpy as np
  9. import pytest
  10. import pandas as pd
  11. from pandas import (
  12. CategoricalIndex,
  13. DataFrame,
  14. Grouper,
  15. Index,
  16. MultiIndex,
  17. Series,
  18. Timestamp,
  19. date_range,
  20. period_range,
  21. )
  22. import pandas._testing as tm
  23. from pandas.core.groupby.grouper import Grouping
  24. # selection
  25. # --------------------------------
  26. class TestSelection:
  27. def test_select_bad_cols(self):
  28. df = DataFrame([[1, 2]], columns=["A", "B"])
  29. g = df.groupby("A")
  30. with pytest.raises(KeyError, match="\"Columns not found: 'C'\""):
  31. g[["C"]]
  32. with pytest.raises(KeyError, match="^[^A]+$"):
  33. # A should not be referenced as a bad column...
  34. # will have to rethink regex if you change message!
  35. g[["A", "C"]]
  36. def test_groupby_duplicated_column_errormsg(self):
  37. # GH7511
  38. df = DataFrame(
  39. columns=["A", "B", "A", "C"], data=[range(4), range(2, 6), range(0, 8, 2)]
  40. )
  41. msg = "Grouper for 'A' not 1-dimensional"
  42. with pytest.raises(ValueError, match=msg):
  43. df.groupby("A")
  44. with pytest.raises(ValueError, match=msg):
  45. df.groupby(["A", "B"])
  46. grouped = df.groupby("B")
  47. c = grouped.count()
  48. assert c.columns.nlevels == 1
  49. assert c.columns.size == 3
  50. def test_column_select_via_attr(self, df):
  51. result = df.groupby("A").C.sum()
  52. expected = df.groupby("A")["C"].sum()
  53. tm.assert_series_equal(result, expected)
  54. df["mean"] = 1.5
  55. result = df.groupby("A").mean(numeric_only=True)
  56. expected = df.groupby("A")[["C", "D", "mean"]].agg("mean")
  57. tm.assert_frame_equal(result, expected)
  58. def test_getitem_list_of_columns(self):
  59. df = DataFrame(
  60. {
  61. "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
  62. "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
  63. "C": np.random.default_rng(2).standard_normal(8),
  64. "D": np.random.default_rng(2).standard_normal(8),
  65. "E": np.random.default_rng(2).standard_normal(8),
  66. }
  67. )
  68. result = df.groupby("A")[["C", "D"]].mean()
  69. result2 = df.groupby("A")[df.columns[2:4]].mean()
  70. expected = df.loc[:, ["A", "C", "D"]].groupby("A").mean()
  71. tm.assert_frame_equal(result, expected)
  72. tm.assert_frame_equal(result2, expected)
  73. def test_getitem_numeric_column_names(self):
  74. # GH #13731
  75. df = DataFrame(
  76. {
  77. 0: list("abcd") * 2,
  78. 2: np.random.default_rng(2).standard_normal(8),
  79. 4: np.random.default_rng(2).standard_normal(8),
  80. 6: np.random.default_rng(2).standard_normal(8),
  81. }
  82. )
  83. result = df.groupby(0)[df.columns[1:3]].mean()
  84. result2 = df.groupby(0)[[2, 4]].mean()
  85. expected = df.loc[:, [0, 2, 4]].groupby(0).mean()
  86. tm.assert_frame_equal(result, expected)
  87. tm.assert_frame_equal(result2, expected)
  88. # per GH 23566 enforced deprecation raises a ValueError
  89. with pytest.raises(ValueError, match="Cannot subset columns with a tuple"):
  90. df.groupby(0)[2, 4].mean()
  91. def test_getitem_single_tuple_of_columns_raises(self, df):
  92. # per GH 23566 enforced deprecation raises a ValueError
  93. with pytest.raises(ValueError, match="Cannot subset columns with a tuple"):
  94. df.groupby("A")["C", "D"].mean()
  95. def test_getitem_single_column(self):
  96. df = DataFrame(
  97. {
  98. "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
  99. "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
  100. "C": np.random.default_rng(2).standard_normal(8),
  101. "D": np.random.default_rng(2).standard_normal(8),
  102. "E": np.random.default_rng(2).standard_normal(8),
  103. }
  104. )
  105. result = df.groupby("A")["C"].mean()
  106. as_frame = df.loc[:, ["A", "C"]].groupby("A").mean()
  107. as_series = as_frame.iloc[:, 0]
  108. expected = as_series
  109. tm.assert_series_equal(result, expected)
  110. @pytest.mark.parametrize(
  111. "func", [lambda x: x.sum(), lambda x: x.agg(lambda y: y.sum())]
  112. )
  113. def test_getitem_from_grouper(self, func):
  114. # GH 50383
  115. df = DataFrame({"a": [1, 1, 2], "b": 3, "c": 4, "d": 5})
  116. gb = df.groupby(["a", "b"])[["a", "c"]]
  117. idx = MultiIndex.from_tuples([(1, 3), (2, 3)], names=["a", "b"])
  118. expected = DataFrame({"a": [2, 2], "c": [8, 4]}, index=idx)
  119. result = func(gb)
  120. tm.assert_frame_equal(result, expected)
  121. def test_indices_grouped_by_tuple_with_lambda(self):
  122. # GH 36158
  123. df = DataFrame(
  124. {
  125. "Tuples": (
  126. (x, y)
  127. for x in [0, 1]
  128. for y in np.random.default_rng(2).integers(3, 5, 5)
  129. )
  130. }
  131. )
  132. gb = df.groupby("Tuples")
  133. gb_lambda = df.groupby(lambda x: df.iloc[x, 0])
  134. expected = gb.indices
  135. result = gb_lambda.indices
  136. tm.assert_dict_equal(result, expected)
  137. # grouping
  138. # --------------------------------
  139. class TestGrouping:
  140. @pytest.mark.parametrize(
  141. "index",
  142. [
  143. Index(list("abcde")),
  144. Index(np.arange(5)),
  145. Index(np.arange(5, dtype=float)),
  146. date_range("2020-01-01", periods=5),
  147. period_range("2020-01-01", periods=5),
  148. ],
  149. )
  150. def test_grouper_index_types(self, index):
  151. # related GH5375
  152. # groupby misbehaving when using a Floatlike index
  153. df = DataFrame(np.arange(10).reshape(5, 2), columns=list("AB"), index=index)
  154. df.groupby(list("abcde"), group_keys=False).apply(lambda x: x)
  155. df.index = df.index[::-1]
  156. df.groupby(list("abcde"), group_keys=False).apply(lambda x: x)
  157. def test_grouper_multilevel_freq(self):
  158. # GH 7885
  159. # with level and freq specified in a Grouper
  160. d0 = date.today() - timedelta(days=14)
  161. dates = date_range(d0, date.today())
  162. date_index = MultiIndex.from_product([dates, dates], names=["foo", "bar"])
  163. df = DataFrame(np.random.default_rng(2).integers(0, 100, 225), index=date_index)
  164. # Check string level
  165. expected = (
  166. df.reset_index()
  167. .groupby([Grouper(key="foo", freq="W"), Grouper(key="bar", freq="W")])
  168. .sum()
  169. )
  170. # reset index changes columns dtype to object
  171. expected.columns = Index([0], dtype="int64")
  172. result = df.groupby(
  173. [Grouper(level="foo", freq="W"), Grouper(level="bar", freq="W")]
  174. ).sum()
  175. tm.assert_frame_equal(result, expected)
  176. # Check integer level
  177. result = df.groupby(
  178. [Grouper(level=0, freq="W"), Grouper(level=1, freq="W")]
  179. ).sum()
  180. tm.assert_frame_equal(result, expected)
  181. def test_grouper_creation_bug(self):
  182. # GH 8795
  183. df = DataFrame({"A": [0, 0, 1, 1, 2, 2], "B": [1, 2, 3, 4, 5, 6]})
  184. g = df.groupby("A")
  185. expected = g.sum()
  186. g = df.groupby(Grouper(key="A"))
  187. result = g.sum()
  188. tm.assert_frame_equal(result, expected)
  189. msg = "Grouper axis keyword is deprecated and will be removed"
  190. with tm.assert_produces_warning(FutureWarning, match=msg):
  191. gpr = Grouper(key="A", axis=0)
  192. g = df.groupby(gpr)
  193. result = g.sum()
  194. tm.assert_frame_equal(result, expected)
  195. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  196. with tm.assert_produces_warning(FutureWarning, match=msg):
  197. result = g.apply(lambda x: x.sum())
  198. expected["A"] = [0, 2, 4]
  199. expected = expected.loc[:, ["A", "B"]]
  200. tm.assert_frame_equal(result, expected)
  201. def test_grouper_creation_bug2(self):
  202. # GH14334
  203. # Grouper(key=...) may be passed in a list
  204. df = DataFrame(
  205. {"A": [0, 0, 0, 1, 1, 1], "B": [1, 1, 2, 2, 3, 3], "C": [1, 2, 3, 4, 5, 6]}
  206. )
  207. # Group by single column
  208. expected = df.groupby("A").sum()
  209. g = df.groupby([Grouper(key="A")])
  210. result = g.sum()
  211. tm.assert_frame_equal(result, expected)
  212. # Group by two columns
  213. # using a combination of strings and Grouper objects
  214. expected = df.groupby(["A", "B"]).sum()
  215. # Group with two Grouper objects
  216. g = df.groupby([Grouper(key="A"), Grouper(key="B")])
  217. result = g.sum()
  218. tm.assert_frame_equal(result, expected)
  219. # Group with a string and a Grouper object
  220. g = df.groupby(["A", Grouper(key="B")])
  221. result = g.sum()
  222. tm.assert_frame_equal(result, expected)
  223. # Group with a Grouper object and a string
  224. g = df.groupby([Grouper(key="A"), "B"])
  225. result = g.sum()
  226. tm.assert_frame_equal(result, expected)
  227. def test_grouper_creation_bug3(self, unit):
  228. # GH8866
  229. dti = date_range("20130101", periods=2, unit=unit)
  230. mi = MultiIndex.from_product(
  231. [list("ab"), range(2), dti],
  232. names=["one", "two", "three"],
  233. )
  234. ser = Series(
  235. np.arange(8, dtype="int64"),
  236. index=mi,
  237. )
  238. result = ser.groupby(Grouper(level="three", freq="ME")).sum()
  239. exp_dti = pd.DatetimeIndex(
  240. [Timestamp("2013-01-31")], freq="ME", name="three"
  241. ).as_unit(unit)
  242. expected = Series(
  243. [28],
  244. index=exp_dti,
  245. )
  246. tm.assert_series_equal(result, expected)
  247. # just specifying a level breaks
  248. result = ser.groupby(Grouper(level="one")).sum()
  249. expected = ser.groupby(level="one").sum()
  250. tm.assert_series_equal(result, expected)
  251. @pytest.mark.parametrize("func", [False, True])
  252. def test_grouper_returning_tuples(self, func):
  253. # GH 22257 , both with dict and with callable
  254. df = DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]})
  255. mapping = dict(zip(range(4), [("C", 5), ("D", 6)] * 2))
  256. if func:
  257. gb = df.groupby(by=lambda idx: mapping[idx], sort=False)
  258. else:
  259. gb = df.groupby(by=mapping, sort=False)
  260. name, expected = next(iter(gb))
  261. assert name == ("C", 5)
  262. result = gb.get_group(name)
  263. tm.assert_frame_equal(result, expected)
  264. def test_grouper_column_and_index(self):
  265. # GH 14327
  266. # Grouping a multi-index frame by a column and an index level should
  267. # be equivalent to resetting the index and grouping by two columns
  268. idx = MultiIndex.from_tuples(
  269. [("a", 1), ("a", 2), ("a", 3), ("b", 1), ("b", 2), ("b", 3)]
  270. )
  271. idx.names = ["outer", "inner"]
  272. df_multi = DataFrame(
  273. {"A": np.arange(6), "B": ["one", "one", "two", "two", "one", "one"]},
  274. index=idx,
  275. )
  276. result = df_multi.groupby(["B", Grouper(level="inner")]).mean(numeric_only=True)
  277. expected = (
  278. df_multi.reset_index().groupby(["B", "inner"]).mean(numeric_only=True)
  279. )
  280. tm.assert_frame_equal(result, expected)
  281. # Test the reverse grouping order
  282. result = df_multi.groupby([Grouper(level="inner"), "B"]).mean(numeric_only=True)
  283. expected = (
  284. df_multi.reset_index().groupby(["inner", "B"]).mean(numeric_only=True)
  285. )
  286. tm.assert_frame_equal(result, expected)
  287. # Grouping a single-index frame by a column and the index should
  288. # be equivalent to resetting the index and grouping by two columns
  289. df_single = df_multi.reset_index("outer")
  290. result = df_single.groupby(["B", Grouper(level="inner")]).mean(
  291. numeric_only=True
  292. )
  293. expected = (
  294. df_single.reset_index().groupby(["B", "inner"]).mean(numeric_only=True)
  295. )
  296. tm.assert_frame_equal(result, expected)
  297. # Test the reverse grouping order
  298. result = df_single.groupby([Grouper(level="inner"), "B"]).mean(
  299. numeric_only=True
  300. )
  301. expected = (
  302. df_single.reset_index().groupby(["inner", "B"]).mean(numeric_only=True)
  303. )
  304. tm.assert_frame_equal(result, expected)
  305. def test_groupby_levels_and_columns(self):
  306. # GH9344, GH9049
  307. idx_names = ["x", "y"]
  308. idx = MultiIndex.from_tuples([(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names)
  309. df = DataFrame(np.arange(12).reshape(-1, 3), index=idx)
  310. by_levels = df.groupby(level=idx_names).mean()
  311. # reset_index changes columns dtype to object
  312. by_columns = df.reset_index().groupby(idx_names).mean()
  313. # without casting, by_columns.columns is object-dtype
  314. by_columns.columns = by_columns.columns.astype(np.int64)
  315. tm.assert_frame_equal(by_levels, by_columns)
  316. def test_groupby_categorical_index_and_columns(self, observed):
  317. # GH18432, adapted for GH25871
  318. columns = ["A", "B", "A", "B"]
  319. categories = ["B", "A"]
  320. data = np.array(
  321. [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2]], int
  322. )
  323. cat_columns = CategoricalIndex(columns, categories=categories, ordered=True)
  324. df = DataFrame(data=data, columns=cat_columns)
  325. depr_msg = "DataFrame.groupby with axis=1 is deprecated"
  326. with tm.assert_produces_warning(FutureWarning, match=depr_msg):
  327. result = df.groupby(axis=1, level=0, observed=observed).sum()
  328. expected_data = np.array([[4, 2], [4, 2], [4, 2], [4, 2], [4, 2]], int)
  329. expected_columns = CategoricalIndex(
  330. categories, categories=categories, ordered=True
  331. )
  332. expected = DataFrame(data=expected_data, columns=expected_columns)
  333. tm.assert_frame_equal(result, expected)
  334. # test transposed version
  335. df = DataFrame(data.T, index=cat_columns)
  336. msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
  337. with tm.assert_produces_warning(FutureWarning, match=msg):
  338. result = df.groupby(axis=0, level=0, observed=observed).sum()
  339. expected = DataFrame(data=expected_data.T, index=expected_columns)
  340. tm.assert_frame_equal(result, expected)
  341. def test_grouper_getting_correct_binner(self):
  342. # GH 10063
  343. # using a non-time-based grouper and a time-based grouper
  344. # and specifying levels
  345. df = DataFrame(
  346. {"A": 1},
  347. index=MultiIndex.from_product(
  348. [list("ab"), date_range("20130101", periods=80)], names=["one", "two"]
  349. ),
  350. )
  351. result = df.groupby(
  352. [Grouper(level="one"), Grouper(level="two", freq="ME")]
  353. ).sum()
  354. expected = DataFrame(
  355. {"A": [31, 28, 21, 31, 28, 21]},
  356. index=MultiIndex.from_product(
  357. [list("ab"), date_range("20130101", freq="ME", periods=3)],
  358. names=["one", "two"],
  359. ),
  360. )
  361. tm.assert_frame_equal(result, expected)
  362. def test_grouper_iter(self, df):
  363. gb = df.groupby("A")
  364. msg = "DataFrameGroupBy.grouper is deprecated"
  365. with tm.assert_produces_warning(FutureWarning, match=msg):
  366. grouper = gb.grouper
  367. result = sorted(grouper)
  368. expected = ["bar", "foo"]
  369. assert result == expected
  370. def test_empty_groups(self, df):
  371. # see gh-1048
  372. with pytest.raises(ValueError, match="No group keys passed!"):
  373. df.groupby([])
  374. def test_groupby_grouper(self, df):
  375. grouped = df.groupby("A")
  376. msg = "DataFrameGroupBy.grouper is deprecated"
  377. with tm.assert_produces_warning(FutureWarning, match=msg):
  378. grouper = grouped.grouper
  379. result = df.groupby(grouper).mean(numeric_only=True)
  380. expected = grouped.mean(numeric_only=True)
  381. tm.assert_frame_equal(result, expected)
  382. def test_groupby_dict_mapping(self):
  383. # GH #679
  384. s = Series({"T1": 5})
  385. result = s.groupby({"T1": "T2"}).agg("sum")
  386. expected = s.groupby(["T2"]).agg("sum")
  387. tm.assert_series_equal(result, expected)
  388. s = Series([1.0, 2.0, 3.0, 4.0], index=list("abcd"))
  389. mapping = {"a": 0, "b": 0, "c": 1, "d": 1}
  390. result = s.groupby(mapping).mean()
  391. result2 = s.groupby(mapping).agg("mean")
  392. exp_key = np.array([0, 0, 1, 1], dtype=np.int64)
  393. expected = s.groupby(exp_key).mean()
  394. expected2 = s.groupby(exp_key).mean()
  395. tm.assert_series_equal(result, expected)
  396. tm.assert_series_equal(result, result2)
  397. tm.assert_series_equal(result, expected2)
  398. @pytest.mark.parametrize(
  399. "index",
  400. [
  401. [0, 1, 2, 3],
  402. ["a", "b", "c", "d"],
  403. [Timestamp(2021, 7, 28 + i) for i in range(4)],
  404. ],
  405. )
  406. def test_groupby_series_named_with_tuple(self, frame_or_series, index):
  407. # GH 42731
  408. obj = frame_or_series([1, 2, 3, 4], index=index)
  409. groups = Series([1, 0, 1, 0], index=index, name=("a", "a"))
  410. result = obj.groupby(groups).last()
  411. expected = frame_or_series([4, 3])
  412. expected.index.name = ("a", "a")
  413. tm.assert_equal(result, expected)
  414. def test_groupby_grouper_f_sanity_checked(self):
  415. dates = date_range("01-Jan-2013", periods=12, freq="MS")
  416. ts = Series(np.random.default_rng(2).standard_normal(12), index=dates)
  417. # GH51979
  418. # simple check that the passed function doesn't operates on the whole index
  419. msg = "'Timestamp' object is not subscriptable"
  420. with pytest.raises(TypeError, match=msg):
  421. ts.groupby(lambda key: key[0:6])
  422. result = ts.groupby(lambda x: x).sum()
  423. expected = ts.groupby(ts.index).sum()
  424. expected.index.freq = None
  425. tm.assert_series_equal(result, expected)
  426. def test_groupby_with_datetime_key(self):
  427. # GH 51158
  428. df = DataFrame(
  429. {
  430. "id": ["a", "b"] * 3,
  431. "b": date_range("2000-01-01", "2000-01-03", freq="9h"),
  432. }
  433. )
  434. grouper = Grouper(key="b", freq="D")
  435. gb = df.groupby([grouper, "id"])
  436. # test number of groups
  437. expected = {
  438. (Timestamp("2000-01-01"), "a"): [0, 2],
  439. (Timestamp("2000-01-01"), "b"): [1],
  440. (Timestamp("2000-01-02"), "a"): [4],
  441. (Timestamp("2000-01-02"), "b"): [3, 5],
  442. }
  443. tm.assert_dict_equal(gb.groups, expected)
  444. # test number of group keys
  445. assert len(gb.groups.keys()) == 4
  446. def test_grouping_error_on_multidim_input(self, df):
  447. msg = "Grouper for '<class 'pandas.core.frame.DataFrame'>' not 1-dimensional"
  448. with pytest.raises(ValueError, match=msg):
  449. Grouping(df.index, df[["A", "A"]])
  450. def test_multiindex_passthru(self):
  451. # GH 7997
  452. # regression from 0.14.1
  453. df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
  454. df.columns = MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)])
  455. depr_msg = "DataFrame.groupby with axis=1 is deprecated"
  456. with tm.assert_produces_warning(FutureWarning, match=depr_msg):
  457. gb = df.groupby(axis=1, level=[0, 1])
  458. result = gb.first()
  459. tm.assert_frame_equal(result, df)
  460. def test_multiindex_negative_level(self, multiindex_dataframe_random_data):
  461. # GH 13901
  462. result = multiindex_dataframe_random_data.groupby(level=-1).sum()
  463. expected = multiindex_dataframe_random_data.groupby(level="second").sum()
  464. tm.assert_frame_equal(result, expected)
  465. result = multiindex_dataframe_random_data.groupby(level=-2).sum()
  466. expected = multiindex_dataframe_random_data.groupby(level="first").sum()
  467. tm.assert_frame_equal(result, expected)
  468. result = multiindex_dataframe_random_data.groupby(level=[-2, -1]).sum()
  469. expected = multiindex_dataframe_random_data.sort_index()
  470. tm.assert_frame_equal(result, expected)
  471. result = multiindex_dataframe_random_data.groupby(level=[-1, "first"]).sum()
  472. expected = multiindex_dataframe_random_data.groupby(
  473. level=["second", "first"]
  474. ).sum()
  475. tm.assert_frame_equal(result, expected)
  476. def test_multifunc_select_col_integer_cols(self, df):
  477. df.columns = np.arange(len(df.columns))
  478. # it works!
  479. msg = "Passing a dictionary to SeriesGroupBy.agg is deprecated"
  480. with tm.assert_produces_warning(FutureWarning, match=msg):
  481. df.groupby(1, as_index=False)[2].agg({"Q": np.mean})
  482. def test_multiindex_columns_empty_level(self):
  483. lst = [["count", "values"], ["to filter", ""]]
  484. midx = MultiIndex.from_tuples(lst)
  485. df = DataFrame([[1, "A"]], columns=midx)
  486. grouped = df.groupby("to filter").groups
  487. assert grouped["A"] == [0]
  488. grouped = df.groupby([("to filter", "")]).groups
  489. assert grouped["A"] == [0]
  490. df = DataFrame([[1, "A"], [2, "B"]], columns=midx)
  491. expected = df.groupby("to filter").groups
  492. result = df.groupby([("to filter", "")]).groups
  493. assert result == expected
  494. df = DataFrame([[1, "A"], [2, "A"]], columns=midx)
  495. expected = df.groupby("to filter").groups
  496. result = df.groupby([("to filter", "")]).groups
  497. tm.assert_dict_equal(result, expected)
  498. def test_groupby_multiindex_tuple(self):
  499. # GH 17979
  500. df = DataFrame(
  501. [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]],
  502. columns=MultiIndex.from_arrays([["a", "b", "b", "c"], [1, 1, 2, 2]]),
  503. )
  504. expected = df.groupby([("b", 1)]).groups
  505. result = df.groupby(("b", 1)).groups
  506. tm.assert_dict_equal(expected, result)
  507. df2 = DataFrame(
  508. df.values,
  509. columns=MultiIndex.from_arrays(
  510. [["a", "b", "b", "c"], ["d", "d", "e", "e"]]
  511. ),
  512. )
  513. expected = df2.groupby([("b", "d")]).groups
  514. result = df.groupby(("b", 1)).groups
  515. tm.assert_dict_equal(expected, result)
  516. df3 = DataFrame(df.values, columns=[("a", "d"), ("b", "d"), ("b", "e"), "c"])
  517. expected = df3.groupby([("b", "d")]).groups
  518. result = df.groupby(("b", 1)).groups
  519. tm.assert_dict_equal(expected, result)
  520. def test_groupby_multiindex_partial_indexing_equivalence(self):
  521. # GH 17977
  522. df = DataFrame(
  523. [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]],
  524. columns=MultiIndex.from_arrays([["a", "b", "b", "c"], [1, 1, 2, 2]]),
  525. )
  526. expected_mean = df.groupby([("a", 1)])[[("b", 1), ("b", 2)]].mean()
  527. result_mean = df.groupby([("a", 1)])["b"].mean()
  528. tm.assert_frame_equal(expected_mean, result_mean)
  529. expected_sum = df.groupby([("a", 1)])[[("b", 1), ("b", 2)]].sum()
  530. result_sum = df.groupby([("a", 1)])["b"].sum()
  531. tm.assert_frame_equal(expected_sum, result_sum)
  532. expected_count = df.groupby([("a", 1)])[[("b", 1), ("b", 2)]].count()
  533. result_count = df.groupby([("a", 1)])["b"].count()
  534. tm.assert_frame_equal(expected_count, result_count)
  535. expected_min = df.groupby([("a", 1)])[[("b", 1), ("b", 2)]].min()
  536. result_min = df.groupby([("a", 1)])["b"].min()
  537. tm.assert_frame_equal(expected_min, result_min)
  538. expected_max = df.groupby([("a", 1)])[[("b", 1), ("b", 2)]].max()
  539. result_max = df.groupby([("a", 1)])["b"].max()
  540. tm.assert_frame_equal(expected_max, result_max)
  541. expected_groups = df.groupby([("a", 1)])[[("b", 1), ("b", 2)]].groups
  542. result_groups = df.groupby([("a", 1)])["b"].groups
  543. tm.assert_dict_equal(expected_groups, result_groups)
  544. @pytest.mark.parametrize("sort", [True, False])
  545. def test_groupby_level(self, sort, multiindex_dataframe_random_data, df):
  546. # GH 17537
  547. frame = multiindex_dataframe_random_data
  548. deleveled = frame.reset_index()
  549. result0 = frame.groupby(level=0, sort=sort).sum()
  550. result1 = frame.groupby(level=1, sort=sort).sum()
  551. expected0 = frame.groupby(deleveled["first"].values, sort=sort).sum()
  552. expected1 = frame.groupby(deleveled["second"].values, sort=sort).sum()
  553. expected0.index.name = "first"
  554. expected1.index.name = "second"
  555. assert result0.index.name == "first"
  556. assert result1.index.name == "second"
  557. tm.assert_frame_equal(result0, expected0)
  558. tm.assert_frame_equal(result1, expected1)
  559. assert result0.index.name == frame.index.names[0]
  560. assert result1.index.name == frame.index.names[1]
  561. # groupby level name
  562. result0 = frame.groupby(level="first", sort=sort).sum()
  563. result1 = frame.groupby(level="second", sort=sort).sum()
  564. tm.assert_frame_equal(result0, expected0)
  565. tm.assert_frame_equal(result1, expected1)
  566. # axis=1
  567. msg = "DataFrame.groupby with axis=1 is deprecated"
  568. with tm.assert_produces_warning(FutureWarning, match=msg):
  569. result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum()
  570. result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum()
  571. tm.assert_frame_equal(result0, expected0.T)
  572. tm.assert_frame_equal(result1, expected1.T)
  573. # raise exception for non-MultiIndex
  574. msg = "level > 0 or level < -1 only valid with MultiIndex"
  575. with pytest.raises(ValueError, match=msg):
  576. df.groupby(level=1)
  577. def test_groupby_level_index_names(self, axis):
  578. # GH4014 this used to raise ValueError since 'exp'>1 (in py2)
  579. df = DataFrame({"exp": ["A"] * 3 + ["B"] * 3, "var1": range(6)}).set_index(
  580. "exp"
  581. )
  582. if axis in (1, "columns"):
  583. df = df.T
  584. depr_msg = "DataFrame.groupby with axis=1 is deprecated"
  585. else:
  586. depr_msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
  587. with tm.assert_produces_warning(FutureWarning, match=depr_msg):
  588. df.groupby(level="exp", axis=axis)
  589. msg = f"level name foo is not the name of the {df._get_axis_name(axis)}"
  590. with pytest.raises(ValueError, match=msg):
  591. with tm.assert_produces_warning(FutureWarning, match=depr_msg):
  592. df.groupby(level="foo", axis=axis)
  593. @pytest.mark.parametrize("sort", [True, False])
  594. def test_groupby_level_with_nas(self, sort):
  595. # GH 17537
  596. index = MultiIndex(
  597. levels=[[1, 0], [0, 1, 2, 3]],
  598. codes=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
  599. )
  600. # factorizing doesn't confuse things
  601. s = Series(np.arange(8.0), index=index)
  602. result = s.groupby(level=0, sort=sort).sum()
  603. expected = Series([6.0, 22.0], index=[0, 1])
  604. tm.assert_series_equal(result, expected)
  605. index = MultiIndex(
  606. levels=[[1, 0], [0, 1, 2, 3]],
  607. codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
  608. )
  609. # factorizing doesn't confuse things
  610. s = Series(np.arange(8.0), index=index)
  611. result = s.groupby(level=0, sort=sort).sum()
  612. expected = Series([6.0, 18.0], index=[0.0, 1.0])
  613. tm.assert_series_equal(result, expected)
  614. def test_groupby_args(self, multiindex_dataframe_random_data):
  615. # PR8618 and issue 8015
  616. frame = multiindex_dataframe_random_data
  617. msg = "You have to supply one of 'by' and 'level'"
  618. with pytest.raises(TypeError, match=msg):
  619. frame.groupby()
  620. msg = "You have to supply one of 'by' and 'level'"
  621. with pytest.raises(TypeError, match=msg):
  622. frame.groupby(by=None, level=None)
  623. @pytest.mark.parametrize(
  624. "sort,labels",
  625. [
  626. [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]],
  627. [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]],
  628. ],
  629. )
  630. def test_level_preserve_order(self, sort, labels, multiindex_dataframe_random_data):
  631. # GH 17537
  632. grouped = multiindex_dataframe_random_data.groupby(level=0, sort=sort)
  633. exp_labels = np.array(labels, np.intp)
  634. tm.assert_almost_equal(grouped._grouper.codes[0], exp_labels)
  635. def test_grouping_labels(self, multiindex_dataframe_random_data):
  636. grouped = multiindex_dataframe_random_data.groupby(
  637. multiindex_dataframe_random_data.index.get_level_values(0)
  638. )
  639. exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp)
  640. tm.assert_almost_equal(grouped._grouper.codes[0], exp_labels)
  641. def test_list_grouper_with_nat(self):
  642. # GH 14715
  643. df = DataFrame({"date": date_range("1/1/2011", periods=365, freq="D")})
  644. df.iloc[-1] = pd.NaT
  645. grouper = Grouper(key="date", freq="YS")
  646. # Grouper in a list grouping
  647. result = df.groupby([grouper])
  648. expected = {Timestamp("2011-01-01"): Index(list(range(364)))}
  649. tm.assert_dict_equal(result.groups, expected)
  650. # Test case without a list
  651. result = df.groupby(grouper)
  652. expected = {Timestamp("2011-01-01"): 365}
  653. tm.assert_dict_equal(result.groups, expected)
  654. @pytest.mark.parametrize(
  655. "func,expected",
  656. [
  657. (
  658. "transform",
  659. Series(name=2, dtype=np.float64),
  660. ),
  661. (
  662. "agg",
  663. Series(
  664. name=2, dtype=np.float64, index=Index([], dtype=np.float64, name=1)
  665. ),
  666. ),
  667. (
  668. "apply",
  669. Series(
  670. name=2, dtype=np.float64, index=Index([], dtype=np.float64, name=1)
  671. ),
  672. ),
  673. ],
  674. )
  675. def test_evaluate_with_empty_groups(self, func, expected):
  676. # 26208
  677. # test transform'ing empty groups
  678. # (not testing other agg fns, because they return
  679. # different index objects.
  680. df = DataFrame({1: [], 2: []})
  681. g = df.groupby(1, group_keys=False)
  682. result = getattr(g[2], func)(lambda x: x)
  683. tm.assert_series_equal(result, expected)
  684. def test_groupby_empty(self):
  685. # https://github.com/pandas-dev/pandas/issues/27190
  686. s = Series([], name="name", dtype="float64")
  687. gr = s.groupby([])
  688. result = gr.mean()
  689. expected = s.set_axis(Index([], dtype=np.intp))
  690. tm.assert_series_equal(result, expected)
  691. # check group properties
  692. assert len(gr._grouper.groupings) == 1
  693. tm.assert_numpy_array_equal(
  694. gr._grouper.group_info[0], np.array([], dtype=np.dtype(np.intp))
  695. )
  696. tm.assert_numpy_array_equal(
  697. gr._grouper.group_info[1], np.array([], dtype=np.dtype(np.intp))
  698. )
  699. assert gr._grouper.group_info[2] == 0
  700. # check name
  701. gb = s.groupby(s)
  702. msg = "SeriesGroupBy.grouper is deprecated"
  703. with tm.assert_produces_warning(FutureWarning, match=msg):
  704. grouper = gb.grouper
  705. result = grouper.names
  706. expected = ["name"]
  707. assert result == expected
  708. def test_groupby_level_index_value_all_na(self):
  709. # issue 20519
  710. df = DataFrame(
  711. [["x", np.nan, 10], [None, np.nan, 20]], columns=["A", "B", "C"]
  712. ).set_index(["A", "B"])
  713. result = df.groupby(level=["A", "B"]).sum()
  714. expected = DataFrame(
  715. data=[],
  716. index=MultiIndex(
  717. levels=[Index(["x"], dtype="str"), Index([], dtype="float64")],
  718. codes=[[], []],
  719. names=["A", "B"],
  720. ),
  721. columns=["C"],
  722. dtype="int64",
  723. )
  724. tm.assert_frame_equal(result, expected)
  725. def test_groupby_multiindex_level_empty(self):
  726. # https://github.com/pandas-dev/pandas/issues/31670
  727. df = DataFrame(
  728. [[123, "a", 1.0], [123, "b", 2.0]], columns=["id", "category", "value"]
  729. )
  730. df = df.set_index(["id", "category"])
  731. empty = df[df.value < 0]
  732. result = empty.groupby("id").sum()
  733. expected = DataFrame(
  734. dtype="float64",
  735. columns=["value"],
  736. index=Index([], dtype=np.int64, name="id"),
  737. )
  738. tm.assert_frame_equal(result, expected)
  739. # get_group
  740. # --------------------------------
  741. class TestGetGroup:
  742. def test_get_group(self):
  743. # GH 5267
  744. # be datelike friendly
  745. df = DataFrame(
  746. {
  747. "DATE": pd.to_datetime(
  748. [
  749. "10-Oct-2013",
  750. "10-Oct-2013",
  751. "10-Oct-2013",
  752. "11-Oct-2013",
  753. "11-Oct-2013",
  754. "11-Oct-2013",
  755. ]
  756. ),
  757. "label": ["foo", "foo", "bar", "foo", "foo", "bar"],
  758. "VAL": [1, 2, 3, 4, 5, 6],
  759. }
  760. )
  761. g = df.groupby("DATE")
  762. key = next(iter(g.groups))
  763. result1 = g.get_group(key)
  764. result2 = g.get_group(Timestamp(key).to_pydatetime())
  765. result3 = g.get_group(str(Timestamp(key)))
  766. tm.assert_frame_equal(result1, result2)
  767. tm.assert_frame_equal(result1, result3)
  768. g = df.groupby(["DATE", "label"])
  769. key = next(iter(g.groups))
  770. result1 = g.get_group(key)
  771. result2 = g.get_group((Timestamp(key[0]).to_pydatetime(), key[1]))
  772. result3 = g.get_group((str(Timestamp(key[0])), key[1]))
  773. tm.assert_frame_equal(result1, result2)
  774. tm.assert_frame_equal(result1, result3)
  775. # must pass a same-length tuple with multiple keys
  776. msg = "must supply a tuple to get_group with multiple grouping keys"
  777. with pytest.raises(ValueError, match=msg):
  778. g.get_group("foo")
  779. with pytest.raises(ValueError, match=msg):
  780. g.get_group("foo")
  781. msg = "must supply a same-length tuple to get_group with multiple grouping keys"
  782. with pytest.raises(ValueError, match=msg):
  783. g.get_group(("foo", "bar", "baz"))
  784. def test_get_group_empty_bins(self, observed):
  785. d = DataFrame([3, 1, 7, 6])
  786. bins = [0, 5, 10, 15]
  787. g = d.groupby(pd.cut(d[0], bins), observed=observed)
  788. # TODO: should prob allow a str of Interval work as well
  789. # IOW '(0, 5]'
  790. result = g.get_group(pd.Interval(0, 5))
  791. expected = DataFrame([3, 1], index=[0, 1])
  792. tm.assert_frame_equal(result, expected)
  793. msg = r"Interval\(10, 15, closed='right'\)"
  794. with pytest.raises(KeyError, match=msg):
  795. g.get_group(pd.Interval(10, 15))
  796. def test_get_group_grouped_by_tuple(self):
  797. # GH 8121
  798. df = DataFrame([[(1,), (1, 2), (1,), (1, 2)]], index=["ids"]).T
  799. gr = df.groupby("ids")
  800. expected = DataFrame({"ids": [(1,), (1,)]}, index=[0, 2])
  801. result = gr.get_group((1,))
  802. tm.assert_frame_equal(result, expected)
  803. dt = pd.to_datetime(["2010-01-01", "2010-01-02", "2010-01-01", "2010-01-02"])
  804. df = DataFrame({"ids": [(x,) for x in dt]})
  805. gr = df.groupby("ids")
  806. result = gr.get_group(("2010-01-01",))
  807. expected = DataFrame({"ids": [(dt[0],), (dt[0],)]}, index=[0, 2])
  808. tm.assert_frame_equal(result, expected)
  809. def test_get_group_grouped_by_tuple_with_lambda(self):
  810. # GH 36158
  811. df = DataFrame(
  812. {
  813. "Tuples": (
  814. (x, y)
  815. for x in [0, 1]
  816. for y in np.random.default_rng(2).integers(3, 5, 5)
  817. )
  818. }
  819. )
  820. gb = df.groupby("Tuples")
  821. gb_lambda = df.groupby(lambda x: df.iloc[x, 0])
  822. expected = gb.get_group(next(iter(gb.groups.keys())))
  823. result = gb_lambda.get_group(next(iter(gb_lambda.groups.keys())))
  824. tm.assert_frame_equal(result, expected)
  825. def test_groupby_with_empty(self):
  826. index = pd.DatetimeIndex(())
  827. data = ()
  828. series = Series(data, index, dtype=object)
  829. grouper = Grouper(freq="D")
  830. grouped = series.groupby(grouper)
  831. assert next(iter(grouped), None) is None
  832. def test_groupby_with_single_column(self):
  833. df = DataFrame({"a": list("abssbab")})
  834. tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]])
  835. # GH 13530
  836. exp = DataFrame(
  837. index=Index(["a", "b", "s"], name="a"), columns=Index([], dtype="str")
  838. )
  839. tm.assert_frame_equal(df.groupby("a").count(), exp)
  840. tm.assert_frame_equal(df.groupby("a").sum(), exp)
  841. exp = df.iloc[[3, 4, 5]]
  842. tm.assert_frame_equal(df.groupby("a").nth(1), exp)
  843. def test_gb_key_len_equal_axis_len(self):
  844. # GH16843
  845. # test ensures that index and column keys are recognized correctly
  846. # when number of keys equals axis length of groupby
  847. df = DataFrame(
  848. [["foo", "bar", "B", 1], ["foo", "bar", "B", 2], ["foo", "baz", "C", 3]],
  849. columns=["first", "second", "third", "one"],
  850. )
  851. df = df.set_index(["first", "second"])
  852. df = df.groupby(["first", "second", "third"]).size()
  853. assert df.loc[("foo", "bar", "B")] == 2
  854. assert df.loc[("foo", "baz", "C")] == 1
  855. # groups & iteration
  856. # --------------------------------
  857. class TestIteration:
  858. def test_groups(self, df):
  859. grouped = df.groupby(["A"])
  860. groups = grouped.groups
  861. assert groups is grouped.groups # caching works
  862. for k, v in grouped.groups.items():
  863. assert (df.loc[v]["A"] == k).all()
  864. grouped = df.groupby(["A", "B"])
  865. groups = grouped.groups
  866. assert groups is grouped.groups # caching works
  867. for k, v in grouped.groups.items():
  868. assert (df.loc[v]["A"] == k[0]).all()
  869. assert (df.loc[v]["B"] == k[1]).all()
  870. def test_grouping_is_iterable(self, tsframe):
  871. # this code path isn't used anywhere else
  872. # not sure it's useful
  873. grouped = tsframe.groupby([lambda x: x.weekday(), lambda x: x.year])
  874. # test it works
  875. for g in grouped._grouper.groupings[0]:
  876. pass
  877. def test_multi_iter(self):
  878. s = Series(np.arange(6))
  879. k1 = np.array(["a", "a", "a", "b", "b", "b"])
  880. k2 = np.array(["1", "2", "1", "2", "1", "2"])
  881. grouped = s.groupby([k1, k2])
  882. iterated = list(grouped)
  883. expected = [
  884. ("a", "1", s[[0, 2]]),
  885. ("a", "2", s[[1]]),
  886. ("b", "1", s[[4]]),
  887. ("b", "2", s[[3, 5]]),
  888. ]
  889. for i, ((one, two), three) in enumerate(iterated):
  890. e1, e2, e3 = expected[i]
  891. assert e1 == one
  892. assert e2 == two
  893. tm.assert_series_equal(three, e3)
  894. def test_multi_iter_frame(self, three_group):
  895. k1 = np.array(["b", "b", "b", "a", "a", "a"])
  896. k2 = np.array(["1", "2", "1", "2", "1", "2"])
  897. df = DataFrame(
  898. {
  899. "v1": np.random.default_rng(2).standard_normal(6),
  900. "v2": np.random.default_rng(2).standard_normal(6),
  901. "k1": k1,
  902. "k2": k2,
  903. },
  904. index=["one", "two", "three", "four", "five", "six"],
  905. )
  906. grouped = df.groupby(["k1", "k2"])
  907. # things get sorted!
  908. iterated = list(grouped)
  909. idx = df.index
  910. expected = [
  911. ("a", "1", df.loc[idx[[4]]]),
  912. ("a", "2", df.loc[idx[[3, 5]]]),
  913. ("b", "1", df.loc[idx[[0, 2]]]),
  914. ("b", "2", df.loc[idx[[1]]]),
  915. ]
  916. for i, ((one, two), three) in enumerate(iterated):
  917. e1, e2, e3 = expected[i]
  918. assert e1 == one
  919. assert e2 == two
  920. tm.assert_frame_equal(three, e3)
  921. # don't iterate through groups with no data
  922. df["k1"] = np.array(["b", "b", "b", "a", "a", "a"])
  923. df["k2"] = np.array(["1", "1", "1", "2", "2", "2"])
  924. grouped = df.groupby(["k1", "k2"])
  925. # calling `dict` on a DataFrameGroupBy leads to a TypeError,
  926. # we need to use a dictionary comprehension here
  927. # pylint: disable-next=unnecessary-comprehension
  928. groups = {key: gp for key, gp in grouped} # noqa: C416
  929. assert len(groups) == 2
  930. # axis = 1
  931. three_levels = three_group.groupby(["A", "B", "C"]).mean()
  932. depr_msg = "DataFrame.groupby with axis=1 is deprecated"
  933. with tm.assert_produces_warning(FutureWarning, match=depr_msg):
  934. grouped = three_levels.T.groupby(axis=1, level=(1, 2))
  935. for key, group in grouped:
  936. pass
  937. def test_dictify(self, df):
  938. dict(iter(df.groupby("A")))
  939. dict(iter(df.groupby(["A", "B"])))
  940. dict(iter(df["C"].groupby(df["A"])))
  941. dict(iter(df["C"].groupby([df["A"], df["B"]])))
  942. dict(iter(df.groupby("A")["C"]))
  943. dict(iter(df.groupby(["A", "B"])["C"]))
  944. def test_groupby_with_small_elem(self):
  945. # GH 8542
  946. # length=2
  947. df = DataFrame(
  948. {"event": ["start", "start"], "change": [1234, 5678]},
  949. index=pd.DatetimeIndex(["2014-09-10", "2013-10-10"]),
  950. )
  951. grouped = df.groupby([Grouper(freq="ME"), "event"])
  952. assert len(grouped.groups) == 2
  953. assert grouped.ngroups == 2
  954. assert (Timestamp("2014-09-30"), "start") in grouped.groups
  955. assert (Timestamp("2013-10-31"), "start") in grouped.groups
  956. res = grouped.get_group((Timestamp("2014-09-30"), "start"))
  957. tm.assert_frame_equal(res, df.iloc[[0], :])
  958. res = grouped.get_group((Timestamp("2013-10-31"), "start"))
  959. tm.assert_frame_equal(res, df.iloc[[1], :])
  960. df = DataFrame(
  961. {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]},
  962. index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-09-15"]),
  963. )
  964. grouped = df.groupby([Grouper(freq="ME"), "event"])
  965. assert len(grouped.groups) == 2
  966. assert grouped.ngroups == 2
  967. assert (Timestamp("2014-09-30"), "start") in grouped.groups
  968. assert (Timestamp("2013-10-31"), "start") in grouped.groups
  969. res = grouped.get_group((Timestamp("2014-09-30"), "start"))
  970. tm.assert_frame_equal(res, df.iloc[[0, 2], :])
  971. res = grouped.get_group((Timestamp("2013-10-31"), "start"))
  972. tm.assert_frame_equal(res, df.iloc[[1], :])
  973. # length=3
  974. df = DataFrame(
  975. {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]},
  976. index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-08-05"]),
  977. )
  978. grouped = df.groupby([Grouper(freq="ME"), "event"])
  979. assert len(grouped.groups) == 3
  980. assert grouped.ngroups == 3
  981. assert (Timestamp("2014-09-30"), "start") in grouped.groups
  982. assert (Timestamp("2013-10-31"), "start") in grouped.groups
  983. assert (Timestamp("2014-08-31"), "start") in grouped.groups
  984. res = grouped.get_group((Timestamp("2014-09-30"), "start"))
  985. tm.assert_frame_equal(res, df.iloc[[0], :])
  986. res = grouped.get_group((Timestamp("2013-10-31"), "start"))
  987. tm.assert_frame_equal(res, df.iloc[[1], :])
  988. res = grouped.get_group((Timestamp("2014-08-31"), "start"))
  989. tm.assert_frame_equal(res, df.iloc[[2], :])
  990. def test_grouping_string_repr(self):
  991. # GH 13394
  992. mi = MultiIndex.from_arrays([list("AAB"), list("aba")])
  993. df = DataFrame([[1, 2, 3]], columns=mi)
  994. gr = df.groupby(df[("A", "a")])
  995. result = gr._grouper.groupings[0].__repr__()
  996. expected = "Grouping(('A', 'a'))"
  997. assert result == expected
  998. def test_grouping_by_key_is_in_axis():
  999. # GH#50413 - Groupers specified by key are in-axis
  1000. df = DataFrame({"a": [1, 1, 2], "b": [1, 1, 2], "c": [3, 4, 5]}).set_index("a")
  1001. gb = df.groupby([Grouper(level="a"), Grouper(key="b")], as_index=False)
  1002. assert not gb._grouper.groupings[0].in_axis
  1003. assert gb._grouper.groupings[1].in_axis
  1004. # Currently only in-axis groupings are including in the result when as_index=False;
  1005. # This is likely to change in the future.
  1006. msg = "A grouping .* was excluded from the result"
  1007. with tm.assert_produces_warning(FutureWarning, match=msg):
  1008. result = gb.sum()
  1009. expected = DataFrame({"b": [1, 2], "c": [7, 5]})
  1010. tm.assert_frame_equal(result, expected)
  1011. def test_grouper_groups():
  1012. # GH#51182 check Grouper.groups does not raise AttributeError
  1013. df = DataFrame({"a": [1, 2, 3], "b": 1})
  1014. grper = Grouper(key="a")
  1015. gb = df.groupby(grper)
  1016. msg = "Use GroupBy.groups instead"
  1017. with tm.assert_produces_warning(FutureWarning, match=msg):
  1018. res = grper.groups
  1019. assert res is gb.groups
  1020. msg = "Use GroupBy.grouper instead"
  1021. with tm.assert_produces_warning(FutureWarning, match=msg):
  1022. res = grper.grouper
  1023. assert res is gb._grouper
  1024. msg = "Grouper.obj is deprecated and will be removed"
  1025. with tm.assert_produces_warning(FutureWarning, match=msg):
  1026. res = grper.obj
  1027. assert res is gb.obj
  1028. msg = "Use Resampler.ax instead"
  1029. with tm.assert_produces_warning(FutureWarning, match=msg):
  1030. grper.ax
  1031. msg = "Grouper.indexer is deprecated"
  1032. with tm.assert_produces_warning(FutureWarning, match=msg):
  1033. grper.indexer
  1034. @pytest.mark.parametrize("attr", ["group_index", "result_index", "group_arraylike"])
  1035. def test_depr_grouping_attrs(attr):
  1036. # GH#56148
  1037. df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
  1038. gb = df.groupby("a")
  1039. msg = f"{attr} is deprecated"
  1040. with tm.assert_produces_warning(FutureWarning, match=msg):
  1041. getattr(gb._grouper.groupings[0], attr)