test_groupby.py 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318
  1. import numpy as np
  2. import pytest
  3. from pandas import (
  4. DataFrame,
  5. DatetimeIndex,
  6. Index,
  7. MultiIndex,
  8. Series,
  9. Timestamp,
  10. date_range,
  11. to_datetime,
  12. )
  13. import pandas._testing as tm
  14. from pandas.api.indexers import BaseIndexer
  15. from pandas.core.groupby.groupby import get_groupby
  @pytest.fixture
  def times_frame():
      """
      Frame for testing the times argument in EWM groupby.

      Ten rows: string labels in ``A``, integers in ``B`` and datetimes
      (parsed with ``to_datetime``) in ``C``.
      """
      labels = ["a", "b", "c"] * 3 + ["a"]
      counters = [0] * 3 + [1] * 3 + [2] * 3 + [3]
      stamps = to_datetime(
          [
              "2020-01-01",
              "2020-01-01",
              "2020-01-01",
              "2020-01-02",
              "2020-01-10",
              "2020-01-22",
              "2020-01-03",
              "2020-01-23",
              "2020-01-23",
              "2020-01-04",
          ]
      )
      return DataFrame({"A": labels, "B": counters, "C": stamps})
  @pytest.fixture
  def roll_frame():
      """
      Frame of 40 rows used by the rolling tests.

      ``A`` holds 20 ones, 12 twos and 8 threes; ``B`` is ``np.arange(40)``.
      """
      keys = [1] * 20 + [2] * 12 + [3] * 8
      values = np.arange(40)
      return DataFrame({"A": keys, "B": values})
  42. class TestRolling:
  def test_groupby_unsupported_argument(self, roll_frame):
      """Passing an unknown keyword to ``groupby`` raises ``TypeError``."""
      unknown_kwargs = {"foo": 1}
      expected_msg = r"groupby\(\) got an unexpected keyword argument 'foo'"
      with pytest.raises(TypeError, match=expected_msg):
          roll_frame.groupby("A", **unknown_kwargs)
  47. def test_getitem(self, roll_frame):
  48. g = roll_frame.groupby("A")
  49. g_mutated = get_groupby(roll_frame, by="A")
  50. expected = g_mutated.B.apply(lambda x: x.rolling(2).mean())
  51. result = g.rolling(2).mean().B
  52. tm.assert_series_equal(result, expected)
  53. result = g.rolling(2).B.mean()
  54. tm.assert_series_equal(result, expected)
  55. result = g.B.rolling(2).mean()
  56. tm.assert_series_equal(result, expected)
  57. result = roll_frame.B.groupby(roll_frame.A).rolling(2).mean()
  58. tm.assert_series_equal(result, expected)
  59. def test_getitem_multiple(self, roll_frame):
  60. # GH 13174
  61. g = roll_frame.groupby("A")
  62. r = g.rolling(2, min_periods=0)
  63. g_mutated = get_groupby(roll_frame, by="A")
  64. expected = g_mutated.B.apply(lambda x: x.rolling(2, min_periods=0).count())
  65. result = r.B.count()
  66. tm.assert_series_equal(result, expected)
  67. result = r.B.count()
  68. tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize(
      "f",
      ["sum", "mean", "min", "max", "count", "kurt", "skew"],
  )
  def test_rolling(self, f, roll_frame):
      """Grouped rolling aggregation ``f`` matches applying it group by group."""
      grouped = roll_frame.groupby("A", group_keys=False)
      result = getattr(grouped.rolling(window=4), f)()

      def per_group(sub):
          return getattr(sub.rolling(4), f)()

      warn_msg = "DataFrameGroupBy.apply operated on the grouping columns"
      with tm.assert_produces_warning(FutureWarning, match=warn_msg):
          applied = grouped.apply(per_group)
      # groupby.apply doesn't drop the grouped-by column
      expected = applied.drop("A", axis=1)
      # GH 39732
      expected.index = MultiIndex.from_arrays([roll_frame["A"], range(40)])
      tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize("f", ["std", "var"])
  def test_rolling_ddof(self, f, roll_frame):
      """Grouped rolling ``std``/``var`` with ``ddof=1`` matches the per-group result."""
      grouped = roll_frame.groupby("A", group_keys=False)
      result = getattr(grouped.rolling(window=4), f)(ddof=1)

      def per_group(sub):
          return getattr(sub.rolling(4), f)(ddof=1)

      warn_msg = "DataFrameGroupBy.apply operated on the grouping columns"
      with tm.assert_produces_warning(FutureWarning, match=warn_msg):
          applied = grouped.apply(per_group)
      # groupby.apply doesn't drop the grouped-by column
      expected = applied.drop("A", axis=1)
      # GH 39732
      expected.index = MultiIndex.from_arrays([roll_frame["A"], range(40)])
      tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize(
      "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"]
  )
  def test_rolling_quantile(self, interpolation, roll_frame):
      """Grouped rolling 0.4-quantile matches the per-group result for each interpolation."""
      grouped = roll_frame.groupby("A", group_keys=False)
      roller = grouped.rolling(window=4)
      result = roller.quantile(0.4, interpolation=interpolation)

      def per_group(sub):
          return sub.rolling(4).quantile(0.4, interpolation=interpolation)

      warn_msg = "DataFrameGroupBy.apply operated on the grouping columns"
      with tm.assert_produces_warning(FutureWarning, match=warn_msg):
          applied = grouped.apply(per_group)
      # groupby.apply doesn't drop the grouped-by column
      expected = applied.drop("A", axis=1)
      # GH 39732
      expected.index = MultiIndex.from_arrays([roll_frame["A"], range(40)])
      tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize("f, expected_val", [["corr", 1], ["cov", 0.5]])
  def test_rolling_corr_cov_other_same_size_as_groups(self, f, expected_val):
      """corr/cov against an ``other`` frame that has as many rows as each group."""
      # GH 42915
      outer_keys = [1] * 5 + [2] * 5
      inner_keys = [1, 2, 3, 4, 5]

      df = DataFrame(
          {"value": range(10), "idx1": outer_keys, "idx2": inner_keys * 2}
      ).set_index(["idx1", "idx2"])
      other = DataFrame({"value": range(5), "idx2": inner_keys}).set_index("idx2")

      roller = df.groupby(level=0).rolling(2)
      result = getattr(roller, f)(other)

      # The group key is prepended to the original two index levels.
      expected_index = MultiIndex.from_arrays(
          [outer_keys, outer_keys, list(range(1, 6)) * 2],
          names=["idx1", "idx1", "idx2"],
      )
      expected = DataFrame(
          ([np.nan] + [expected_val] * 4) * 2,
          columns=["value"],
          index=expected_index,
      )
      tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize("f", ["corr", "cov"])
  def test_rolling_corr_cov_other_diff_size_as_groups(self, f, roll_frame):
      """corr/cov against the full frame, whose size differs from each group's."""
      grouped = roll_frame.groupby("A")
      result = getattr(grouped.rolling(window=4), f)(roll_frame)

      warn_msg = "DataFrameGroupBy.apply operated on the grouping columns"
      with tm.assert_produces_warning(FutureWarning, match=warn_msg):
          expected = grouped.apply(
              lambda sub: getattr(sub.rolling(4), f)(roll_frame)
          )
      # GH 39591: The grouped column should be all np.nan
      # (groupby.apply inserts 0s for cov)
      expected["A"] = np.nan
      tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize("f", ["corr", "cov"])
  def test_rolling_corr_cov_pairwise(self, f, roll_frame):
      """Pairwise corr/cov on column ``B`` matches the per-group computation."""
      grouped = roll_frame.groupby("A")
      roller = grouped.rolling(window=4)
      result = getattr(roller.B, f)(pairwise=True)

      warn_msg = "DataFrameGroupBy.apply operated on the grouping columns"
      with tm.assert_produces_warning(FutureWarning, match=warn_msg):
          expected = grouped.apply(
              lambda sub: getattr(sub.B.rolling(4), f)(pairwise=True)
          )
      tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize(
      "func, expected_values",
      [("cov", [[1.0, 1.0], [1.0, 4.0]]), ("corr", [[1.0, 0.5], [0.5, 1.0]])],
  )
  def test_rolling_corr_cov_unordered(self, func, expected_values):
      """corr/cov when the rows of group ``g1`` are not contiguous in the frame."""
      # GH 43386
      df = DataFrame(
          {
              "a": ["g1", "g2", "g1", "g1"],
              "b": [0, 0, 1, 2],
              "c": [2, 0, 6, 4],
          }
      )
      result = getattr(df.groupby("a").rolling(3), func)()

      leading = [np.nan] * 4
      trailing = [np.nan] * 2
      # One ("b", "c") pair of rows per original row, ordered by group.
      row_keys = [("g1", 0), ("g1", 2), ("g1", 3), ("g2", 1)]
      index_tuples = [
          (group, position, column)
          for group, position in row_keys
          for column in ("b", "c")
      ]
      expected = DataFrame(
          {
              "b": leading + expected_values[0] + trailing,
              "c": leading + expected_values[1] + trailing,
          },
          index=MultiIndex.from_tuples(index_tuples, names=["a", None, None]),
      )
      tm.assert_frame_equal(result, expected)
  210. def test_rolling_apply(self, raw, roll_frame):
  211. g = roll_frame.groupby("A", group_keys=False)
  212. r = g.rolling(window=4)
  213. # reduction
  214. result = r.apply(lambda x: x.sum(), raw=raw)
  215. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  216. with tm.assert_produces_warning(FutureWarning, match=msg):
  217. expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw))
  218. # groupby.apply doesn't drop the grouped-by column
  219. expected = expected.drop("A", axis=1)
  220. # GH 39732
  221. expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)])
  222. expected.index = expected_index
  223. tm.assert_frame_equal(result, expected)
  224. def test_rolling_apply_mutability(self):
  225. # GH 14013
  226. df = DataFrame({"A": ["foo"] * 3 + ["bar"] * 3, "B": [1] * 6})
  227. g = df.groupby("A")
  228. mi = MultiIndex.from_tuples(
  229. [("bar", 3), ("bar", 4), ("bar", 5), ("foo", 0), ("foo", 1), ("foo", 2)]
  230. )
  231. mi.names = ["A", None]
  232. # Grouped column should not be a part of the output
  233. expected = DataFrame([np.nan, 2.0, 2.0] * 2, columns=["B"], index=mi)
  234. result = g.rolling(window=2).sum()
  235. tm.assert_frame_equal(result, expected)
  236. # Call an arbitrary function on the groupby
  237. g.sum()
  238. # Make sure nothing has been mutated
  239. result = g.rolling(window=2).sum()
  240. tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize("expected_value,raw_value", [[1.0, True], [0.0, False]])
  def test_groupby_rolling(self, expected_value, raw_value):
      """The applied function receives an ndarray only when ``raw`` is True."""
      # GH 31754

      def isnumpyarray(x):
          # 1 when the window arrives as an ndarray, 0 otherwise
          return 1 if isinstance(x, np.ndarray) else 0

      df = DataFrame({"id": [1, 1, 1], "value": [1, 2, 3]})
      roller = df.groupby("id").value.rolling(1)
      result = roller.apply(isnumpyarray, raw=raw_value)

      expected_index = MultiIndex.from_tuples(
          [(1, 0), (1, 1), (1, 2)], names=["id", None]
      )
      expected = Series([expected_value] * 3, index=expected_index, name="value")
      tm.assert_series_equal(result, expected)
  254. def test_groupby_rolling_center_center(self):
  255. # GH 35552
  256. series = Series(range(1, 6))
  257. result = series.groupby(series).rolling(center=True, window=3).mean()
  258. expected = Series(
  259. [np.nan] * 5,
  260. index=MultiIndex.from_tuples(((1, 0), (2, 1), (3, 2), (4, 3), (5, 4))),
  261. )
  262. tm.assert_series_equal(result, expected)
  263. series = Series(range(1, 5))
  264. result = series.groupby(series).rolling(center=True, window=3).mean()
  265. expected = Series(
  266. [np.nan] * 4,
  267. index=MultiIndex.from_tuples(((1, 0), (2, 1), (3, 2), (4, 3))),
  268. )
  269. tm.assert_series_equal(result, expected)
  270. df = DataFrame({"a": ["a"] * 5 + ["b"] * 6, "b": range(11)})
  271. result = df.groupby("a").rolling(center=True, window=3).mean()
  272. expected = DataFrame(
  273. [np.nan, 1, 2, 3, np.nan, np.nan, 6, 7, 8, 9, np.nan],
  274. index=MultiIndex.from_tuples(
  275. (
  276. ("a", 0),
  277. ("a", 1),
  278. ("a", 2),
  279. ("a", 3),
  280. ("a", 4),
  281. ("b", 5),
  282. ("b", 6),
  283. ("b", 7),
  284. ("b", 8),
  285. ("b", 9),
  286. ("b", 10),
  287. ),
  288. names=["a", None],
  289. ),
  290. columns=["b"],
  291. )
  292. tm.assert_frame_equal(result, expected)
  293. df = DataFrame({"a": ["a"] * 5 + ["b"] * 5, "b": range(10)})
  294. result = df.groupby("a").rolling(center=True, window=3).mean()
  295. expected = DataFrame(
  296. [np.nan, 1, 2, 3, np.nan, np.nan, 6, 7, 8, np.nan],
  297. index=MultiIndex.from_tuples(
  298. (
  299. ("a", 0),
  300. ("a", 1),
  301. ("a", 2),
  302. ("a", 3),
  303. ("a", 4),
  304. ("b", 5),
  305. ("b", 6),
  306. ("b", 7),
  307. ("b", 8),
  308. ("b", 9),
  309. ),
  310. names=["a", None],
  311. ),
  312. columns=["b"],
  313. )
  314. tm.assert_frame_equal(result, expected)
  315. def test_groupby_rolling_center_on(self):
  316. # GH 37141
  317. df = DataFrame(
  318. data={
  319. "Date": date_range("2020-01-01", "2020-01-10"),
  320. "gb": ["group_1"] * 6 + ["group_2"] * 4,
  321. "value": range(10),
  322. }
  323. )
  324. result = (
  325. df.groupby("gb")
  326. .rolling(6, on="Date", center=True, min_periods=1)
  327. .value.mean()
  328. )
  329. mi = MultiIndex.from_arrays([df["gb"], df["Date"]], names=["gb", "Date"])
  330. expected = Series(
  331. [1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 7.0, 7.5, 7.5, 7.5],
  332. name="value",
  333. index=mi,
  334. )
  335. tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize("min_periods", [5, 4, 3])
  def test_groupby_rolling_center_min_periods(self, min_periods):
      """Centered grouped rolling mean has NaN at each group's edges per ``min_periods``."""
      # GH 36040
      window_size = 5
      group_labels = ["A"] * 10 + ["B"] * 10
      df = DataFrame({"group": group_labels, "data": range(20)})

      rolled = (
          df.groupby("group")
          .rolling(window_size, center=True, min_periods=min_periods)
          .mean()
      )
      result = rolled.reset_index()[["group", "data"]]

      means_a = [1.0, 1.5, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 7.5, 8.0]
      means_b = [value + 10.0 for value in means_a]

      num_nans = max(0, min_periods - 3)  # For window_size of 5

      def blank_edges(means):
          # Replace the first and last ``num_nans`` entries with NaN.
          padding = [np.nan] * num_nans
          return padding + means[num_nans : 10 - num_nans] + padding

      expected = DataFrame(
          {
              "group": group_labels,
              "data": blank_edges(means_a) + blank_edges(means_b),
          }
      )
      tm.assert_frame_equal(result, expected)
  357. def test_groupby_subselect_rolling(self):
  358. # GH 35486
  359. df = DataFrame(
  360. {"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0], "c": [10, 20, 30, 20]}
  361. )
  362. result = df.groupby("a")[["b"]].rolling(2).max()
  363. expected = DataFrame(
  364. [np.nan, np.nan, 2.0, np.nan],
  365. columns=["b"],
  366. index=MultiIndex.from_tuples(
  367. ((1, 0), (2, 1), (2, 3), (3, 2)), names=["a", None]
  368. ),
  369. )
  370. tm.assert_frame_equal(result, expected)
  371. result = df.groupby("a")["b"].rolling(2).max()
  372. expected = Series(
  373. [np.nan, np.nan, 2.0, np.nan],
  374. index=MultiIndex.from_tuples(
  375. ((1, 0), (2, 1), (2, 3), (3, 2)), names=["a", None]
  376. ),
  377. name="b",
  378. )
  379. tm.assert_series_equal(result, expected)
  380. def test_groupby_rolling_custom_indexer(self):
  381. # GH 35557
  382. class SimpleIndexer(BaseIndexer):
  383. def get_window_bounds(
  384. self,
  385. num_values=0,
  386. min_periods=None,
  387. center=None,
  388. closed=None,
  389. step=None,
  390. ):
  391. min_periods = self.window_size if min_periods is None else 0
  392. end = np.arange(num_values, dtype=np.int64) + 1
  393. start = end.copy() - self.window_size
  394. start[start < 0] = min_periods
  395. return start, end
  396. df = DataFrame(
  397. {"a": [1.0, 2.0, 3.0, 4.0, 5.0] * 3}, index=[0] * 5 + [1] * 5 + [2] * 5
  398. )
  399. result = (
  400. df.groupby(df.index)
  401. .rolling(SimpleIndexer(window_size=3), min_periods=1)
  402. .sum()
  403. )
  404. expected = df.groupby(df.index).rolling(window=3, min_periods=1).sum()
  405. tm.assert_frame_equal(result, expected)
  406. def test_groupby_rolling_subset_with_closed(self):
  407. # GH 35549
  408. df = DataFrame(
  409. {
  410. "column1": range(8),
  411. "column2": range(8),
  412. "group": ["A"] * 4 + ["B"] * 4,
  413. "date": [
  414. Timestamp(date)
  415. for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"]
  416. ]
  417. * 2,
  418. }
  419. )
  420. result = (
  421. df.groupby("group").rolling("1D", on="date", closed="left")["column1"].sum()
  422. )
  423. expected = Series(
  424. [np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0],
  425. index=MultiIndex.from_frame(
  426. df[["group", "date"]],
  427. names=["group", "date"],
  428. ),
  429. name="column1",
  430. )
  431. tm.assert_series_equal(result, expected)
  432. def test_groupby_subset_rolling_subset_with_closed(self):
  433. # GH 35549
  434. df = DataFrame(
  435. {
  436. "column1": range(8),
  437. "column2": range(8),
  438. "group": ["A"] * 4 + ["B"] * 4,
  439. "date": [
  440. Timestamp(date)
  441. for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"]
  442. ]
  443. * 2,
  444. }
  445. )
  446. result = (
  447. df.groupby("group")[["column1", "date"]]
  448. .rolling("1D", on="date", closed="left")["column1"]
  449. .sum()
  450. )
  451. expected = Series(
  452. [np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0],
  453. index=MultiIndex.from_frame(
  454. df[["group", "date"]],
  455. names=["group", "date"],
  456. ),
  457. name="column1",
  458. )
  459. tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize("func", ["max", "min"])
  def test_groupby_rolling_index_changed(self, func):
      """The group key is prepended to the Series' existing two index levels."""
      # GH: #36018 nlevels of MultiIndex changed
      source_index = MultiIndex.from_tuples(
          [("a", "x"), ("a", "y"), ("c", "z")], names=["1", "2"]
      )
      ds = Series([1, 2, 2], index=source_index, name="a")

      roller = ds.groupby(ds).rolling(2)
      result = getattr(roller, func)()

      expected_index = MultiIndex.from_tuples(
          [(1, "a", "x"), (2, "a", "y"), (2, "c", "z")], names=["a", "1", "2"]
      )
      expected = Series([np.nan, np.nan, 2.0], index=expected_index, name="a")
      tm.assert_series_equal(result, expected)
  479. def test_groupby_rolling_empty_frame(self):
  480. # GH 36197
  481. expected = DataFrame({"s1": []})
  482. result = expected.groupby("s1").rolling(window=1).sum()
  483. # GH 32262
  484. expected = expected.drop(columns="s1")
  485. # GH-38057 from_tuples gives empty object dtype, we now get float/int levels
  486. # expected.index = MultiIndex.from_tuples([], names=["s1", None])
  487. expected.index = MultiIndex.from_product(
  488. [Index([], dtype="float64"), Index([], dtype="int64")], names=["s1", None]
  489. )
  490. tm.assert_frame_equal(result, expected)
  491. expected = DataFrame({"s1": [], "s2": []})
  492. result = expected.groupby(["s1", "s2"]).rolling(window=1).sum()
  493. # GH 32262
  494. expected = expected.drop(columns=["s1", "s2"])
  495. expected.index = MultiIndex.from_product(
  496. [
  497. Index([], dtype="float64"),
  498. Index([], dtype="float64"),
  499. Index([], dtype="int64"),
  500. ],
  501. names=["s1", "s2", None],
  502. )
  503. tm.assert_frame_equal(result, expected)
  504. def test_groupby_rolling_string_index(self):
  505. # GH: 36727
  506. df = DataFrame(
  507. [
  508. ["A", "group_1", Timestamp(2019, 1, 1, 9)],
  509. ["B", "group_1", Timestamp(2019, 1, 2, 9)],
  510. ["Z", "group_2", Timestamp(2019, 1, 3, 9)],
  511. ["H", "group_1", Timestamp(2019, 1, 6, 9)],
  512. ["E", "group_2", Timestamp(2019, 1, 20, 9)],
  513. ],
  514. columns=["index", "group", "eventTime"],
  515. ).set_index("index")
  516. groups = df.groupby("group")
  517. df["count_to_date"] = groups.cumcount()
  518. rolling_groups = groups.rolling("10d", on="eventTime")
  519. result = rolling_groups.apply(lambda df: df.shape[0])
  520. expected = DataFrame(
  521. [
  522. ["A", "group_1", Timestamp(2019, 1, 1, 9), 1.0],
  523. ["B", "group_1", Timestamp(2019, 1, 2, 9), 2.0],
  524. ["H", "group_1", Timestamp(2019, 1, 6, 9), 3.0],
  525. ["Z", "group_2", Timestamp(2019, 1, 3, 9), 1.0],
  526. ["E", "group_2", Timestamp(2019, 1, 20, 9), 1.0],
  527. ],
  528. columns=["index", "group", "eventTime", "count_to_date"],
  529. ).set_index(["group", "index"])
  530. tm.assert_frame_equal(result, expected)
  531. def test_groupby_rolling_no_sort(self):
  532. # GH 36889
  533. result = (
  534. DataFrame({"foo": [2, 1], "bar": [2, 1]})
  535. .groupby("foo", sort=False)
  536. .rolling(1)
  537. .min()
  538. )
  539. expected = DataFrame(
  540. np.array([[2.0, 2.0], [1.0, 1.0]]),
  541. columns=["foo", "bar"],
  542. index=MultiIndex.from_tuples([(2, 0), (1, 1)], names=["foo", None]),
  543. )
  544. # GH 32262
  545. expected = expected.drop(columns="foo")
  546. tm.assert_frame_equal(result, expected)
  547. def test_groupby_rolling_count_closed_on(self, unit):
  548. # GH 35869
  549. df = DataFrame(
  550. {
  551. "column1": range(6),
  552. "column2": range(6),
  553. "group": 3 * ["A", "B"],
  554. "date": date_range(end="20190101", periods=6, unit=unit),
  555. }
  556. )
  557. result = (
  558. df.groupby("group")
  559. .rolling("3d", on="date", closed="left")["column1"]
  560. .count()
  561. )
  562. dti = DatetimeIndex(
  563. [
  564. "2018-12-27",
  565. "2018-12-29",
  566. "2018-12-31",
  567. "2018-12-28",
  568. "2018-12-30",
  569. "2019-01-01",
  570. ],
  571. dtype=f"M8[{unit}]",
  572. )
  573. mi = MultiIndex.from_arrays(
  574. [
  575. ["A", "A", "A", "B", "B", "B"],
  576. dti,
  577. ],
  578. names=["group", "date"],
  579. )
  580. expected = Series(
  581. [np.nan, 1.0, 1.0, np.nan, 1.0, 1.0],
  582. name="column1",
  583. index=mi,
  584. )
  585. tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize(
      ("func", "kwargs"),
      [("rolling", {"window": 2, "min_periods": 1}), ("expanding", {})],
  )
  def test_groupby_rolling_sem(self, func, kwargs):
      """``sem`` on grouped rolling and expanding windows."""
      # GH: 26476
      rows = [["a", 1], ["a", 2], ["b", 1], ["b", 2], ["b", 3]]
      df = DataFrame(rows, columns=["a", "b"])

      windowed = getattr(df.groupby("a"), func)(**kwargs)
      result = windowed.sem()

      expected_index = MultiIndex.from_tuples(
          [("a", 0), ("a", 1), ("b", 2), ("b", 3), ("b", 4)], names=["a", None]
      )
      with_key = DataFrame(
          {"a": [np.nan] * 5, "b": [np.nan, 0.70711, np.nan, 0.70711, 0.70711]},
          index=expected_index,
      )
      # GH 32262
      expected = with_key.drop(columns="a")
      tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize(
      ("rollings", "key"), [({"on": "a"}, "a"), ({"on": None}, "index")]
  )
  def test_groupby_rolling_nans_in_index(self, rollings, key):
      """A NaT in the ``on`` column or in the index raises ``ValueError``."""
      # GH: 34617
      stamps = to_datetime(["2020-06-01 12:00", "2020-06-01 14:00", np.nan])
      df = DataFrame({"a": stamps, "b": [1, 2, 3], "c": [1, 1, 1]})
      if key == "index":
          df = df.set_index("a")

      expected_msg = f"{key} values must not have NaT"
      with pytest.raises(ValueError, match=expected_msg):
          df.groupby("c").rolling("60min", **rollings)
  @pytest.mark.parametrize("group_keys", [True, False])
  def test_groupby_rolling_group_keys(self, group_keys):
      """The result is the same whether ``group_keys`` is True or False."""
      # GH 37641
      # GH 38523: GH 37641 actually was not a bug.
      # group_keys only applies to groupby.apply directly
      level_values = ["val1", "val1", "val2"]
      index = MultiIndex.from_arrays(
          [level_values, level_values], names=("idx1", "idx2")
      )
      s = Series([1, 2, 3], index=index)

      grouped = s.groupby(["idx1", "idx2"], group_keys=group_keys)
      result = grouped.rolling(1).mean()

      # Both group keys are prepended to the two original index levels.
      expected_index = MultiIndex.from_tuples(
          [(value,) * 4 for value in level_values],
          names=["idx1", "idx2", "idx1", "idx2"],
      )
      expected = Series([1.0, 2.0, 3.0], index=expected_index)
      tm.assert_series_equal(result, expected)
  642. def test_groupby_rolling_index_level_and_column_label(self):
  643. # The groupby keys should not appear as a resulting column
  644. arrays = [["val1", "val1", "val2"], ["val1", "val1", "val2"]]
  645. index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2"))
  646. df = DataFrame({"A": [1, 1, 2], "B": range(3)}, index=index)
  647. result = df.groupby(["idx1", "A"]).rolling(1).mean()
  648. expected = DataFrame(
  649. {"B": [0.0, 1.0, 2.0]},
  650. index=MultiIndex.from_tuples(
  651. [
  652. ("val1", 1, "val1", "val1"),
  653. ("val1", 1, "val1", "val1"),
  654. ("val2", 2, "val2", "val2"),
  655. ],
  656. names=["idx1", "A", "idx1", "idx2"],
  657. ),
  658. )
  659. tm.assert_frame_equal(result, expected)
  660. def test_groupby_rolling_resulting_multiindex(self):
  661. # a few different cases checking the created MultiIndex of the result
  662. # https://github.com/pandas-dev/pandas/pull/38057
  663. # grouping by 1 columns -> 2-level MI as result
  664. df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4})
  665. result = df.groupby("b").rolling(3).mean()
  666. expected_index = MultiIndex.from_tuples(
  667. [(1, 0), (1, 2), (1, 4), (1, 6), (2, 1), (2, 3), (2, 5), (2, 7)],
  668. names=["b", None],
  669. )
  670. tm.assert_index_equal(result.index, expected_index)
  671. def test_groupby_rolling_resulting_multiindex2(self):
  672. # grouping by 2 columns -> 3-level MI as result
  673. df = DataFrame({"a": np.arange(12.0), "b": [1, 2] * 6, "c": [1, 2, 3, 4] * 3})
  674. result = df.groupby(["b", "c"]).rolling(2).sum()
  675. expected_index = MultiIndex.from_tuples(
  676. [
  677. (1, 1, 0),
  678. (1, 1, 4),
  679. (1, 1, 8),
  680. (1, 3, 2),
  681. (1, 3, 6),
  682. (1, 3, 10),
  683. (2, 2, 1),
  684. (2, 2, 5),
  685. (2, 2, 9),
  686. (2, 4, 3),
  687. (2, 4, 7),
  688. (2, 4, 11),
  689. ],
  690. names=["b", "c", None],
  691. )
  692. tm.assert_index_equal(result.index, expected_index)
  693. def test_groupby_rolling_resulting_multiindex3(self):
  694. # grouping with 1 level on dataframe with 2-level MI -> 3-level MI as result
  695. df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4, "c": [1, 2, 3, 4] * 2})
  696. df = df.set_index("c", append=True)
  697. result = df.groupby("b").rolling(3).mean()
  698. expected_index = MultiIndex.from_tuples(
  699. [
  700. (1, 0, 1),
  701. (1, 2, 3),
  702. (1, 4, 1),
  703. (1, 6, 3),
  704. (2, 1, 2),
  705. (2, 3, 4),
  706. (2, 5, 2),
  707. (2, 7, 4),
  708. ],
  709. names=["b", None, "c"],
  710. )
  711. tm.assert_index_equal(result.index, expected_index, exact="equiv")
  712. def test_groupby_rolling_object_doesnt_affect_groupby_apply(self, roll_frame):
  713. # GH 39732
  714. g = roll_frame.groupby("A", group_keys=False)
  715. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  716. with tm.assert_produces_warning(FutureWarning, match=msg):
  717. expected = g.apply(lambda x: x.rolling(4).sum()).index
  718. _ = g.rolling(window=4)
  719. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  720. with tm.assert_produces_warning(FutureWarning, match=msg):
  721. result = g.apply(lambda x: x.rolling(4).sum()).index
  722. tm.assert_index_equal(result, expected)
  @pytest.mark.parametrize(
      ("window", "min_periods", "closed", "expected"),
      [
          (2, 0, "left", [None, 0.0, 1.0, 1.0, None, 0.0, 1.0, 1.0]),
          (2, 2, "left", [None, None, 1.0, 1.0, None, None, 1.0, 1.0]),
          (4, 4, "left", [None, None, None, None, None, None, None, None]),
          (4, 4, "right", [None, None, None, 5.0, None, None, None, 5.0]),
      ],
  )
  def test_groupby_rolling_var(self, window, min_periods, closed, expected):
      """Grouped rolling ``var(0)`` for several window/min_periods/closed settings."""
      df = DataFrame(list(range(1, 9)))
      group_keys = [1, 2] * 4
      roller = df.groupby(group_keys).rolling(
          window=window, min_periods=min_periods, closed=closed
      )
      result = roller.var(0)

      expected_index = MultiIndex(
          levels=[np.array([1, 2]), list(range(8))],
          codes=[[0] * 4 + [1] * 4, [0, 2, 4, 6, 1, 3, 5, 7]],
      )
      expected_result = DataFrame(
          np.array(expected, dtype="float64"),
          index=expected_index,
      )
      tm.assert_frame_equal(result, expected_result)
  @pytest.mark.parametrize(
      "columns", [MultiIndex.from_tuples([("A", ""), ("B", "C")]), ["A", "B"]]
  )
  def test_by_column_not_in_values(self, columns):
      """The grouping column is absent from the result and the source frame is untouched."""
      # GH 32262
      rows = [[1, 0]] * 20 + [[2, 0]] * 12 + [[3, 0]] * 8
      df = DataFrame(rows, columns=columns)
      grouped = df.groupby("A")
      # Snapshot taken before rolling, compared against ``grouped.obj`` afterwards.
      snapshot = grouped.obj.copy(deep=True)

      result = grouped.rolling(4).sum()

      assert "A" not in result.columns
      tm.assert_frame_equal(grouped.obj, snapshot)
  759. def test_groupby_level(self):
  760. # GH 38523, 38787
  761. arrays = [
  762. ["Falcon", "Falcon", "Parrot", "Parrot"],
  763. ["Captive", "Wild", "Captive", "Wild"],
  764. ]
  765. index = MultiIndex.from_arrays(arrays, names=("Animal", "Type"))
  766. df = DataFrame({"Max Speed": [390.0, 350.0, 30.0, 20.0]}, index=index)
  767. result = df.groupby(level=0)["Max Speed"].rolling(2).sum()
  768. expected = Series(
  769. [np.nan, 740.0, np.nan, 50.0],
  770. index=MultiIndex.from_tuples(
  771. [
  772. ("Falcon", "Falcon", "Captive"),
  773. ("Falcon", "Falcon", "Wild"),
  774. ("Parrot", "Parrot", "Captive"),
  775. ("Parrot", "Parrot", "Wild"),
  776. ],
  777. names=["Animal", "Animal", "Type"],
  778. ),
  779. name="Max Speed",
  780. )
  781. tm.assert_series_equal(result, expected)
  782. @pytest.mark.parametrize(
  783. "by, expected_data",
  784. [
  785. [["id"], {"num": [100.0, 150.0, 150.0, 200.0]}],
  786. [
  787. ["id", "index"],
  788. {
  789. "date": [
  790. Timestamp("2018-01-01"),
  791. Timestamp("2018-01-02"),
  792. Timestamp("2018-01-01"),
  793. Timestamp("2018-01-02"),
  794. ],
  795. "num": [100.0, 200.0, 150.0, 250.0],
  796. },
  797. ],
  798. ],
  799. )
  800. def test_as_index_false(self, by, expected_data, unit):
  801. # GH 39433
  802. data = [
  803. ["A", "2018-01-01", 100.0],
  804. ["A", "2018-01-02", 200.0],
  805. ["B", "2018-01-01", 150.0],
  806. ["B", "2018-01-02", 250.0],
  807. ]
  808. df = DataFrame(data, columns=["id", "date", "num"])
  809. df["date"] = df["date"].astype(f"M8[{unit}]")
  810. df = df.set_index(["date"])
  811. gp_by = [getattr(df, attr) for attr in by]
  812. result = (
  813. df.groupby(gp_by, as_index=False).rolling(window=2, min_periods=1).mean()
  814. )
  815. expected = {"id": ["A", "A", "B", "B"]}
  816. expected.update(expected_data)
  817. expected = DataFrame(
  818. expected,
  819. index=df.index,
  820. )
  821. if "date" in expected_data:
  822. expected["date"] = expected["date"].astype(f"M8[{unit}]")
  823. tm.assert_frame_equal(result, expected)
  824. def test_nan_and_zero_endpoints(self, any_int_numpy_dtype):
  825. # https://github.com/twosigma/pandas/issues/53
  826. typ = np.dtype(any_int_numpy_dtype).type
  827. size = 1000
  828. idx = np.repeat(typ(0), size)
  829. idx[-1] = 1
  830. val = 5e25
  831. arr = np.repeat(val, size)
  832. arr[0] = np.nan
  833. arr[-1] = 0
  834. df = DataFrame(
  835. {
  836. "index": idx,
  837. "adl2": arr,
  838. }
  839. ).set_index("index")
  840. result = df.groupby("index")["adl2"].rolling(window=10, min_periods=1).mean()
  841. expected = Series(
  842. arr,
  843. name="adl2",
  844. index=MultiIndex.from_arrays(
  845. [
  846. Index([0] * 999 + [1], dtype=typ, name="index"),
  847. Index([0] * 999 + [1], dtype=typ, name="index"),
  848. ],
  849. ),
  850. )
  851. tm.assert_series_equal(result, expected)
  852. def test_groupby_rolling_non_monotonic(self):
  853. # GH 43909
  854. shuffled = [3, 0, 1, 2]
  855. sec = 1_000
  856. df = DataFrame(
  857. [{"t": Timestamp(2 * x * sec), "x": x + 1, "c": 42} for x in shuffled]
  858. )
  859. with pytest.raises(ValueError, match=r".* must be monotonic"):
  860. df.groupby("c").rolling(on="t", window="3s")
  861. def test_groupby_monotonic(self):
  862. # GH 15130
  863. # we don't need to validate monotonicity when grouping
  864. # GH 43909 we should raise an error here to match
  865. # behaviour of non-groupby rolling.
  866. data = [
  867. ["David", "1/1/2015", 100],
  868. ["David", "1/5/2015", 500],
  869. ["David", "5/30/2015", 50],
  870. ["David", "7/25/2015", 50],
  871. ["Ryan", "1/4/2014", 100],
  872. ["Ryan", "1/19/2015", 500],
  873. ["Ryan", "3/31/2016", 50],
  874. ["Joe", "7/1/2015", 100],
  875. ["Joe", "9/9/2015", 500],
  876. ["Joe", "10/15/2015", 50],
  877. ]
  878. df = DataFrame(data=data, columns=["name", "date", "amount"])
  879. df["date"] = to_datetime(df["date"])
  880. df = df.sort_values("date")
  881. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  882. with tm.assert_produces_warning(FutureWarning, match=msg):
  883. expected = (
  884. df.set_index("date")
  885. .groupby("name")
  886. .apply(lambda x: x.rolling("180D")["amount"].sum())
  887. )
  888. result = df.groupby("name").rolling("180D", on="date")["amount"].sum()
  889. tm.assert_series_equal(result, expected)
  890. def test_datelike_on_monotonic_within_each_group(self):
  891. # GH 13966 (similar to #15130, closed by #15175)
  892. # superseded by 43909
  893. # GH 46061: OK if the on is monotonic relative to each each group
  894. dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s")
  895. df = DataFrame(
  896. {
  897. "A": [1] * 20 + [2] * 12 + [3] * 8,
  898. "B": np.concatenate((dates, dates)),
  899. "C": np.arange(40),
  900. }
  901. )
  902. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  903. with tm.assert_produces_warning(FutureWarning, match=msg):
  904. expected = (
  905. df.set_index("B")
  906. .groupby("A")
  907. .apply(lambda x: x.rolling("4s")["C"].mean())
  908. )
  909. result = df.groupby("A").rolling("4s", on="B").C.mean()
  910. tm.assert_series_equal(result, expected)
  911. def test_datelike_on_not_monotonic_within_each_group(self):
  912. # GH 46061
  913. df = DataFrame(
  914. {
  915. "A": [1] * 3 + [2] * 3,
  916. "B": [Timestamp(year, 1, 1) for year in [2020, 2021, 2019]] * 2,
  917. "C": range(6),
  918. }
  919. )
  920. with pytest.raises(ValueError, match="Each group within B must be monotonic."):
  921. df.groupby("A").rolling("365D", on="B")
  922. class TestExpanding:
  923. @pytest.fixture
  924. def frame(self):
  925. return DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)})
  926. @pytest.mark.parametrize(
  927. "f", ["sum", "mean", "min", "max", "count", "kurt", "skew"]
  928. )
  929. def test_expanding(self, f, frame):
  930. g = frame.groupby("A", group_keys=False)
  931. r = g.expanding()
  932. result = getattr(r, f)()
  933. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  934. with tm.assert_produces_warning(FutureWarning, match=msg):
  935. expected = g.apply(lambda x: getattr(x.expanding(), f)())
  936. # groupby.apply doesn't drop the grouped-by column
  937. expected = expected.drop("A", axis=1)
  938. # GH 39732
  939. expected_index = MultiIndex.from_arrays([frame["A"], range(40)])
  940. expected.index = expected_index
  941. tm.assert_frame_equal(result, expected)
  942. @pytest.mark.parametrize("f", ["std", "var"])
  943. def test_expanding_ddof(self, f, frame):
  944. g = frame.groupby("A", group_keys=False)
  945. r = g.expanding()
  946. result = getattr(r, f)(ddof=0)
  947. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  948. with tm.assert_produces_warning(FutureWarning, match=msg):
  949. expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0))
  950. # groupby.apply doesn't drop the grouped-by column
  951. expected = expected.drop("A", axis=1)
  952. # GH 39732
  953. expected_index = MultiIndex.from_arrays([frame["A"], range(40)])
  954. expected.index = expected_index
  955. tm.assert_frame_equal(result, expected)
  956. @pytest.mark.parametrize(
  957. "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"]
  958. )
  959. def test_expanding_quantile(self, interpolation, frame):
  960. g = frame.groupby("A", group_keys=False)
  961. r = g.expanding()
  962. result = r.quantile(0.4, interpolation=interpolation)
  963. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  964. with tm.assert_produces_warning(FutureWarning, match=msg):
  965. expected = g.apply(
  966. lambda x: x.expanding().quantile(0.4, interpolation=interpolation)
  967. )
  968. # groupby.apply doesn't drop the grouped-by column
  969. expected = expected.drop("A", axis=1)
  970. # GH 39732
  971. expected_index = MultiIndex.from_arrays([frame["A"], range(40)])
  972. expected.index = expected_index
  973. tm.assert_frame_equal(result, expected)
  974. @pytest.mark.parametrize("f", ["corr", "cov"])
  975. def test_expanding_corr_cov(self, f, frame):
  976. g = frame.groupby("A")
  977. r = g.expanding()
  978. result = getattr(r, f)(frame)
  979. def func_0(x):
  980. return getattr(x.expanding(), f)(frame)
  981. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  982. with tm.assert_produces_warning(FutureWarning, match=msg):
  983. expected = g.apply(func_0)
  984. # GH 39591: groupby.apply returns 1 instead of nan for windows
  985. # with all nan values
  986. null_idx = list(range(20, 61)) + list(range(72, 113))
  987. expected.iloc[null_idx, 1] = np.nan
  988. # GH 39591: The grouped column should be all np.nan
  989. # (groupby.apply inserts 0s for cov)
  990. expected["A"] = np.nan
  991. tm.assert_frame_equal(result, expected)
  992. result = getattr(r.B, f)(pairwise=True)
  993. def func_1(x):
  994. return getattr(x.B.expanding(), f)(pairwise=True)
  995. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  996. with tm.assert_produces_warning(FutureWarning, match=msg):
  997. expected = g.apply(func_1)
  998. tm.assert_series_equal(result, expected)
  999. def test_expanding_apply(self, raw, frame):
  1000. g = frame.groupby("A", group_keys=False)
  1001. r = g.expanding()
  1002. # reduction
  1003. result = r.apply(lambda x: x.sum(), raw=raw)
  1004. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  1005. with tm.assert_produces_warning(FutureWarning, match=msg):
  1006. expected = g.apply(
  1007. lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw)
  1008. )
  1009. # groupby.apply doesn't drop the grouped-by column
  1010. expected = expected.drop("A", axis=1)
  1011. # GH 39732
  1012. expected_index = MultiIndex.from_arrays([frame["A"], range(40)])
  1013. expected.index = expected_index
  1014. tm.assert_frame_equal(result, expected)
  1015. class TestEWM:
  1016. @pytest.mark.parametrize(
  1017. "method, expected_data",
  1018. [
  1019. ["mean", [0.0, 0.6666666666666666, 1.4285714285714286, 2.2666666666666666]],
  1020. ["std", [np.nan, 0.707107, 0.963624, 1.177164]],
  1021. ["var", [np.nan, 0.5, 0.9285714285714286, 1.3857142857142857]],
  1022. ],
  1023. )
  1024. def test_methods(self, method, expected_data):
  1025. # GH 16037
  1026. df = DataFrame({"A": ["a"] * 4, "B": range(4)})
  1027. result = getattr(df.groupby("A").ewm(com=1.0), method)()
  1028. expected = DataFrame(
  1029. {"B": expected_data},
  1030. index=MultiIndex.from_tuples(
  1031. [
  1032. ("a", 0),
  1033. ("a", 1),
  1034. ("a", 2),
  1035. ("a", 3),
  1036. ],
  1037. names=["A", None],
  1038. ),
  1039. )
  1040. tm.assert_frame_equal(result, expected)
  1041. @pytest.mark.parametrize(
  1042. "method, expected_data",
  1043. [["corr", [np.nan, 1.0, 1.0, 1]], ["cov", [np.nan, 0.5, 0.928571, 1.385714]]],
  1044. )
  1045. def test_pairwise_methods(self, method, expected_data):
  1046. # GH 16037
  1047. df = DataFrame({"A": ["a"] * 4, "B": range(4)})
  1048. result = getattr(df.groupby("A").ewm(com=1.0), method)()
  1049. expected = DataFrame(
  1050. {"B": expected_data},
  1051. index=MultiIndex.from_tuples(
  1052. [
  1053. ("a", 0, "B"),
  1054. ("a", 1, "B"),
  1055. ("a", 2, "B"),
  1056. ("a", 3, "B"),
  1057. ],
  1058. names=["A", None, None],
  1059. ),
  1060. )
  1061. tm.assert_frame_equal(result, expected)
  1062. expected = df.groupby("A")[["B"]].apply(
  1063. lambda x: getattr(x.ewm(com=1.0), method)()
  1064. )
  1065. tm.assert_frame_equal(result, expected)
  1066. def test_times(self, times_frame):
  1067. # GH 40951
  1068. halflife = "23 days"
  1069. # GH#42738
  1070. times = times_frame.pop("C")
  1071. result = times_frame.groupby("A").ewm(halflife=halflife, times=times).mean()
  1072. expected = DataFrame(
  1073. {
  1074. "B": [
  1075. 0.0,
  1076. 0.507534,
  1077. 1.020088,
  1078. 1.537661,
  1079. 0.0,
  1080. 0.567395,
  1081. 1.221209,
  1082. 0.0,
  1083. 0.653141,
  1084. 1.195003,
  1085. ]
  1086. },
  1087. index=MultiIndex.from_tuples(
  1088. [
  1089. ("a", 0),
  1090. ("a", 3),
  1091. ("a", 6),
  1092. ("a", 9),
  1093. ("b", 1),
  1094. ("b", 4),
  1095. ("b", 7),
  1096. ("c", 2),
  1097. ("c", 5),
  1098. ("c", 8),
  1099. ],
  1100. names=["A", None],
  1101. ),
  1102. )
  1103. tm.assert_frame_equal(result, expected)
  1104. def test_times_array(self, times_frame):
  1105. # GH 40951
  1106. halflife = "23 days"
  1107. times = times_frame.pop("C")
  1108. gb = times_frame.groupby("A")
  1109. result = gb.ewm(halflife=halflife, times=times).mean()
  1110. expected = gb.ewm(halflife=halflife, times=times.values).mean()
  1111. tm.assert_frame_equal(result, expected)
  1112. def test_dont_mutate_obj_after_slicing(self):
  1113. # GH 43355
  1114. df = DataFrame(
  1115. {
  1116. "id": ["a", "a", "b", "b", "b"],
  1117. "timestamp": date_range("2021-9-1", periods=5, freq="h"),
  1118. "y": range(5),
  1119. }
  1120. )
  1121. grp = df.groupby("id").rolling("1h", on="timestamp")
  1122. result = grp.count()
  1123. expected_df = DataFrame(
  1124. {
  1125. "timestamp": date_range("2021-9-1", periods=5, freq="h"),
  1126. "y": [1.0] * 5,
  1127. },
  1128. index=MultiIndex.from_arrays(
  1129. [["a", "a", "b", "b", "b"], list(range(5))], names=["id", None]
  1130. ),
  1131. )
  1132. tm.assert_frame_equal(result, expected_df)
  1133. result = grp["y"].count()
  1134. expected_series = Series(
  1135. [1.0] * 5,
  1136. index=MultiIndex.from_arrays(
  1137. [
  1138. ["a", "a", "b", "b", "b"],
  1139. date_range("2021-9-1", periods=5, freq="h"),
  1140. ],
  1141. names=["id", "timestamp"],
  1142. ),
  1143. name="y",
  1144. )
  1145. tm.assert_series_equal(result, expected_series)
  1146. # This is the key test
  1147. result = grp.count()
  1148. tm.assert_frame_equal(result, expected_df)
  1149. def test_rolling_corr_with_single_integer_in_index():
  1150. # GH 44078
  1151. df = DataFrame({"a": [(1,), (1,), (1,)], "b": [4, 5, 6]})
  1152. gb = df.groupby(["a"])
  1153. result = gb.rolling(2).corr(other=df)
  1154. index = MultiIndex.from_tuples([((1,), 0), ((1,), 1), ((1,), 2)], names=["a", None])
  1155. expected = DataFrame(
  1156. {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, index=index
  1157. )
  1158. tm.assert_frame_equal(result, expected)
  1159. def test_rolling_corr_with_tuples_in_index():
  1160. # GH 44078
  1161. df = DataFrame(
  1162. {
  1163. "a": [
  1164. (
  1165. 1,
  1166. 2,
  1167. ),
  1168. (
  1169. 1,
  1170. 2,
  1171. ),
  1172. (
  1173. 1,
  1174. 2,
  1175. ),
  1176. ],
  1177. "b": [4, 5, 6],
  1178. }
  1179. )
  1180. gb = df.groupby(["a"])
  1181. result = gb.rolling(2).corr(other=df)
  1182. index = MultiIndex.from_tuples(
  1183. [((1, 2), 0), ((1, 2), 1), ((1, 2), 2)], names=["a", None]
  1184. )
  1185. expected = DataFrame(
  1186. {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, index=index
  1187. )
  1188. tm.assert_frame_equal(result, expected)