# test_nth.py (28 KB)
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. from pandas import (
  5. DataFrame,
  6. Index,
  7. MultiIndex,
  8. Series,
  9. Timestamp,
  10. isna,
  11. )
  12. import pandas._testing as tm
  13. def test_first_last_nth(df):
  14. # tests for first / last / nth
  15. grouped = df.groupby("A")
  16. first = grouped.first()
  17. expected = df.loc[[1, 0], ["B", "C", "D"]]
  18. expected.index = Index(["bar", "foo"], name="A")
  19. expected = expected.sort_index()
  20. tm.assert_frame_equal(first, expected)
  21. nth = grouped.nth(0)
  22. expected = df.loc[[0, 1]]
  23. tm.assert_frame_equal(nth, expected)
  24. last = grouped.last()
  25. expected = df.loc[[5, 7], ["B", "C", "D"]]
  26. expected.index = Index(["bar", "foo"], name="A")
  27. tm.assert_frame_equal(last, expected)
  28. nth = grouped.nth(-1)
  29. expected = df.iloc[[5, 7]]
  30. tm.assert_frame_equal(nth, expected)
  31. nth = grouped.nth(1)
  32. expected = df.iloc[[2, 3]]
  33. tm.assert_frame_equal(nth, expected)
  34. # it works!
  35. grouped["B"].first()
  36. grouped["B"].last()
  37. grouped["B"].nth(0)
  38. df = df.copy()
  39. df.loc[df["A"] == "foo", "B"] = np.nan
  40. grouped = df.groupby("A")
  41. assert isna(grouped["B"].first()["foo"])
  42. assert isna(grouped["B"].last()["foo"])
  43. assert isna(grouped["B"].nth(0).iloc[0])
  44. # v0.14.0 whatsnew
  45. df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
  46. g = df.groupby("A")
  47. result = g.first()
  48. expected = df.iloc[[1, 2]].set_index("A")
  49. tm.assert_frame_equal(result, expected)
  50. expected = df.iloc[[1, 2]]
  51. result = g.nth(0, dropna="any")
  52. tm.assert_frame_equal(result, expected)
  53. @pytest.mark.parametrize("method", ["first", "last"])
  54. def test_first_last_with_na_object(method, nulls_fixture):
  55. # https://github.com/pandas-dev/pandas/issues/32123
  56. groups = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby("a")
  57. result = getattr(groups, method)()
  58. if method == "first":
  59. values = [1, 3]
  60. else:
  61. values = [2, 3]
  62. values = np.array(values, dtype=result["b"].dtype)
  63. idx = Index([1, 2], name="a")
  64. expected = DataFrame({"b": values}, index=idx)
  65. tm.assert_frame_equal(result, expected)
  66. @pytest.mark.parametrize("index", [0, -1])
  67. def test_nth_with_na_object(index, nulls_fixture):
  68. # https://github.com/pandas-dev/pandas/issues/32123
  69. df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]})
  70. groups = df.groupby("a")
  71. result = groups.nth(index)
  72. expected = df.iloc[[0, 2]] if index == 0 else df.iloc[[1, 3]]
  73. tm.assert_frame_equal(result, expected)
  74. @pytest.mark.parametrize("method", ["first", "last"])
  75. def test_first_last_with_None(method):
  76. # https://github.com/pandas-dev/pandas/issues/32800
  77. # None should be preserved as object dtype
  78. df = DataFrame.from_dict({"id": ["a"], "value": [None]})
  79. groups = df.groupby("id", as_index=False)
  80. result = getattr(groups, method)()
  81. tm.assert_frame_equal(result, df)
  82. @pytest.mark.parametrize("method", ["first", "last"])
  83. @pytest.mark.parametrize(
  84. "df, expected",
  85. [
  86. (
  87. DataFrame({"id": "a", "value": [None, "foo", np.nan]}),
  88. DataFrame({"value": ["foo"]}, index=Index(["a"], name="id")),
  89. ),
  90. (
  91. DataFrame({"id": "a", "value": [np.nan]}, dtype=object),
  92. DataFrame({"value": [None]}, index=Index(["a"], name="id")),
  93. ),
  94. ],
  95. )
  96. def test_first_last_with_None_expanded(method, df, expected):
  97. # GH 32800, 38286
  98. result = getattr(df.groupby("id"), method)()
  99. tm.assert_frame_equal(result, expected)
  100. def test_first_last_nth_dtypes():
  101. df = DataFrame(
  102. {
  103. "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
  104. "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
  105. "C": np.random.default_rng(2).standard_normal(8),
  106. "D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"),
  107. }
  108. )
  109. df["E"] = True
  110. df["F"] = 1
  111. # tests for first / last / nth
  112. grouped = df.groupby("A")
  113. first = grouped.first()
  114. expected = df.loc[[1, 0], ["B", "C", "D", "E", "F"]]
  115. expected.index = Index(["bar", "foo"], name="A")
  116. expected = expected.sort_index()
  117. tm.assert_frame_equal(first, expected)
  118. last = grouped.last()
  119. expected = df.loc[[5, 7], ["B", "C", "D", "E", "F"]]
  120. expected.index = Index(["bar", "foo"], name="A")
  121. expected = expected.sort_index()
  122. tm.assert_frame_equal(last, expected)
  123. nth = grouped.nth(1)
  124. expected = df.iloc[[2, 3]]
  125. tm.assert_frame_equal(nth, expected)
  126. def test_first_last_nth_dtypes2():
  127. # GH 2763, first/last shifting dtypes
  128. idx = list(range(10))
  129. idx.append(9)
  130. ser = Series(data=range(11), index=idx, name="IntCol")
  131. assert ser.dtype == "int64"
  132. f = ser.groupby(level=0).first()
  133. assert f.dtype == "int64"
  134. def test_first_last_nth_nan_dtype():
  135. # GH 33591
  136. df = DataFrame({"data": ["A"], "nans": Series([None], dtype=object)})
  137. grouped = df.groupby("data")
  138. expected = df.set_index("data").nans
  139. tm.assert_series_equal(grouped.nans.first(), expected)
  140. tm.assert_series_equal(grouped.nans.last(), expected)
  141. expected = df.nans
  142. tm.assert_series_equal(grouped.nans.nth(-1), expected)
  143. tm.assert_series_equal(grouped.nans.nth(0), expected)
  144. def test_first_strings_timestamps():
  145. # GH 11244
  146. test = DataFrame(
  147. {
  148. Timestamp("2012-01-01 00:00:00"): ["a", "b"],
  149. Timestamp("2012-01-02 00:00:00"): ["c", "d"],
  150. "name": ["e", "e"],
  151. "aaaa": ["f", "g"],
  152. }
  153. )
  154. result = test.groupby("name").first()
  155. expected = DataFrame(
  156. [["a", "c", "f"]],
  157. columns=Index([Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]),
  158. index=Index(["e"], name="name"),
  159. )
  160. tm.assert_frame_equal(result, expected)
  161. def test_nth():
  162. df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
  163. gb = df.groupby("A")
  164. tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 2]])
  165. tm.assert_frame_equal(gb.nth(1), df.iloc[[1]])
  166. tm.assert_frame_equal(gb.nth(2), df.loc[[]])
  167. tm.assert_frame_equal(gb.nth(-1), df.iloc[[1, 2]])
  168. tm.assert_frame_equal(gb.nth(-2), df.iloc[[0]])
  169. tm.assert_frame_equal(gb.nth(-3), df.loc[[]])
  170. tm.assert_series_equal(gb.B.nth(0), df.B.iloc[[0, 2]])
  171. tm.assert_series_equal(gb.B.nth(1), df.B.iloc[[1]])
  172. tm.assert_frame_equal(gb[["B"]].nth(0), df[["B"]].iloc[[0, 2]])
  173. tm.assert_frame_equal(gb.nth(0, dropna="any"), df.iloc[[1, 2]])
  174. tm.assert_frame_equal(gb.nth(-1, dropna="any"), df.iloc[[1, 2]])
  175. tm.assert_frame_equal(gb.nth(7, dropna="any"), df.iloc[:0])
  176. tm.assert_frame_equal(gb.nth(2, dropna="any"), df.iloc[:0])
  177. def test_nth2():
  178. # out of bounds, regression from 0.13.1
  179. # GH 6621
  180. df = DataFrame(
  181. {
  182. "color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"},
  183. "food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"},
  184. "two": {
  185. 0: 1.5456590000000001,
  186. 1: -0.070345000000000005,
  187. 2: -2.4004539999999999,
  188. 3: 0.46206000000000003,
  189. 4: 0.52350799999999997,
  190. },
  191. "one": {
  192. 0: 0.56573799999999996,
  193. 1: -0.9742360000000001,
  194. 2: 1.033801,
  195. 3: -0.78543499999999999,
  196. 4: 0.70422799999999997,
  197. },
  198. }
  199. ).set_index(["color", "food"])
  200. result = df.groupby(level=0, as_index=False).nth(2)
  201. expected = df.iloc[[-1]]
  202. tm.assert_frame_equal(result, expected)
  203. result = df.groupby(level=0, as_index=False).nth(3)
  204. expected = df.loc[[]]
  205. tm.assert_frame_equal(result, expected)
  206. def test_nth3():
  207. # GH 7559
  208. # from the vbench
  209. df = DataFrame(np.random.default_rng(2).integers(1, 10, (100, 2)), dtype="int64")
  210. ser = df[1]
  211. gb = df[0]
  212. expected = ser.groupby(gb).first()
  213. expected2 = ser.groupby(gb).apply(lambda x: x.iloc[0])
  214. tm.assert_series_equal(expected2, expected, check_names=False)
  215. assert expected.name == 1
  216. assert expected2.name == 1
  217. # validate first
  218. v = ser[gb == 1].iloc[0]
  219. assert expected.iloc[0] == v
  220. assert expected2.iloc[0] == v
  221. with pytest.raises(ValueError, match="For a DataFrame"):
  222. ser.groupby(gb, sort=False).nth(0, dropna=True)
  223. def test_nth4():
  224. # doc example
  225. df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
  226. gb = df.groupby("A")
  227. result = gb.B.nth(0, dropna="all")
  228. expected = df.B.iloc[[1, 2]]
  229. tm.assert_series_equal(result, expected)
  230. def test_nth5():
  231. # test multiple nth values
  232. df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"])
  233. gb = df.groupby("A")
  234. tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 3]])
  235. tm.assert_frame_equal(gb.nth([0]), df.iloc[[0, 3]])
  236. tm.assert_frame_equal(gb.nth([0, 1]), df.iloc[[0, 1, 3, 4]])
  237. tm.assert_frame_equal(gb.nth([0, -1]), df.iloc[[0, 2, 3, 4]])
  238. tm.assert_frame_equal(gb.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]])
  239. tm.assert_frame_equal(gb.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]])
  240. tm.assert_frame_equal(gb.nth([2]), df.iloc[[2]])
  241. tm.assert_frame_equal(gb.nth([3, 4]), df.loc[[]])
  242. def test_nth_bdays(unit):
  243. business_dates = pd.date_range(
  244. start="4/1/2014", end="6/30/2014", freq="B", unit=unit
  245. )
  246. df = DataFrame(1, index=business_dates, columns=["a", "b"])
  247. # get the first, fourth and last two business days for each month
  248. key = [df.index.year, df.index.month]
  249. result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
  250. expected_dates = pd.to_datetime(
  251. [
  252. "2014/4/1",
  253. "2014/4/4",
  254. "2014/4/29",
  255. "2014/4/30",
  256. "2014/5/1",
  257. "2014/5/6",
  258. "2014/5/29",
  259. "2014/5/30",
  260. "2014/6/2",
  261. "2014/6/5",
  262. "2014/6/27",
  263. "2014/6/30",
  264. ]
  265. ).as_unit(unit)
  266. expected = DataFrame(1, columns=["a", "b"], index=expected_dates)
  267. tm.assert_frame_equal(result, expected)
  268. def test_nth_multi_grouper(three_group):
  269. # PR 9090, related to issue 8979
  270. # test nth on multiple groupers
  271. grouped = three_group.groupby(["A", "B"])
  272. result = grouped.nth(0)
  273. expected = three_group.iloc[[0, 3, 4, 7]]
  274. tm.assert_frame_equal(result, expected)
  275. @pytest.mark.parametrize(
  276. "data, expected_first, expected_last",
  277. [
  278. (
  279. {
  280. "id": ["A"],
  281. "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
  282. "foo": [1],
  283. },
  284. {
  285. "id": ["A"],
  286. "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
  287. "foo": [1],
  288. },
  289. {
  290. "id": ["A"],
  291. "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
  292. "foo": [1],
  293. },
  294. ),
  295. (
  296. {
  297. "id": ["A", "B", "A"],
  298. "time": [
  299. Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
  300. Timestamp("2012-02-01 14:00:00", tz="US/Central"),
  301. Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
  302. ],
  303. "foo": [1, 2, 3],
  304. },
  305. {
  306. "id": ["A", "B"],
  307. "time": [
  308. Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
  309. Timestamp("2012-02-01 14:00:00", tz="US/Central"),
  310. ],
  311. "foo": [1, 2],
  312. },
  313. {
  314. "id": ["A", "B"],
  315. "time": [
  316. Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
  317. Timestamp("2012-02-01 14:00:00", tz="US/Central"),
  318. ],
  319. "foo": [3, 2],
  320. },
  321. ),
  322. ],
  323. )
  324. def test_first_last_tz(data, expected_first, expected_last):
  325. # GH15884
  326. # Test that the timezone is retained when calling first
  327. # or last on groupby with as_index=False
  328. df = DataFrame(data)
  329. result = df.groupby("id", as_index=False).first()
  330. expected = DataFrame(expected_first)
  331. cols = ["id", "time", "foo"]
  332. tm.assert_frame_equal(result[cols], expected[cols])
  333. result = df.groupby("id", as_index=False)["time"].first()
  334. tm.assert_frame_equal(result, expected[["id", "time"]])
  335. result = df.groupby("id", as_index=False).last()
  336. expected = DataFrame(expected_last)
  337. cols = ["id", "time", "foo"]
  338. tm.assert_frame_equal(result[cols], expected[cols])
  339. result = df.groupby("id", as_index=False)["time"].last()
  340. tm.assert_frame_equal(result, expected[["id", "time"]])
  341. @pytest.mark.parametrize(
  342. "method, ts, alpha",
  343. [
  344. ["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"],
  345. ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"],
  346. ],
  347. )
  348. def test_first_last_tz_multi_column(method, ts, alpha, unit):
  349. # GH 21603
  350. category_string = Series(list("abc")).astype("category")
  351. dti = pd.date_range("20130101", periods=3, tz="US/Eastern", unit=unit)
  352. df = DataFrame(
  353. {
  354. "group": [1, 1, 2],
  355. "category_string": category_string,
  356. "datetimetz": dti,
  357. }
  358. )
  359. result = getattr(df.groupby("group"), method)()
  360. expected = DataFrame(
  361. {
  362. "category_string": pd.Categorical(
  363. [alpha, "c"], dtype=category_string.dtype
  364. ),
  365. "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")],
  366. },
  367. index=Index([1, 2], name="group"),
  368. )
  369. expected["datetimetz"] = expected["datetimetz"].dt.as_unit(unit)
  370. tm.assert_frame_equal(result, expected)
  371. @pytest.mark.parametrize(
  372. "values",
  373. [
  374. pd.array([True, False], dtype="boolean"),
  375. pd.array([1, 2], dtype="Int64"),
  376. pd.to_datetime(["2020-01-01", "2020-02-01"]),
  377. pd.to_timedelta([1, 2], unit="D"),
  378. ],
  379. )
  380. @pytest.mark.parametrize("function", ["first", "last", "min", "max"])
  381. def test_first_last_extension_array_keeps_dtype(values, function):
  382. # https://github.com/pandas-dev/pandas/issues/33071
  383. # https://github.com/pandas-dev/pandas/issues/32194
  384. df = DataFrame({"a": [1, 2], "b": values})
  385. grouped = df.groupby("a")
  386. idx = Index([1, 2], name="a")
  387. expected_series = Series(values, name="b", index=idx)
  388. expected_frame = DataFrame({"b": values}, index=idx)
  389. result_series = getattr(grouped["b"], function)()
  390. tm.assert_series_equal(result_series, expected_series)
  391. result_frame = grouped.agg({"b": function})
  392. tm.assert_frame_equal(result_frame, expected_frame)
  393. def test_nth_multi_index_as_expected():
  394. # PR 9090, related to issue 8979
  395. # test nth on MultiIndex
  396. three_group = DataFrame(
  397. {
  398. "A": [
  399. "foo",
  400. "foo",
  401. "foo",
  402. "foo",
  403. "bar",
  404. "bar",
  405. "bar",
  406. "bar",
  407. "foo",
  408. "foo",
  409. "foo",
  410. ],
  411. "B": [
  412. "one",
  413. "one",
  414. "one",
  415. "two",
  416. "one",
  417. "one",
  418. "one",
  419. "two",
  420. "two",
  421. "two",
  422. "one",
  423. ],
  424. "C": [
  425. "dull",
  426. "dull",
  427. "shiny",
  428. "dull",
  429. "dull",
  430. "shiny",
  431. "shiny",
  432. "dull",
  433. "shiny",
  434. "shiny",
  435. "shiny",
  436. ],
  437. }
  438. )
  439. grouped = three_group.groupby(["A", "B"])
  440. result = grouped.nth(0)
  441. expected = three_group.iloc[[0, 3, 4, 7]]
  442. tm.assert_frame_equal(result, expected)
  443. @pytest.mark.parametrize(
  444. "op, n, expected_rows",
  445. [
  446. ("head", -1, [0]),
  447. ("head", 0, []),
  448. ("head", 1, [0, 2]),
  449. ("head", 7, [0, 1, 2]),
  450. ("tail", -1, [1]),
  451. ("tail", 0, []),
  452. ("tail", 1, [1, 2]),
  453. ("tail", 7, [0, 1, 2]),
  454. ],
  455. )
  456. @pytest.mark.parametrize("columns", [None, [], ["A"], ["B"], ["A", "B"]])
  457. @pytest.mark.parametrize("as_index", [True, False])
  458. def test_groupby_head_tail(op, n, expected_rows, columns, as_index):
  459. df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
  460. g = df.groupby("A", as_index=as_index)
  461. expected = df.iloc[expected_rows]
  462. if columns is not None:
  463. g = g[columns]
  464. expected = expected[columns]
  465. result = getattr(g, op)(n)
  466. tm.assert_frame_equal(result, expected)
  467. @pytest.mark.parametrize(
  468. "op, n, expected_cols",
  469. [
  470. ("head", -1, [0]),
  471. ("head", 0, []),
  472. ("head", 1, [0, 2]),
  473. ("head", 7, [0, 1, 2]),
  474. ("tail", -1, [1]),
  475. ("tail", 0, []),
  476. ("tail", 1, [1, 2]),
  477. ("tail", 7, [0, 1, 2]),
  478. ],
  479. )
  480. def test_groupby_head_tail_axis_1(op, n, expected_cols):
  481. # GH 9772
  482. df = DataFrame(
  483. [[1, 2, 3], [1, 4, 5], [2, 6, 7], [3, 8, 9]], columns=["A", "B", "C"]
  484. )
  485. msg = "DataFrame.groupby with axis=1 is deprecated"
  486. with tm.assert_produces_warning(FutureWarning, match=msg):
  487. g = df.groupby([0, 0, 1], axis=1)
  488. expected = df.iloc[:, expected_cols]
  489. result = getattr(g, op)(n)
  490. tm.assert_frame_equal(result, expected)
  491. def test_group_selection_cache():
  492. # GH 12839 nth, head, and tail should return same result consistently
  493. df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
  494. expected = df.iloc[[0, 2]]
  495. g = df.groupby("A")
  496. result1 = g.head(n=2)
  497. result2 = g.nth(0)
  498. tm.assert_frame_equal(result1, df)
  499. tm.assert_frame_equal(result2, expected)
  500. g = df.groupby("A")
  501. result1 = g.tail(n=2)
  502. result2 = g.nth(0)
  503. tm.assert_frame_equal(result1, df)
  504. tm.assert_frame_equal(result2, expected)
  505. g = df.groupby("A")
  506. result1 = g.nth(0)
  507. result2 = g.head(n=2)
  508. tm.assert_frame_equal(result1, expected)
  509. tm.assert_frame_equal(result2, df)
  510. g = df.groupby("A")
  511. result1 = g.nth(0)
  512. result2 = g.tail(n=2)
  513. tm.assert_frame_equal(result1, expected)
  514. tm.assert_frame_equal(result2, df)
  515. def test_nth_empty():
  516. # GH 16064
  517. df = DataFrame(index=[0], columns=["a", "b", "c"])
  518. result = df.groupby("a").nth(10)
  519. expected = df.iloc[:0]
  520. tm.assert_frame_equal(result, expected)
  521. result = df.groupby(["a", "b"]).nth(10)
  522. expected = df.iloc[:0]
  523. tm.assert_frame_equal(result, expected)
  524. def test_nth_column_order():
  525. # GH 20760
  526. # Check that nth preserves column order
  527. df = DataFrame(
  528. [[1, "b", 100], [1, "a", 50], [1, "a", np.nan], [2, "c", 200], [2, "d", 150]],
  529. columns=["A", "C", "B"],
  530. )
  531. result = df.groupby("A").nth(0)
  532. expected = df.iloc[[0, 3]]
  533. tm.assert_frame_equal(result, expected)
  534. result = df.groupby("A").nth(-1, dropna="any")
  535. expected = df.iloc[[1, 4]]
  536. tm.assert_frame_equal(result, expected)
  537. @pytest.mark.parametrize("dropna", [None, "any", "all"])
  538. def test_nth_nan_in_grouper(dropna):
  539. # GH 26011
  540. df = DataFrame(
  541. {
  542. "a": [np.nan, "a", np.nan, "b", np.nan],
  543. "b": [0, 2, 4, 6, 8],
  544. "c": [1, 3, 5, 7, 9],
  545. }
  546. )
  547. result = df.groupby("a").nth(0, dropna=dropna)
  548. expected = df.iloc[[1, 3]]
  549. tm.assert_frame_equal(result, expected)
  550. @pytest.mark.parametrize("dropna", [None, "any", "all"])
  551. def test_nth_nan_in_grouper_series(dropna):
  552. # GH 26454
  553. df = DataFrame(
  554. {
  555. "a": [np.nan, "a", np.nan, "b", np.nan],
  556. "b": [0, 2, 4, 6, 8],
  557. }
  558. )
  559. result = df.groupby("a")["b"].nth(0, dropna=dropna)
  560. expected = df["b"].iloc[[1, 3]]
  561. tm.assert_series_equal(result, expected)
  562. def test_first_categorical_and_datetime_data_nat():
  563. # GH 20520
  564. df = DataFrame(
  565. {
  566. "group": ["first", "first", "second", "third", "third"],
  567. "time": 5 * [np.datetime64("NaT")],
  568. "categories": Series(["a", "b", "c", "a", "b"], dtype="category"),
  569. }
  570. )
  571. result = df.groupby("group").first()
  572. expected = DataFrame(
  573. {
  574. "time": 3 * [np.datetime64("NaT")],
  575. "categories": Series(["a", "c", "a"]).astype(
  576. pd.CategoricalDtype(["a", "b", "c"])
  577. ),
  578. }
  579. )
  580. expected.index = Index(["first", "second", "third"], name="group")
  581. tm.assert_frame_equal(result, expected)
  582. def test_first_multi_key_groupby_categorical():
  583. # GH 22512
  584. df = DataFrame(
  585. {
  586. "A": [1, 1, 1, 2, 2],
  587. "B": [100, 100, 200, 100, 100],
  588. "C": ["apple", "orange", "mango", "mango", "orange"],
  589. "D": ["jupiter", "mercury", "mars", "venus", "venus"],
  590. }
  591. )
  592. df = df.astype({"D": "category"})
  593. result = df.groupby(by=["A", "B"]).first()
  594. expected = DataFrame(
  595. {
  596. "C": ["apple", "mango", "mango"],
  597. "D": Series(["jupiter", "mars", "venus"]).astype(
  598. pd.CategoricalDtype(["jupiter", "mars", "mercury", "venus"])
  599. ),
  600. }
  601. )
  602. expected.index = MultiIndex.from_tuples(
  603. [(1, 100), (1, 200), (2, 100)], names=["A", "B"]
  604. )
  605. tm.assert_frame_equal(result, expected)
  606. @pytest.mark.parametrize("method", ["first", "last", "nth"])
  607. def test_groupby_last_first_nth_with_none(method, nulls_fixture):
  608. # GH29645
  609. expected = Series(["y"], dtype=object)
  610. data = Series(
  611. [nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture],
  612. index=[0, 0, 0, 0, 0],
  613. dtype=object,
  614. ).groupby(level=0)
  615. if method == "nth":
  616. result = getattr(data, method)(3)
  617. else:
  618. result = getattr(data, method)()
  619. tm.assert_series_equal(result, expected)
  620. @pytest.mark.parametrize(
  621. "arg, expected_rows",
  622. [
  623. [slice(None, 3, 2), [0, 1, 4, 5]],
  624. [slice(None, -2), [0, 2, 5]],
  625. [[slice(None, 2), slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
  626. [[0, 1, slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
  627. ],
  628. )
  629. def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows):
  630. # Test slices GH #42947
  631. result = slice_test_grouped.nth[arg]
  632. equivalent = slice_test_grouped.nth(arg)
  633. expected = slice_test_df.iloc[expected_rows]
  634. tm.assert_frame_equal(result, expected)
  635. tm.assert_frame_equal(equivalent, expected)
  636. def test_nth_indexed(slice_test_df, slice_test_grouped):
  637. # Test index notation GH #44688
  638. result = slice_test_grouped.nth[0, 1, -2:]
  639. equivalent = slice_test_grouped.nth([0, 1, slice(-2, None)])
  640. expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]
  641. tm.assert_frame_equal(result, expected)
  642. tm.assert_frame_equal(equivalent, expected)
  643. def test_invalid_argument(slice_test_grouped):
  644. # Test for error on invalid argument
  645. with pytest.raises(TypeError, match="Invalid index"):
  646. slice_test_grouped.nth(3.14)
  647. def test_negative_step(slice_test_grouped):
  648. # Test for error on negative slice step
  649. with pytest.raises(ValueError, match="Invalid step"):
  650. slice_test_grouped.nth(slice(None, None, -1))
  651. def test_np_ints(slice_test_df, slice_test_grouped):
  652. # Test np ints work
  653. result = slice_test_grouped.nth(np.array([0, 1]))
  654. expected = slice_test_df.iloc[[0, 1, 2, 3, 4]]
  655. tm.assert_frame_equal(result, expected)
  656. def test_groupby_nth_with_column_axis():
  657. # GH43926
  658. df = DataFrame(
  659. [
  660. [4, 5, 6],
  661. [8, 8, 7],
  662. ],
  663. index=["z", "y"],
  664. columns=["C", "B", "A"],
  665. )
  666. msg = "DataFrame.groupby with axis=1 is deprecated"
  667. with tm.assert_produces_warning(FutureWarning, match=msg):
  668. gb = df.groupby(df.iloc[1], axis=1)
  669. result = gb.nth(0)
  670. expected = df.iloc[:, [0, 2]]
  671. tm.assert_frame_equal(result, expected)
  672. def test_groupby_nth_interval():
  673. # GH#24205
  674. idx_result = MultiIndex(
  675. [
  676. pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]),
  677. pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]),
  678. ],
  679. [[0, 0, 0, 1, 1], [0, 1, 1, 0, -1]],
  680. )
  681. df_result = DataFrame({"col": range(len(idx_result))}, index=idx_result)
  682. result = df_result.groupby(level=[0, 1], observed=False).nth(0)
  683. val_expected = [0, 1, 3]
  684. idx_expected = MultiIndex(
  685. [
  686. pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]),
  687. pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]),
  688. ],
  689. [[0, 0, 1], [0, 1, 0]],
  690. )
  691. expected = DataFrame(val_expected, index=idx_expected, columns=["col"])
  692. tm.assert_frame_equal(result, expected)
  693. @pytest.mark.parametrize(
  694. "start, stop, expected_values, expected_columns",
  695. [
  696. (None, None, [0, 1, 2, 3, 4], list("ABCDE")),
  697. (None, 1, [0, 3], list("AD")),
  698. (None, 9, [0, 1, 2, 3, 4], list("ABCDE")),
  699. (None, -1, [0, 1, 3], list("ABD")),
  700. (1, None, [1, 2, 4], list("BCE")),
  701. (1, -1, [1], list("B")),
  702. (-1, None, [2, 4], list("CE")),
  703. (-1, 2, [4], list("E")),
  704. ],
  705. )
  706. @pytest.mark.parametrize("method", ["call", "index"])
  707. def test_nth_slices_with_column_axis(
  708. start, stop, expected_values, expected_columns, method
  709. ):
  710. df = DataFrame([range(5)], columns=[list("ABCDE")])
  711. msg = "DataFrame.groupby with axis=1 is deprecated"
  712. with tm.assert_produces_warning(FutureWarning, match=msg):
  713. gb = df.groupby([5, 5, 5, 6, 6], axis=1)
  714. result = {
  715. "call": lambda start, stop: gb.nth(slice(start, stop)),
  716. "index": lambda start, stop: gb.nth[start:stop],
  717. }[method](start, stop)
  718. expected = DataFrame([expected_values], columns=[expected_columns])
  719. tm.assert_frame_equal(result, expected)
  720. @pytest.mark.filterwarnings(
  721. "ignore:invalid value encountered in remainder:RuntimeWarning"
  722. )
  723. def test_head_tail_dropna_true():
  724. # GH#45089
  725. df = DataFrame(
  726. [["a", "z"], ["b", np.nan], ["c", np.nan], ["c", np.nan]], columns=["X", "Y"]
  727. )
  728. expected = DataFrame([["a", "z"]], columns=["X", "Y"])
  729. result = df.groupby(["X", "Y"]).head(n=1)
  730. tm.assert_frame_equal(result, expected)
  731. result = df.groupby(["X", "Y"]).tail(n=1)
  732. tm.assert_frame_equal(result, expected)
  733. result = df.groupby(["X", "Y"]).nth(n=0)
  734. tm.assert_frame_equal(result, expected)
  735. def test_head_tail_dropna_false():
  736. # GH#45089
  737. df = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])
  738. expected = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])
  739. result = df.groupby(["X", "Y"], dropna=False).head(n=1)
  740. tm.assert_frame_equal(result, expected)
  741. result = df.groupby(["X", "Y"], dropna=False).tail(n=1)
  742. tm.assert_frame_equal(result, expected)
  743. result = df.groupby(["X", "Y"], dropna=False).nth(n=0)
  744. tm.assert_frame_equal(result, expected)
  745. @pytest.mark.parametrize("selection", ("b", ["b"], ["b", "c"]))
  746. @pytest.mark.parametrize("dropna", ["any", "all", None])
  747. def test_nth_after_selection(selection, dropna):
  748. # GH#11038, GH#53518
  749. df = DataFrame(
  750. {
  751. "a": [1, 1, 2],
  752. "b": [np.nan, 3, 4],
  753. "c": [5, 6, 7],
  754. }
  755. )
  756. gb = df.groupby("a")[selection]
  757. result = gb.nth(0, dropna=dropna)
  758. if dropna == "any" or (dropna == "all" and selection != ["b", "c"]):
  759. locs = [1, 2]
  760. else:
  761. locs = [0, 2]
  762. expected = df.loc[locs, selection]
  763. tm.assert_equal(result, expected)
  764. @pytest.mark.parametrize(
  765. "data",
  766. [
  767. (
  768. Timestamp("2011-01-15 12:50:28.502376"),
  769. Timestamp("2011-01-20 12:50:28.593448"),
  770. ),
  771. (24650000000000001, 24650000000000002),
  772. ],
  773. )
  774. def test_groupby_nth_int_like_precision(data):
  775. # GH#6620, GH#9311
  776. df = DataFrame({"a": [1, 1], "b": data})
  777. grouped = df.groupby("a")
  778. result = grouped.nth(0)
  779. expected = DataFrame({"a": 1, "b": [data[0]]})
  780. tm.assert_frame_equal(result, expected)