test_counting.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394
  1. from itertools import product
  2. from string import ascii_lowercase
  3. import numpy as np
  4. import pytest
  5. from pandas import (
  6. DataFrame,
  7. Index,
  8. MultiIndex,
  9. Period,
  10. Series,
  11. Timedelta,
  12. Timestamp,
  13. date_range,
  14. )
  15. import pandas._testing as tm
  16. class TestCounting:
  17. def test_cumcount(self):
  18. df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"])
  19. g = df.groupby("A")
  20. sg = g.A
  21. expected = Series([0, 1, 2, 0, 3])
  22. tm.assert_series_equal(expected, g.cumcount())
  23. tm.assert_series_equal(expected, sg.cumcount())
  24. def test_cumcount_empty(self):
  25. ge = DataFrame().groupby(level=0)
  26. se = Series(dtype=object).groupby(level=0)
  27. # edge case, as this is usually considered float
  28. e = Series(dtype="int64")
  29. tm.assert_series_equal(e, ge.cumcount())
  30. tm.assert_series_equal(e, se.cumcount())
  31. def test_cumcount_dupe_index(self):
  32. df = DataFrame(
  33. [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
  34. )
  35. g = df.groupby("A")
  36. sg = g.A
  37. expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
  38. tm.assert_series_equal(expected, g.cumcount())
  39. tm.assert_series_equal(expected, sg.cumcount())
  40. def test_cumcount_mi(self):
  41. mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
  42. df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=mi)
  43. g = df.groupby("A")
  44. sg = g.A
  45. expected = Series([0, 1, 2, 0, 3], index=mi)
  46. tm.assert_series_equal(expected, g.cumcount())
  47. tm.assert_series_equal(expected, sg.cumcount())
  48. def test_cumcount_groupby_not_col(self):
  49. df = DataFrame(
  50. [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
  51. )
  52. g = df.groupby([0, 0, 0, 1, 0])
  53. sg = g.A
  54. expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
  55. tm.assert_series_equal(expected, g.cumcount())
  56. tm.assert_series_equal(expected, sg.cumcount())
  57. def test_ngroup(self):
  58. df = DataFrame({"A": list("aaaba")})
  59. g = df.groupby("A")
  60. sg = g.A
  61. expected = Series([0, 0, 0, 1, 0])
  62. tm.assert_series_equal(expected, g.ngroup())
  63. tm.assert_series_equal(expected, sg.ngroup())
  64. def test_ngroup_distinct(self):
  65. df = DataFrame({"A": list("abcde")})
  66. g = df.groupby("A")
  67. sg = g.A
  68. expected = Series(range(5), dtype="int64")
  69. tm.assert_series_equal(expected, g.ngroup())
  70. tm.assert_series_equal(expected, sg.ngroup())
  71. def test_ngroup_one_group(self):
  72. df = DataFrame({"A": [0] * 5})
  73. g = df.groupby("A")
  74. sg = g.A
  75. expected = Series([0] * 5)
  76. tm.assert_series_equal(expected, g.ngroup())
  77. tm.assert_series_equal(expected, sg.ngroup())
  78. def test_ngroup_empty(self):
  79. ge = DataFrame().groupby(level=0)
  80. se = Series(dtype=object).groupby(level=0)
  81. # edge case, as this is usually considered float
  82. e = Series(dtype="int64")
  83. tm.assert_series_equal(e, ge.ngroup())
  84. tm.assert_series_equal(e, se.ngroup())
  85. def test_ngroup_series_matches_frame(self):
  86. df = DataFrame({"A": list("aaaba")})
  87. s = Series(list("aaaba"))
  88. tm.assert_series_equal(df.groupby(s).ngroup(), s.groupby(s).ngroup())
  89. def test_ngroup_dupe_index(self):
  90. df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
  91. g = df.groupby("A")
  92. sg = g.A
  93. expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
  94. tm.assert_series_equal(expected, g.ngroup())
  95. tm.assert_series_equal(expected, sg.ngroup())
  96. def test_ngroup_mi(self):
  97. mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
  98. df = DataFrame({"A": list("aaaba")}, index=mi)
  99. g = df.groupby("A")
  100. sg = g.A
  101. expected = Series([0, 0, 0, 1, 0], index=mi)
  102. tm.assert_series_equal(expected, g.ngroup())
  103. tm.assert_series_equal(expected, sg.ngroup())
  104. def test_ngroup_groupby_not_col(self):
  105. df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
  106. g = df.groupby([0, 0, 0, 1, 0])
  107. sg = g.A
  108. expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
  109. tm.assert_series_equal(expected, g.ngroup())
  110. tm.assert_series_equal(expected, sg.ngroup())
  111. def test_ngroup_descending(self):
  112. df = DataFrame(["a", "a", "b", "a", "b"], columns=["A"])
  113. g = df.groupby(["A"])
  114. ascending = Series([0, 0, 1, 0, 1])
  115. descending = Series([1, 1, 0, 1, 0])
  116. tm.assert_series_equal(descending, (g.ngroups - 1) - ascending)
  117. tm.assert_series_equal(ascending, g.ngroup(ascending=True))
  118. tm.assert_series_equal(descending, g.ngroup(ascending=False))
  119. def test_ngroup_matches_cumcount(self):
  120. # verify one manually-worked out case works
  121. df = DataFrame(
  122. [["a", "x"], ["a", "y"], ["b", "x"], ["a", "x"], ["b", "y"]],
  123. columns=["A", "X"],
  124. )
  125. g = df.groupby(["A", "X"])
  126. g_ngroup = g.ngroup()
  127. g_cumcount = g.cumcount()
  128. expected_ngroup = Series([0, 1, 2, 0, 3])
  129. expected_cumcount = Series([0, 0, 0, 1, 0])
  130. tm.assert_series_equal(g_ngroup, expected_ngroup)
  131. tm.assert_series_equal(g_cumcount, expected_cumcount)
  132. def test_ngroup_cumcount_pair(self):
  133. # brute force comparison for all small series
  134. for p in product(range(3), repeat=4):
  135. df = DataFrame({"a": p})
  136. g = df.groupby(["a"])
  137. order = sorted(set(p))
  138. ngroupd = [order.index(val) for val in p]
  139. cumcounted = [p[:i].count(val) for i, val in enumerate(p)]
  140. tm.assert_series_equal(g.ngroup(), Series(ngroupd))
  141. tm.assert_series_equal(g.cumcount(), Series(cumcounted))
  142. def test_ngroup_respects_groupby_order(self, sort):
  143. df = DataFrame({"a": np.random.default_rng(2).choice(list("abcdef"), 100)})
  144. g = df.groupby("a", sort=sort)
  145. df["group_id"] = -1
  146. df["group_index"] = -1
  147. for i, (_, group) in enumerate(g):
  148. df.loc[group.index, "group_id"] = i
  149. for j, ind in enumerate(group.index):
  150. df.loc[ind, "group_index"] = j
  151. tm.assert_series_equal(Series(df["group_id"].values), g.ngroup())
  152. tm.assert_series_equal(Series(df["group_index"].values), g.cumcount())
  153. @pytest.mark.parametrize(
  154. "datetimelike",
  155. [
  156. [Timestamp(f"2016-05-{i:02d} 20:09:25+00:00") for i in range(1, 4)],
  157. [Timestamp(f"2016-05-{i:02d} 20:09:25") for i in range(1, 4)],
  158. [Timestamp(f"2016-05-{i:02d} 20:09:25", tz="UTC") for i in range(1, 4)],
  159. [Timedelta(x, unit="h") for x in range(1, 4)],
  160. [Period(freq="2W", year=2017, month=x) for x in range(1, 4)],
  161. ],
  162. )
  163. def test_count_with_datetimelike(self, datetimelike):
  164. # test for #13393, where DataframeGroupBy.count() fails
  165. # when counting a datetimelike column.
  166. df = DataFrame({"x": ["a", "a", "b"], "y": datetimelike})
  167. res = df.groupby("x").count()
  168. expected = DataFrame({"y": [2, 1]}, index=["a", "b"])
  169. expected.index.name = "x"
  170. tm.assert_frame_equal(expected, res)
  171. def test_count_with_only_nans_in_first_group(self):
  172. # GH21956
  173. df = DataFrame({"A": [np.nan, np.nan], "B": ["a", "b"], "C": [1, 2]})
  174. result = df.groupby(["A", "B"]).C.count()
  175. mi = MultiIndex(levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"])
  176. expected = Series([], index=mi, dtype=np.int64, name="C")
  177. tm.assert_series_equal(result, expected, check_index_type=False)
  178. def test_count_groupby_column_with_nan_in_groupby_column(self):
  179. # https://github.com/pandas-dev/pandas/issues/32841
  180. df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.nan, 3, 0]})
  181. res = df.groupby(["B"]).count()
  182. expected = DataFrame(
  183. index=Index([0.0, 3.0, 4.0, 5.0], name="B"), data={"A": [1, 1, 1, 1]}
  184. )
  185. tm.assert_frame_equal(expected, res)
  186. def test_groupby_count_dateparseerror(self):
  187. dr = date_range(start="1/1/2012", freq="5min", periods=10)
  188. # BAD Example, datetimes first
  189. ser = Series(np.arange(10), index=[dr, np.arange(10)])
  190. grouped = ser.groupby(lambda x: x[1] % 2 == 0)
  191. result = grouped.count()
  192. ser = Series(np.arange(10), index=[np.arange(10), dr])
  193. grouped = ser.groupby(lambda x: x[0] % 2 == 0)
  194. expected = grouped.count()
  195. tm.assert_series_equal(result, expected)
  196. def test_groupby_timedelta_cython_count():
  197. df = DataFrame(
  198. {"g": list("ab" * 2), "delta": np.arange(4).astype("timedelta64[ns]")}
  199. )
  200. expected = Series([2, 2], index=Index(["a", "b"], name="g"), name="delta")
  201. result = df.groupby("g").delta.count()
  202. tm.assert_series_equal(expected, result)
  203. def test_count():
  204. n = 1 << 15
  205. dr = date_range("2015-08-30", periods=n // 10, freq="min")
  206. df = DataFrame(
  207. {
  208. "1st": np.random.default_rng(2).choice(list(ascii_lowercase), n),
  209. "2nd": np.random.default_rng(2).integers(0, 5, n),
  210. "3rd": np.random.default_rng(2).standard_normal(n).round(3),
  211. "4th": np.random.default_rng(2).integers(-10, 10, n),
  212. "5th": np.random.default_rng(2).choice(dr, n),
  213. "6th": np.random.default_rng(2).standard_normal(n).round(3),
  214. "7th": np.random.default_rng(2).standard_normal(n).round(3),
  215. "8th": np.random.default_rng(2).choice(dr, n)
  216. - np.random.default_rng(2).choice(dr, 1),
  217. "9th": np.random.default_rng(2).choice(list(ascii_lowercase), n),
  218. }
  219. )
  220. for col in df.columns.drop(["1st", "2nd", "4th"]):
  221. df.loc[np.random.default_rng(2).choice(n, n // 10), col] = np.nan
  222. df["9th"] = df["9th"].astype("category")
  223. for key in ["1st", "2nd", ["1st", "2nd"]]:
  224. left = df.groupby(key).count()
  225. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  226. with tm.assert_produces_warning(FutureWarning, match=msg):
  227. right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1)
  228. tm.assert_frame_equal(left, right)
  229. def test_count_non_nulls():
  230. # GH#5610
  231. # count counts non-nulls
  232. df = DataFrame(
  233. [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, np.nan]],
  234. columns=["A", "B", "C"],
  235. )
  236. count_as = df.groupby("A").count()
  237. count_not_as = df.groupby("A", as_index=False).count()
  238. expected = DataFrame([[1, 2], [0, 0]], columns=["B", "C"], index=[1, 3])
  239. expected.index.name = "A"
  240. tm.assert_frame_equal(count_not_as, expected.reset_index())
  241. tm.assert_frame_equal(count_as, expected)
  242. count_B = df.groupby("A")["B"].count()
  243. tm.assert_series_equal(count_B, expected["B"])
  244. def test_count_object():
  245. df = DataFrame({"a": ["a"] * 3 + ["b"] * 3, "c": [2] * 3 + [3] * 3})
  246. result = df.groupby("c").a.count()
  247. expected = Series([3, 3], index=Index([2, 3], name="c"), name="a")
  248. tm.assert_series_equal(result, expected)
  249. df = DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3})
  250. result = df.groupby("c").a.count()
  251. expected = Series([1, 3], index=Index([2, 3], name="c"), name="a")
  252. tm.assert_series_equal(result, expected)
  253. def test_count_cross_type():
  254. # GH8169
  255. # Set float64 dtype to avoid upcast when setting nan below
  256. vals = np.hstack(
  257. (
  258. np.random.default_rng(2).integers(0, 5, (100, 2)),
  259. np.random.default_rng(2).integers(0, 2, (100, 2)),
  260. )
  261. ).astype("float64")
  262. df = DataFrame(vals, columns=["a", "b", "c", "d"])
  263. df[df == 2] = np.nan
  264. expected = df.groupby(["c", "d"]).count()
  265. for t in ["float32", "object"]:
  266. df["a"] = df["a"].astype(t)
  267. df["b"] = df["b"].astype(t)
  268. result = df.groupby(["c", "d"]).count()
  269. tm.assert_frame_equal(result, expected)
  270. def test_lower_int_prec_count():
  271. df = DataFrame(
  272. {
  273. "a": np.array([0, 1, 2, 100], np.int8),
  274. "b": np.array([1, 2, 3, 6], np.uint32),
  275. "c": np.array([4, 5, 6, 8], np.int16),
  276. "grp": list("ab" * 2),
  277. }
  278. )
  279. result = df.groupby("grp").count()
  280. expected = DataFrame(
  281. {"a": [2, 2], "b": [2, 2], "c": [2, 2]}, index=Index(list("ab"), name="grp")
  282. )
  283. tm.assert_frame_equal(result, expected)
  284. def test_count_uses_size_on_exception():
  285. class RaisingObjectException(Exception):
  286. pass
  287. class RaisingObject:
  288. def __init__(self, msg="I will raise inside Cython") -> None:
  289. super().__init__()
  290. self.msg = msg
  291. def __eq__(self, other):
  292. # gets called in Cython to check that raising calls the method
  293. raise RaisingObjectException(self.msg)
  294. df = DataFrame({"a": [RaisingObject() for _ in range(4)], "grp": list("ab" * 2)})
  295. result = df.groupby("grp").count()
  296. expected = DataFrame({"a": [2, 2]}, index=Index(list("ab"), name="grp"))
  297. tm.assert_frame_equal(result, expected)
  298. def test_count_arrow_string_array(any_string_dtype):
  299. # GH#54751
  300. pytest.importorskip("pyarrow")
  301. df = DataFrame(
  302. {"a": [1, 2, 3], "b": Series(["a", "b", "a"], dtype=any_string_dtype)}
  303. )
  304. result = df.groupby("a").count()
  305. expected = DataFrame({"b": 1}, index=Index([1, 2, 3], name="a"))
  306. tm.assert_frame_equal(result, expected)