# groupby.py (6.3 KB)
  1. import re
  2. import pytest
  3. from pandas.core.dtypes.common import (
  4. is_bool_dtype,
  5. is_numeric_dtype,
  6. is_object_dtype,
  7. is_string_dtype,
  8. )
  9. import pandas as pd
  10. import pandas._testing as tm
  11. @pytest.mark.filterwarnings(
  12. "ignore:The default of observed=False is deprecated:FutureWarning"
  13. )
  14. class BaseGroupbyTests:
  15. """Groupby-specific tests."""
  16. def test_grouping_grouper(self, data_for_grouping):
  17. df = pd.DataFrame(
  18. {
  19. "A": pd.Series(
  20. ["B", "B", None, None, "A", "A", "B", "C"], dtype=object
  21. ),
  22. "B": data_for_grouping,
  23. }
  24. )
  25. gr1 = df.groupby("A")._grouper.groupings[0]
  26. gr2 = df.groupby("B")._grouper.groupings[0]
  27. tm.assert_numpy_array_equal(gr1.grouping_vector, df.A.values)
  28. tm.assert_extension_array_equal(gr2.grouping_vector, data_for_grouping)
  29. @pytest.mark.parametrize("as_index", [True, False])
  30. def test_groupby_extension_agg(self, as_index, data_for_grouping):
  31. df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
  32. is_bool = data_for_grouping.dtype._is_boolean
  33. if is_bool:
  34. # only 2 unique values, and the final entry has c==b
  35. # (see data_for_grouping docstring)
  36. df = df.iloc[:-1]
  37. result = df.groupby("B", as_index=as_index).A.mean()
  38. _, uniques = pd.factorize(data_for_grouping, sort=True)
  39. exp_vals = [3.0, 1.0, 4.0]
  40. if is_bool:
  41. exp_vals = exp_vals[:-1]
  42. if as_index:
  43. index = pd.Index(uniques, name="B")
  44. expected = pd.Series(exp_vals, index=index, name="A")
  45. tm.assert_series_equal(result, expected)
  46. else:
  47. expected = pd.DataFrame({"B": uniques, "A": exp_vals})
  48. tm.assert_frame_equal(result, expected)
  49. def test_groupby_agg_extension(self, data_for_grouping):
  50. # GH#38980 groupby agg on extension type fails for non-numeric types
  51. df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
  52. expected = df.iloc[[0, 2, 4, 7]]
  53. expected = expected.set_index("A")
  54. result = df.groupby("A").agg({"B": "first"})
  55. tm.assert_frame_equal(result, expected)
  56. result = df.groupby("A").agg("first")
  57. tm.assert_frame_equal(result, expected)
  58. result = df.groupby("A").first()
  59. tm.assert_frame_equal(result, expected)
  60. def test_groupby_extension_no_sort(self, data_for_grouping):
  61. df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
  62. is_bool = data_for_grouping.dtype._is_boolean
  63. if is_bool:
  64. # only 2 unique values, and the final entry has c==b
  65. # (see data_for_grouping docstring)
  66. df = df.iloc[:-1]
  67. result = df.groupby("B", sort=False).A.mean()
  68. _, index = pd.factorize(data_for_grouping, sort=False)
  69. index = pd.Index(index, name="B")
  70. exp_vals = [1.0, 3.0, 4.0]
  71. if is_bool:
  72. exp_vals = exp_vals[:-1]
  73. expected = pd.Series(exp_vals, index=index, name="A")
  74. tm.assert_series_equal(result, expected)
  75. def test_groupby_extension_transform(self, data_for_grouping):
  76. is_bool = data_for_grouping.dtype._is_boolean
  77. valid = data_for_grouping[~data_for_grouping.isna()]
  78. df = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4], "B": valid})
  79. is_bool = data_for_grouping.dtype._is_boolean
  80. if is_bool:
  81. # only 2 unique values, and the final entry has c==b
  82. # (see data_for_grouping docstring)
  83. df = df.iloc[:-1]
  84. result = df.groupby("B").A.transform(len)
  85. expected = pd.Series([3, 3, 2, 2, 3, 1], name="A")
  86. if is_bool:
  87. expected = expected[:-1]
  88. tm.assert_series_equal(result, expected)
  89. def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
  90. df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
  91. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  92. with tm.assert_produces_warning(FutureWarning, match=msg):
  93. df.groupby("B", group_keys=False, observed=False).apply(groupby_apply_op)
  94. df.groupby("B", group_keys=False, observed=False).A.apply(groupby_apply_op)
  95. msg = "DataFrameGroupBy.apply operated on the grouping columns"
  96. with tm.assert_produces_warning(FutureWarning, match=msg):
  97. df.groupby("A", group_keys=False, observed=False).apply(groupby_apply_op)
  98. df.groupby("A", group_keys=False, observed=False).B.apply(groupby_apply_op)
  99. def test_groupby_apply_identity(self, data_for_grouping):
  100. df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
  101. result = df.groupby("A").B.apply(lambda x: x.array)
  102. expected = pd.Series(
  103. [
  104. df.B.iloc[[0, 1, 6]].array,
  105. df.B.iloc[[2, 3]].array,
  106. df.B.iloc[[4, 5]].array,
  107. df.B.iloc[[7]].array,
  108. ],
  109. index=pd.Index([1, 2, 3, 4], name="A"),
  110. name="B",
  111. )
  112. tm.assert_series_equal(result, expected)
  113. def test_in_numeric_groupby(self, data_for_grouping):
  114. df = pd.DataFrame(
  115. {
  116. "A": [1, 1, 2, 2, 3, 3, 1, 4],
  117. "B": data_for_grouping,
  118. "C": [1, 1, 1, 1, 1, 1, 1, 1],
  119. }
  120. )
  121. dtype = data_for_grouping.dtype
  122. if (
  123. is_numeric_dtype(dtype)
  124. or is_bool_dtype(dtype)
  125. or dtype.name == "decimal"
  126. or is_string_dtype(dtype)
  127. or is_object_dtype(dtype)
  128. or dtype.kind == "m" # in particular duration[*][pyarrow]
  129. ):
  130. expected = pd.Index(["B", "C"])
  131. result = df.groupby("A").sum().columns
  132. else:
  133. expected = pd.Index(["C"])
  134. msg = "|".join(
  135. [
  136. # period/datetime
  137. "does not support sum operations",
  138. # all others
  139. re.escape(f"agg function failed [how->sum,dtype->{dtype}"),
  140. ]
  141. )
  142. with pytest.raises(TypeError, match=msg):
  143. df.groupby("A").sum()
  144. result = df.groupby("A").sum(numeric_only=True).columns
  145. tm.assert_index_equal(result, expected)