# test_describe.py: tests for describe() on pandas groupby objects
import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    Series,
    Timestamp,
    date_range,
)
import pandas._testing as tm
  13. def test_apply_describe_bug(multiindex_dataframe_random_data):
  14. grouped = multiindex_dataframe_random_data.groupby(level="first")
  15. grouped.describe() # it works!
  16. def test_series_describe_multikey():
  17. ts = Series(
  18. np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
  19. )
  20. grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
  21. result = grouped.describe()
  22. tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False)
  23. tm.assert_series_equal(result["std"], grouped.std(), check_names=False)
  24. tm.assert_series_equal(result["min"], grouped.min(), check_names=False)
  25. def test_series_describe_single():
  26. ts = Series(
  27. np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
  28. )
  29. grouped = ts.groupby(lambda x: x.month)
  30. result = grouped.apply(lambda x: x.describe())
  31. expected = grouped.describe().stack(future_stack=True)
  32. tm.assert_series_equal(result, expected)
  33. @pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]])
  34. def test_series_describe_as_index(as_index, keys):
  35. # GH#49256
  36. df = DataFrame(
  37. {
  38. "key1": ["one", "two", "two", "three", "two"],
  39. "key2": ["one", "two", "two", "three", "two"],
  40. "foo2": [1, 2, 4, 4, 6],
  41. }
  42. )
  43. gb = df.groupby(keys, as_index=as_index)["foo2"]
  44. result = gb.describe()
  45. expected = DataFrame(
  46. {
  47. "key1": ["one", "three", "two"],
  48. "count": [1.0, 1.0, 3.0],
  49. "mean": [1.0, 4.0, 4.0],
  50. "std": [np.nan, np.nan, 2.0],
  51. "min": [1.0, 4.0, 2.0],
  52. "25%": [1.0, 4.0, 3.0],
  53. "50%": [1.0, 4.0, 4.0],
  54. "75%": [1.0, 4.0, 5.0],
  55. "max": [1.0, 4.0, 6.0],
  56. }
  57. )
  58. if len(keys) == 2:
  59. expected.insert(1, "key2", expected["key1"])
  60. if as_index:
  61. expected = expected.set_index(keys)
  62. tm.assert_frame_equal(result, expected)
  63. def test_frame_describe_multikey(tsframe, using_infer_string):
  64. grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
  65. result = grouped.describe()
  66. desc_groups = []
  67. for col in tsframe:
  68. group = grouped[col].describe()
  69. # GH 17464 - Remove duplicate MultiIndex levels
  70. group_col = MultiIndex(
  71. levels=[Index([col], dtype=tsframe.columns.dtype), group.columns],
  72. codes=[[0] * len(group.columns), range(len(group.columns))],
  73. )
  74. group = DataFrame(group.values, columns=group_col, index=group.index)
  75. desc_groups.append(group)
  76. expected = pd.concat(desc_groups, axis=1)
  77. tm.assert_frame_equal(result, expected)
  78. # remainder of the tests fails with string dtype but is testing deprecated behaviour
  79. if using_infer_string:
  80. return
  81. msg = "DataFrame.groupby with axis=1 is deprecated"
  82. with tm.assert_produces_warning(FutureWarning, match=msg):
  83. groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1)
  84. result = groupedT.describe()
  85. expected = tsframe.describe().T
  86. # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/
  87. expected.index = MultiIndex(
  88. levels=[[0, 1], expected.index],
  89. codes=[[0, 0, 1, 1], range(len(expected.index))],
  90. )
  91. tm.assert_frame_equal(result, expected)
  92. def test_frame_describe_tupleindex():
  93. # GH 14848 - regression from 0.19.0 to 0.19.1
  94. df1 = DataFrame(
  95. {
  96. "x": [1, 2, 3, 4, 5] * 3,
  97. "y": [10, 20, 30, 40, 50] * 3,
  98. "z": [100, 200, 300, 400, 500] * 3,
  99. }
  100. )
  101. df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
  102. df2 = df1.rename(columns={"k": "key"})
  103. msg = "Names should be list-like for a MultiIndex"
  104. with pytest.raises(ValueError, match=msg):
  105. df1.groupby("k").describe()
  106. with pytest.raises(ValueError, match=msg):
  107. df2.groupby("key").describe()
  108. def test_frame_describe_unstacked_format():
  109. # GH 4792
  110. prices = {
  111. Timestamp("2011-01-06 10:59:05", tz=None): 24990,
  112. Timestamp("2011-01-06 12:43:33", tz=None): 25499,
  113. Timestamp("2011-01-06 12:54:09", tz=None): 25499,
  114. }
  115. volumes = {
  116. Timestamp("2011-01-06 10:59:05", tz=None): 1500000000,
  117. Timestamp("2011-01-06 12:43:33", tz=None): 5000000000,
  118. Timestamp("2011-01-06 12:54:09", tz=None): 100000000,
  119. }
  120. df = DataFrame({"PRICE": prices, "VOLUME": volumes})
  121. result = df.groupby("PRICE").VOLUME.describe()
  122. data = [
  123. df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
  124. df[df.PRICE == 25499].VOLUME.describe().values.tolist(),
  125. ]
  126. expected = DataFrame(
  127. data,
  128. index=Index([24990, 25499], name="PRICE"),
  129. columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
  130. )
  131. tm.assert_frame_equal(result, expected)
  132. @pytest.mark.filterwarnings(
  133. "ignore:"
  134. "indexing past lexsort depth may impact performance:"
  135. "pandas.errors.PerformanceWarning"
  136. )
  137. @pytest.mark.parametrize("as_index", [True, False])
  138. @pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
  139. def test_describe_with_duplicate_output_column_names(as_index, keys):
  140. # GH 35314
  141. df = DataFrame(
  142. {
  143. "a1": [99, 99, 99, 88, 88, 88],
  144. "a2": [99, 99, 99, 88, 88, 88],
  145. "b": [1, 2, 3, 4, 5, 6],
  146. "c": [10, 20, 30, 40, 50, 60],
  147. },
  148. columns=["a1", "a2", "b", "b"],
  149. copy=False,
  150. )
  151. if keys == ["a1"]:
  152. df = df.drop(columns="a2")
  153. expected = (
  154. DataFrame.from_records(
  155. [
  156. ("b", "count", 3.0, 3.0),
  157. ("b", "mean", 5.0, 2.0),
  158. ("b", "std", 1.0, 1.0),
  159. ("b", "min", 4.0, 1.0),
  160. ("b", "25%", 4.5, 1.5),
  161. ("b", "50%", 5.0, 2.0),
  162. ("b", "75%", 5.5, 2.5),
  163. ("b", "max", 6.0, 3.0),
  164. ("b", "count", 3.0, 3.0),
  165. ("b", "mean", 5.0, 2.0),
  166. ("b", "std", 1.0, 1.0),
  167. ("b", "min", 4.0, 1.0),
  168. ("b", "25%", 4.5, 1.5),
  169. ("b", "50%", 5.0, 2.0),
  170. ("b", "75%", 5.5, 2.5),
  171. ("b", "max", 6.0, 3.0),
  172. ],
  173. )
  174. .set_index([0, 1])
  175. .T
  176. )
  177. expected.columns.names = [None, None]
  178. if len(keys) == 2:
  179. expected.index = MultiIndex(
  180. levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"]
  181. )
  182. else:
  183. expected.index = Index([88, 99], name="a1")
  184. if not as_index:
  185. expected = expected.reset_index()
  186. result = df.groupby(keys, as_index=as_index).describe()
  187. tm.assert_frame_equal(result, expected)
  188. def test_describe_duplicate_columns():
  189. # GH#50806
  190. df = DataFrame([[0, 1, 2, 3]])
  191. df.columns = [0, 1, 2, 0]
  192. gb = df.groupby(df[1])
  193. result = gb.describe(percentiles=[])
  194. columns = ["count", "mean", "std", "min", "50%", "max"]
  195. frames = [
  196. DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns)
  197. for val in (0.0, 2.0, 3.0)
  198. ]
  199. expected = pd.concat(frames, axis=1)
  200. expected.columns = MultiIndex(
  201. levels=[[0, 2], columns],
  202. codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))],
  203. )
  204. expected.index.names = [1]
  205. tm.assert_frame_equal(result, expected)
  206. class TestGroupByNonCythonPaths:
  207. # GH#5610 non-cython calls should not include the grouper
  208. # Tests for code not expected to go through cython paths.
  209. @pytest.fixture
  210. def df(self):
  211. df = DataFrame(
  212. [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
  213. columns=["A", "B", "C"],
  214. )
  215. return df
  216. @pytest.fixture
  217. def gb(self, df):
  218. gb = df.groupby("A")
  219. return gb
  220. @pytest.fixture
  221. def gni(self, df):
  222. gni = df.groupby("A", as_index=False)
  223. return gni
  224. def test_describe(self, df, gb, gni):
  225. # describe
  226. expected_index = Index([1, 3], name="A")
  227. expected_col = MultiIndex(
  228. levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]],
  229. codes=[[0] * 8, list(range(8))],
  230. )
  231. expected = DataFrame(
  232. [
  233. [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
  234. [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
  235. ],
  236. index=expected_index,
  237. columns=expected_col,
  238. )
  239. result = gb.describe()
  240. tm.assert_frame_equal(result, expected)
  241. expected = expected.reset_index()
  242. result = gni.describe()
  243. tm.assert_frame_equal(result, expected)
  244. @pytest.mark.parametrize("dtype", [int, float, object])
  245. @pytest.mark.parametrize(
  246. "kwargs",
  247. [
  248. {"percentiles": [0.10, 0.20, 0.30], "include": "all", "exclude": None},
  249. {"percentiles": [0.10, 0.20, 0.30], "include": None, "exclude": ["int"]},
  250. {"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None},
  251. ],
  252. )
  253. def test_groupby_empty_dataset(dtype, kwargs):
  254. # GH#41575
  255. df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype)
  256. df["B"] = df["B"].astype(int)
  257. df["C"] = df["C"].astype(float)
  258. result = df.iloc[:0].groupby("A").describe(**kwargs)
  259. expected = df.groupby("A").describe(**kwargs).reset_index(drop=True).iloc[:0]
  260. tm.assert_frame_equal(result, expected)
  261. result = df.iloc[:0].groupby("A").B.describe(**kwargs)
  262. expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0]
  263. expected.index = Index([], dtype=df.columns.dtype)
  264. tm.assert_frame_equal(result, expected)