# test_multilevel.py (12 KB)
  1. import datetime
  2. import numpy as np
  3. import pytest
  4. import pandas as pd
  5. from pandas import (
  6. DataFrame,
  7. MultiIndex,
  8. Series,
  9. )
  10. import pandas._testing as tm
  11. class TestMultiLevel:
  12. def test_reindex_level(self, multiindex_year_month_day_dataframe_random_data):
  13. # axis=0
  14. ymd = multiindex_year_month_day_dataframe_random_data
  15. month_sums = ymd.groupby("month").sum()
  16. result = month_sums.reindex(ymd.index, level=1)
  17. expected = ymd.groupby(level="month").transform("sum")
  18. tm.assert_frame_equal(result, expected)
  19. # Series
  20. result = month_sums["A"].reindex(ymd.index, level=1)
  21. expected = ymd["A"].groupby(level="month").transform("sum")
  22. tm.assert_series_equal(result, expected, check_names=False)
  23. # axis=1
  24. msg = "DataFrame.groupby with axis=1 is deprecated"
  25. with tm.assert_produces_warning(FutureWarning, match=msg):
  26. gb = ymd.T.groupby("month", axis=1)
  27. month_sums = gb.sum()
  28. result = month_sums.reindex(columns=ymd.index, level=1)
  29. expected = ymd.groupby(level="month").transform("sum").T
  30. tm.assert_frame_equal(result, expected)
  31. def test_reindex(self, multiindex_dataframe_random_data):
  32. frame = multiindex_dataframe_random_data
  33. expected = frame.iloc[[0, 3]]
  34. reindexed = frame.loc[[("foo", "one"), ("bar", "one")]]
  35. tm.assert_frame_equal(reindexed, expected)
  36. def test_reindex_preserve_levels(
  37. self, multiindex_year_month_day_dataframe_random_data, using_copy_on_write
  38. ):
  39. ymd = multiindex_year_month_day_dataframe_random_data
  40. new_index = ymd.index[::10]
  41. chunk = ymd.reindex(new_index)
  42. if using_copy_on_write:
  43. assert chunk.index.is_(new_index)
  44. else:
  45. assert chunk.index is new_index
  46. chunk = ymd.loc[new_index]
  47. assert chunk.index.equals(new_index)
  48. ymdT = ymd.T
  49. chunk = ymdT.reindex(columns=new_index)
  50. if using_copy_on_write:
  51. assert chunk.columns.is_(new_index)
  52. else:
  53. assert chunk.columns is new_index
  54. chunk = ymdT.loc[:, new_index]
  55. assert chunk.columns.equals(new_index)
  56. def test_groupby_transform(self, multiindex_dataframe_random_data):
  57. frame = multiindex_dataframe_random_data
  58. s = frame["A"]
  59. grouper = s.index.get_level_values(0)
  60. grouped = s.groupby(grouper, group_keys=False)
  61. applied = grouped.apply(lambda x: x * 2)
  62. expected = grouped.transform(lambda x: x * 2)
  63. result = applied.reindex(expected.index)
  64. tm.assert_series_equal(result, expected, check_names=False)
  65. def test_groupby_corner(self):
  66. midx = MultiIndex(
  67. levels=[["foo"], ["bar"], ["baz"]],
  68. codes=[[0], [0], [0]],
  69. names=["one", "two", "three"],
  70. )
  71. df = DataFrame(
  72. [np.random.default_rng(2).random(4)],
  73. columns=["a", "b", "c", "d"],
  74. index=midx,
  75. )
  76. # should work
  77. df.groupby(level="three")
  78. def test_groupby_level_no_obs(self):
  79. # #1697
  80. midx = MultiIndex.from_tuples(
  81. [
  82. ("f1", "s1"),
  83. ("f1", "s2"),
  84. ("f2", "s1"),
  85. ("f2", "s2"),
  86. ("f3", "s1"),
  87. ("f3", "s2"),
  88. ]
  89. )
  90. df = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], columns=midx)
  91. df1 = df.loc(axis=1)[df.columns.map(lambda u: u[0] in ["f2", "f3"])]
  92. msg = "DataFrame.groupby with axis=1 is deprecated"
  93. with tm.assert_produces_warning(FutureWarning, match=msg):
  94. grouped = df1.groupby(axis=1, level=0)
  95. result = grouped.sum()
  96. assert (result.columns == ["f2", "f3"]).all()
  97. def test_setitem_with_expansion_multiindex_columns(
  98. self, multiindex_year_month_day_dataframe_random_data
  99. ):
  100. ymd = multiindex_year_month_day_dataframe_random_data
  101. df = ymd[:5].T
  102. df[2000, 1, 10] = df[2000, 1, 7]
  103. assert isinstance(df.columns, MultiIndex)
  104. assert (df[2000, 1, 10] == df[2000, 1, 7]).all()
  105. def test_alignment(self):
  106. x = Series(
  107. data=[1, 2, 3], index=MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3)])
  108. )
  109. y = Series(
  110. data=[4, 5, 6], index=MultiIndex.from_tuples([("Z", 1), ("Z", 2), ("B", 3)])
  111. )
  112. res = x - y
  113. exp_index = x.index.union(y.index)
  114. exp = x.reindex(exp_index) - y.reindex(exp_index)
  115. tm.assert_series_equal(res, exp)
  116. # hit non-monotonic code path
  117. res = x[::-1] - y[::-1]
  118. exp_index = x.index.union(y.index)
  119. exp = x.reindex(exp_index) - y.reindex(exp_index)
  120. tm.assert_series_equal(res, exp)
  121. def test_groupby_multilevel(self, multiindex_year_month_day_dataframe_random_data):
  122. ymd = multiindex_year_month_day_dataframe_random_data
  123. result = ymd.groupby(level=[0, 1]).mean()
  124. k1 = ymd.index.get_level_values(0)
  125. k2 = ymd.index.get_level_values(1)
  126. expected = ymd.groupby([k1, k2]).mean()
  127. # TODO groupby with level_values drops names
  128. tm.assert_frame_equal(result, expected, check_names=False)
  129. assert result.index.names == ymd.index.names[:2]
  130. result2 = ymd.groupby(level=ymd.index.names[:2]).mean()
  131. tm.assert_frame_equal(result, result2)
  132. def test_multilevel_consolidate(self):
  133. index = MultiIndex.from_tuples(
  134. [("foo", "one"), ("foo", "two"), ("bar", "one"), ("bar", "two")]
  135. )
  136. df = DataFrame(
  137. np.random.default_rng(2).standard_normal((4, 4)), index=index, columns=index
  138. )
  139. df["Totals", ""] = df.sum(1)
  140. df = df._consolidate()
  141. def test_level_with_tuples(self):
  142. index = MultiIndex(
  143. levels=[[("foo", "bar", 0), ("foo", "baz", 0), ("foo", "qux", 0)], [0, 1]],
  144. codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
  145. )
  146. series = Series(np.random.default_rng(2).standard_normal(6), index=index)
  147. frame = DataFrame(np.random.default_rng(2).standard_normal((6, 4)), index=index)
  148. result = series[("foo", "bar", 0)]
  149. result2 = series.loc[("foo", "bar", 0)]
  150. expected = series[:2]
  151. expected.index = expected.index.droplevel(0)
  152. tm.assert_series_equal(result, expected)
  153. tm.assert_series_equal(result2, expected)
  154. with pytest.raises(KeyError, match=r"^\(\('foo', 'bar', 0\), 2\)$"):
  155. series[("foo", "bar", 0), 2]
  156. result = frame.loc[("foo", "bar", 0)]
  157. result2 = frame.xs(("foo", "bar", 0))
  158. expected = frame[:2]
  159. expected.index = expected.index.droplevel(0)
  160. tm.assert_frame_equal(result, expected)
  161. tm.assert_frame_equal(result2, expected)
  162. index = MultiIndex(
  163. levels=[[("foo", "bar"), ("foo", "baz"), ("foo", "qux")], [0, 1]],
  164. codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
  165. )
  166. series = Series(np.random.default_rng(2).standard_normal(6), index=index)
  167. frame = DataFrame(np.random.default_rng(2).standard_normal((6, 4)), index=index)
  168. result = series[("foo", "bar")]
  169. result2 = series.loc[("foo", "bar")]
  170. expected = series[:2]
  171. expected.index = expected.index.droplevel(0)
  172. tm.assert_series_equal(result, expected)
  173. tm.assert_series_equal(result2, expected)
  174. result = frame.loc[("foo", "bar")]
  175. result2 = frame.xs(("foo", "bar"))
  176. expected = frame[:2]
  177. expected.index = expected.index.droplevel(0)
  178. tm.assert_frame_equal(result, expected)
  179. tm.assert_frame_equal(result2, expected)
  180. def test_reindex_level_partial_selection(self, multiindex_dataframe_random_data):
  181. frame = multiindex_dataframe_random_data
  182. result = frame.reindex(["foo", "qux"], level=0)
  183. expected = frame.iloc[[0, 1, 2, 7, 8, 9]]
  184. tm.assert_frame_equal(result, expected)
  185. result = frame.T.reindex(["foo", "qux"], axis=1, level=0)
  186. tm.assert_frame_equal(result, expected.T)
  187. result = frame.loc[["foo", "qux"]]
  188. tm.assert_frame_equal(result, expected)
  189. result = frame["A"].loc[["foo", "qux"]]
  190. tm.assert_series_equal(result, expected["A"])
  191. result = frame.T.loc[:, ["foo", "qux"]]
  192. tm.assert_frame_equal(result, expected.T)
  193. @pytest.mark.parametrize("d", [4, "d"])
  194. def test_empty_frame_groupby_dtypes_consistency(self, d):
  195. # GH 20888
  196. group_keys = ["a", "b", "c"]
  197. df = DataFrame({"a": [1], "b": [2], "c": [3], "d": [d]})
  198. g = df[df.a == 2].groupby(group_keys)
  199. result = g.first().index
  200. expected = MultiIndex(
  201. levels=[[1], [2], [3]], codes=[[], [], []], names=["a", "b", "c"]
  202. )
  203. tm.assert_index_equal(result, expected)
  204. def test_duplicate_groupby_issues(self):
  205. idx_tp = [
  206. ("600809", "20061231"),
  207. ("600809", "20070331"),
  208. ("600809", "20070630"),
  209. ("600809", "20070331"),
  210. ]
  211. dt = ["demo", "demo", "demo", "demo"]
  212. idx = MultiIndex.from_tuples(idx_tp, names=["STK_ID", "RPT_Date"])
  213. s = Series(dt, index=idx)
  214. result = s.groupby(s.index).first()
  215. assert len(result) == 3
  216. def test_subsets_multiindex_dtype(self):
  217. # GH 20757
  218. data = [["x", 1]]
  219. columns = [("a", "b", np.nan), ("a", "c", 0.0)]
  220. df = DataFrame(data, columns=MultiIndex.from_tuples(columns))
  221. expected = df.dtypes.a.b
  222. result = df.a.b.dtypes
  223. tm.assert_series_equal(result, expected)
  224. def test_datetime_object_multiindex(self):
  225. data_dic = {
  226. (0, datetime.date(2018, 3, 3)): {"A": 1, "B": 10},
  227. (0, datetime.date(2018, 3, 4)): {"A": 2, "B": 11},
  228. (1, datetime.date(2018, 3, 3)): {"A": 3, "B": 12},
  229. (1, datetime.date(2018, 3, 4)): {"A": 4, "B": 13},
  230. }
  231. result = DataFrame.from_dict(data_dic, orient="index")
  232. data = {"A": [1, 2, 3, 4], "B": [10, 11, 12, 13]}
  233. index = [
  234. [0, 0, 1, 1],
  235. [
  236. datetime.date(2018, 3, 3),
  237. datetime.date(2018, 3, 4),
  238. datetime.date(2018, 3, 3),
  239. datetime.date(2018, 3, 4),
  240. ],
  241. ]
  242. expected = DataFrame(data=data, index=index)
  243. tm.assert_frame_equal(result, expected)
  244. def test_multiindex_with_na(self):
  245. df = DataFrame(
  246. [
  247. ["A", np.nan, 1.23, 4.56],
  248. ["A", "G", 1.23, 4.56],
  249. ["A", "D", 9.87, 10.54],
  250. ],
  251. columns=["pivot_0", "pivot_1", "col_1", "col_2"],
  252. ).set_index(["pivot_0", "pivot_1"])
  253. df.at[("A", "F"), "col_2"] = 0.0
  254. expected = DataFrame(
  255. [
  256. ["A", np.nan, 1.23, 4.56],
  257. ["A", "G", 1.23, 4.56],
  258. ["A", "D", 9.87, 10.54],
  259. ["A", "F", np.nan, 0.0],
  260. ],
  261. columns=["pivot_0", "pivot_1", "col_1", "col_2"],
  262. ).set_index(["pivot_0", "pivot_1"])
  263. tm.assert_frame_equal(df, expected)
  264. class TestSorted:
  265. """everything you wanted to test about sorting"""
  266. def test_sort_non_lexsorted(self):
  267. # degenerate case where we sort but don't
  268. # have a satisfying result :<
  269. # GH 15797
  270. idx = MultiIndex(
  271. [["A", "B", "C"], ["c", "b", "a"]], [[0, 1, 2, 0, 1, 2], [0, 2, 1, 1, 0, 2]]
  272. )
  273. df = DataFrame({"col": range(len(idx))}, index=idx, dtype="int64")
  274. assert df.index.is_monotonic_increasing is False
  275. sorted = df.sort_index()
  276. assert sorted.index.is_monotonic_increasing is True
  277. expected = DataFrame(
  278. {"col": [1, 4, 5, 2]},
  279. index=MultiIndex.from_tuples(
  280. [("B", "a"), ("B", "c"), ("C", "a"), ("C", "b")]
  281. ),
  282. dtype="int64",
  283. )
  284. result = sorted.loc[pd.IndexSlice["B":"C", "a":"c"], :]
  285. tm.assert_frame_equal(result, expected)