test_multilevel.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376
  1. import datetime
  2. import numpy as np
  3. import pytest
  4. import pandas as pd
  5. from pandas import (
  6. ArrowDtype,
  7. DataFrame,
  8. MultiIndex,
  9. Series,
  10. )
  11. import pandas._testing as tm
  12. class TestMultiLevel:
  13. def test_reindex_level(self, multiindex_year_month_day_dataframe_random_data):
  14. # axis=0
  15. ymd = multiindex_year_month_day_dataframe_random_data
  16. month_sums = ymd.groupby("month").sum()
  17. result = month_sums.reindex(ymd.index, level=1)
  18. expected = ymd.groupby(level="month").transform("sum")
  19. tm.assert_frame_equal(result, expected)
  20. # Series
  21. result = month_sums["A"].reindex(ymd.index, level=1)
  22. expected = ymd["A"].groupby(level="month").transform("sum")
  23. tm.assert_series_equal(result, expected, check_names=False)
  24. def test_reindex(self, multiindex_dataframe_random_data):
  25. frame = multiindex_dataframe_random_data
  26. expected = frame.iloc[[0, 3]]
  27. reindexed = frame.loc[[("foo", "one"), ("bar", "one")]]
  28. tm.assert_frame_equal(reindexed, expected)
  29. def test_reindex_preserve_levels(
  30. self, multiindex_year_month_day_dataframe_random_data
  31. ):
  32. ymd = multiindex_year_month_day_dataframe_random_data
  33. new_index = ymd.index[::10]
  34. chunk = ymd.reindex(new_index)
  35. assert chunk.index.is_(new_index)
  36. chunk = ymd.loc[new_index]
  37. assert chunk.index.equals(new_index)
  38. ymdT = ymd.T
  39. chunk = ymdT.reindex(columns=new_index)
  40. assert chunk.columns.is_(new_index)
  41. chunk = ymdT.loc[:, new_index]
  42. assert chunk.columns.equals(new_index)
  43. def test_groupby_transform(self, multiindex_dataframe_random_data):
  44. frame = multiindex_dataframe_random_data
  45. s = frame["A"]
  46. grouper = s.index.get_level_values(0)
  47. grouped = s.groupby(grouper, group_keys=False)
  48. applied = grouped.apply(lambda x: x * 2)
  49. expected = grouped.transform(lambda x: x * 2)
  50. result = applied.reindex(expected.index)
  51. tm.assert_series_equal(result, expected, check_names=False)
  52. def test_groupby_corner(self):
  53. midx = MultiIndex(
  54. levels=[["foo"], ["bar"], ["baz"]],
  55. codes=[[0], [0], [0]],
  56. names=["one", "two", "three"],
  57. )
  58. df = DataFrame(
  59. [np.random.default_rng(2).random(4)],
  60. columns=["a", "b", "c", "d"],
  61. index=midx,
  62. )
  63. # should work
  64. df.groupby(level="three")
  65. def test_setitem_with_expansion_multiindex_columns(
  66. self, multiindex_year_month_day_dataframe_random_data
  67. ):
  68. ymd = multiindex_year_month_day_dataframe_random_data
  69. df = ymd[:5].T
  70. df[2000, 1, 10] = df[2000, 1, 7]
  71. assert isinstance(df.columns, MultiIndex)
  72. assert (df[2000, 1, 10] == df[2000, 1, 7]).all()
  73. def test_alignment(self):
  74. x = Series(
  75. data=[1, 2, 3], index=MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3)])
  76. )
  77. y = Series(
  78. data=[4, 5, 6], index=MultiIndex.from_tuples([("Z", 1), ("Z", 2), ("B", 3)])
  79. )
  80. res = x - y
  81. exp_index = x.index.union(y.index)
  82. exp = x.reindex(exp_index) - y.reindex(exp_index)
  83. tm.assert_series_equal(res, exp)
  84. # hit non-monotonic code path
  85. res = x[::-1] - y[::-1]
  86. exp_index = x.index.union(y.index)
  87. exp = x.reindex(exp_index) - y.reindex(exp_index)
  88. tm.assert_series_equal(res, exp)
  89. def test_groupby_multilevel(self, multiindex_year_month_day_dataframe_random_data):
  90. ymd = multiindex_year_month_day_dataframe_random_data
  91. result = ymd.groupby(level=[0, 1]).mean()
  92. k1 = ymd.index.get_level_values(0)
  93. k2 = ymd.index.get_level_values(1)
  94. expected = ymd.groupby([k1, k2]).mean()
  95. tm.assert_frame_equal(result, expected)
  96. assert result.index.names == ymd.index.names[:2]
  97. result2 = ymd.groupby(level=ymd.index.names[:2]).mean()
  98. tm.assert_frame_equal(result, result2)
  99. def test_multilevel_consolidate(self):
  100. index = MultiIndex.from_tuples(
  101. [("foo", "one"), ("foo", "two"), ("bar", "one"), ("bar", "two")]
  102. )
  103. df = DataFrame(
  104. np.random.default_rng(2).standard_normal((4, 4)), index=index, columns=index
  105. )
  106. df["Totals", ""] = df.sum(axis=1)
  107. df = df._consolidate()
  108. def test_level_with_tuples(self):
  109. index = MultiIndex(
  110. levels=[[("foo", "bar", 0), ("foo", "baz", 0), ("foo", "qux", 0)], [0, 1]],
  111. codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
  112. )
  113. series = Series(np.random.default_rng(2).standard_normal(6), index=index)
  114. frame = DataFrame(np.random.default_rng(2).standard_normal((6, 4)), index=index)
  115. result = series[("foo", "bar", 0)]
  116. result2 = series.loc[("foo", "bar", 0)]
  117. expected = series[:2]
  118. expected.index = expected.index.droplevel(0)
  119. tm.assert_series_equal(result, expected)
  120. tm.assert_series_equal(result2, expected)
  121. with pytest.raises(KeyError, match=r"^\(\('foo', 'bar', 0\), 2\)$"):
  122. series[("foo", "bar", 0), 2]
  123. result = frame.loc[("foo", "bar", 0)]
  124. result2 = frame.xs(("foo", "bar", 0))
  125. expected = frame[:2]
  126. expected.index = expected.index.droplevel(0)
  127. tm.assert_frame_equal(result, expected)
  128. tm.assert_frame_equal(result2, expected)
  129. index = MultiIndex(
  130. levels=[[("foo", "bar"), ("foo", "baz"), ("foo", "qux")], [0, 1]],
  131. codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
  132. )
  133. series = Series(np.random.default_rng(2).standard_normal(6), index=index)
  134. frame = DataFrame(np.random.default_rng(2).standard_normal((6, 4)), index=index)
  135. result = series[("foo", "bar")]
  136. result2 = series.loc[("foo", "bar")]
  137. expected = series[:2]
  138. expected.index = expected.index.droplevel(0)
  139. tm.assert_series_equal(result, expected)
  140. tm.assert_series_equal(result2, expected)
  141. result = frame.loc[("foo", "bar")]
  142. result2 = frame.xs(("foo", "bar"))
  143. expected = frame[:2]
  144. expected.index = expected.index.droplevel(0)
  145. tm.assert_frame_equal(result, expected)
  146. tm.assert_frame_equal(result2, expected)
  147. def test_reindex_level_partial_selection(self, multiindex_dataframe_random_data):
  148. frame = multiindex_dataframe_random_data
  149. result = frame.reindex(["foo", "qux"], level=0)
  150. expected = frame.iloc[[0, 1, 2, 7, 8, 9]]
  151. tm.assert_frame_equal(result, expected)
  152. result = frame.T.reindex(["foo", "qux"], axis=1, level=0)
  153. tm.assert_frame_equal(result, expected.T)
  154. result = frame.loc[["foo", "qux"]]
  155. tm.assert_frame_equal(result, expected)
  156. result = frame["A"].loc[["foo", "qux"]]
  157. tm.assert_series_equal(result, expected["A"])
  158. result = frame.T.loc[:, ["foo", "qux"]]
  159. tm.assert_frame_equal(result, expected.T)
  160. @pytest.mark.parametrize("d", [4, "d"])
  161. def test_empty_frame_groupby_dtypes_consistency(self, d):
  162. # GH 20888
  163. group_keys = ["a", "b", "c"]
  164. df = DataFrame({"a": [1], "b": [2], "c": [3], "d": [d]})
  165. g = df[df.a == 2].groupby(group_keys)
  166. result = g.first().index
  167. expected = MultiIndex(
  168. levels=[[1], [2], [3]], codes=[[], [], []], names=["a", "b", "c"]
  169. )
  170. tm.assert_index_equal(result, expected)
  171. def test_duplicate_groupby_issues(self):
  172. idx_tp = [
  173. ("600809", "20061231"),
  174. ("600809", "20070331"),
  175. ("600809", "20070630"),
  176. ("600809", "20070331"),
  177. ]
  178. dt = ["demo", "demo", "demo", "demo"]
  179. idx = MultiIndex.from_tuples(idx_tp, names=["STK_ID", "RPT_Date"])
  180. s = Series(dt, index=idx)
  181. result = s.groupby(s.index).first()
  182. assert len(result) == 3
  183. def test_subsets_multiindex_dtype(self):
  184. # GH 20757
  185. data = [["x", 1]]
  186. columns = [("a", "b", np.nan), ("a", "c", 0.0)]
  187. df = DataFrame(data, columns=MultiIndex.from_tuples(columns))
  188. expected = df.dtypes.a.b
  189. result = df.a.b.dtypes
  190. tm.assert_series_equal(result, expected)
  191. def test_datetime_object_multiindex(self):
  192. data_dic = {
  193. (0, datetime.date(2018, 3, 3)): {"A": 1, "B": 10},
  194. (0, datetime.date(2018, 3, 4)): {"A": 2, "B": 11},
  195. (1, datetime.date(2018, 3, 3)): {"A": 3, "B": 12},
  196. (1, datetime.date(2018, 3, 4)): {"A": 4, "B": 13},
  197. }
  198. result = DataFrame.from_dict(data_dic, orient="index")
  199. data = {"A": [1, 2, 3, 4], "B": [10, 11, 12, 13]}
  200. index = [
  201. [0, 0, 1, 1],
  202. [
  203. datetime.date(2018, 3, 3),
  204. datetime.date(2018, 3, 4),
  205. datetime.date(2018, 3, 3),
  206. datetime.date(2018, 3, 4),
  207. ],
  208. ]
  209. expected = DataFrame(data=data, index=index)
  210. tm.assert_frame_equal(result, expected)
  211. def test_multiindex_with_na(self):
  212. df = DataFrame(
  213. [
  214. ["A", np.nan, 1.23, 4.56],
  215. ["A", "G", 1.23, 4.56],
  216. ["A", "D", 9.87, 10.54],
  217. ],
  218. columns=["pivot_0", "pivot_1", "col_1", "col_2"],
  219. ).set_index(["pivot_0", "pivot_1"])
  220. df.at[("A", "F"), "col_2"] = 0.0
  221. expected = DataFrame(
  222. [
  223. ["A", np.nan, 1.23, 4.56],
  224. ["A", "G", 1.23, 4.56],
  225. ["A", "D", 9.87, 10.54],
  226. ["A", "F", np.nan, 0.0],
  227. ],
  228. columns=["pivot_0", "pivot_1", "col_1", "col_2"],
  229. ).set_index(["pivot_0", "pivot_1"])
  230. tm.assert_frame_equal(df, expected)
  231. @pytest.mark.parametrize("na", [None, np.nan])
  232. def test_multiindex_insert_level_with_na(self, na):
  233. # GH 59003
  234. df = DataFrame([0], columns=[["A"], ["B"]])
  235. df[na, "B"] = 1
  236. tm.assert_frame_equal(df[na], DataFrame([1], columns=["B"]))
  237. def test_multiindex_dt_with_nan(self):
  238. # GH#60388
  239. df = DataFrame(
  240. [
  241. [1, np.nan, 5, np.nan],
  242. [2, np.nan, 6, np.nan],
  243. [np.nan, 3, np.nan, 7],
  244. [np.nan, 4, np.nan, 8],
  245. ],
  246. index=Series(["a", "b", "c", "d"], dtype=object, name="sub"),
  247. columns=MultiIndex.from_product(
  248. [
  249. ["value1", "value2"],
  250. [datetime.datetime(2024, 11, 1), datetime.datetime(2024, 11, 2)],
  251. ],
  252. names=[None, "Date"],
  253. ),
  254. )
  255. df = df.reset_index()
  256. result = df[df.columns[0]]
  257. expected = Series(["a", "b", "c", "d"], name=("sub", np.nan))
  258. tm.assert_series_equal(result, expected)
  259. @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
  260. def test_multiindex_with_pyarrow_categorical(self):
  261. # GH#53051
  262. pa = pytest.importorskip("pyarrow")
  263. df = DataFrame(
  264. {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]}
  265. ).astype(
  266. {
  267. "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())),
  268. "number_column": "float[pyarrow]",
  269. }
  270. )
  271. df = df.set_index(["string_column", "number_column"])
  272. df_expected = DataFrame(
  273. index=MultiIndex.from_arrays(
  274. [["A", "B", "C"], [1, 2, 3]], names=["string_column", "number_column"]
  275. )
  276. )
  277. tm.assert_frame_equal(
  278. df,
  279. df_expected,
  280. check_index_type=False,
  281. check_column_type=False,
  282. )
  283. class TestSorted:
  284. """everything you wanted to test about sorting"""
  285. def test_sort_non_lexsorted(self):
  286. # degenerate case where we sort but don't
  287. # have a satisfying result :<
  288. # GH 15797
  289. idx = MultiIndex(
  290. [["A", "B", "C"], ["c", "b", "a"]], [[0, 1, 2, 0, 1, 2], [0, 2, 1, 1, 0, 2]]
  291. )
  292. df = DataFrame({"col": range(len(idx))}, index=idx, dtype="int64")
  293. assert df.index.is_monotonic_increasing is False
  294. sorted = df.sort_index()
  295. assert sorted.index.is_monotonic_increasing is True
  296. expected = DataFrame(
  297. {"col": [1, 4, 5, 2]},
  298. index=MultiIndex.from_tuples(
  299. [("B", "a"), ("B", "c"), ("C", "a"), ("C", "b")]
  300. ),
  301. dtype="int64",
  302. )
  303. result = sorted.loc[pd.IndexSlice["B":"C", "a":"c"], :]
  304. tm.assert_frame_equal(result, expected)