# test_multiindex.py (7.9 KB)
  1. import numpy as np
  2. import pytest
  3. import pandas._libs.index as libindex
  4. from pandas.errors import PerformanceWarning
  5. import pandas as pd
  6. from pandas import (
  7. CategoricalDtype,
  8. DataFrame,
  9. Index,
  10. MultiIndex,
  11. Series,
  12. )
  13. import pandas._testing as tm
  14. from pandas.core.arrays.boolean import BooleanDtype
  15. class TestMultiIndexBasic:
  16. def test_multiindex_perf_warn(self):
  17. df = DataFrame(
  18. {
  19. "jim": [0, 0, 1, 1],
  20. "joe": ["x", "x", "z", "y"],
  21. "jolie": np.random.default_rng(2).random(4),
  22. }
  23. ).set_index(["jim", "joe"])
  24. with tm.assert_produces_warning(PerformanceWarning):
  25. df.loc[(1, "z")]
  26. df = df.iloc[[2, 1, 3, 0]]
  27. with tm.assert_produces_warning(PerformanceWarning):
  28. df.loc[(0,)]
  29. @pytest.mark.parametrize("offset", [-5, 5])
  30. def test_indexing_over_hashtable_size_cutoff(self, monkeypatch, offset):
  31. size_cutoff = 20
  32. n = size_cutoff + offset
  33. with monkeypatch.context():
  34. monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff)
  35. s = Series(np.arange(n), MultiIndex.from_arrays((["a"] * n, np.arange(n))))
  36. # hai it works!
  37. assert s[("a", 5)] == 5
  38. assert s[("a", 6)] == 6
  39. assert s[("a", 7)] == 7
  40. def test_multi_nan_indexing(self):
  41. # GH 3588
  42. df = DataFrame(
  43. {
  44. "a": ["R1", "R2", np.nan, "R4"],
  45. "b": ["C1", "C2", "C3", "C4"],
  46. "c": [10, 15, np.nan, 20],
  47. }
  48. )
  49. result = df.set_index(["a", "b"], drop=False)
  50. expected = DataFrame(
  51. {
  52. "a": ["R1", "R2", np.nan, "R4"],
  53. "b": ["C1", "C2", "C3", "C4"],
  54. "c": [10, 15, np.nan, 20],
  55. },
  56. index=[
  57. Index(["R1", "R2", np.nan, "R4"], name="a"),
  58. Index(["C1", "C2", "C3", "C4"], name="b"),
  59. ],
  60. )
  61. tm.assert_frame_equal(result, expected)
  62. def test_exclusive_nat_column_indexing(self):
  63. # GH 38025
  64. # test multi indexing when one column exclusively contains NaT values
  65. df = DataFrame(
  66. {
  67. "a": [pd.NaT, pd.NaT, pd.NaT, pd.NaT],
  68. "b": ["C1", "C2", "C3", "C4"],
  69. "c": [10, 15, np.nan, 20],
  70. }
  71. )
  72. df = df.set_index(["a", "b"])
  73. expected = DataFrame(
  74. {
  75. "c": [10, 15, np.nan, 20],
  76. },
  77. index=[
  78. Index([pd.NaT, pd.NaT, pd.NaT, pd.NaT], name="a"),
  79. Index(["C1", "C2", "C3", "C4"], name="b"),
  80. ],
  81. )
  82. tm.assert_frame_equal(df, expected)
  83. def test_nested_tuples_duplicates(self):
  84. # GH#30892
  85. dti = pd.to_datetime(["20190101", "20190101", "20190102"])
  86. idx = Index(["a", "a", "c"])
  87. mi = MultiIndex.from_arrays([dti, idx], names=["index1", "index2"])
  88. df = DataFrame({"c1": [1, 2, 3], "c2": [np.nan, np.nan, np.nan]}, index=mi)
  89. expected = DataFrame({"c1": df["c1"], "c2": [1.0, 1.0, np.nan]}, index=mi)
  90. df2 = df.copy(deep=True)
  91. df2.loc[(dti[0], "a"), "c2"] = 1.0
  92. tm.assert_frame_equal(df2, expected)
  93. df3 = df.copy(deep=True)
  94. df3.loc[[(dti[0], "a")], "c2"] = 1.0
  95. tm.assert_frame_equal(df3, expected)
  96. def test_multiindex_with_datatime_level_preserves_freq(self):
  97. # https://github.com/pandas-dev/pandas/issues/35563
  98. idx = Index(range(2), name="A")
  99. dti = pd.date_range("2020-01-01", periods=7, freq="D", name="B")
  100. mi = MultiIndex.from_product([idx, dti])
  101. df = DataFrame(np.random.default_rng(2).standard_normal((14, 2)), index=mi)
  102. result = df.loc[0].index
  103. tm.assert_index_equal(result, dti)
  104. assert result.freq == dti.freq
  105. def test_multiindex_complex(self):
  106. # GH#42145
  107. complex_data = [1 + 2j, 4 - 3j, 10 - 1j]
  108. non_complex_data = [3, 4, 5]
  109. result = DataFrame(
  110. {
  111. "x": complex_data,
  112. "y": non_complex_data,
  113. "z": non_complex_data,
  114. }
  115. )
  116. result.set_index(["x", "y"], inplace=True)
  117. expected = DataFrame(
  118. {"z": non_complex_data},
  119. index=MultiIndex.from_arrays(
  120. [complex_data, non_complex_data],
  121. names=("x", "y"),
  122. ),
  123. )
  124. tm.assert_frame_equal(result, expected)
  125. def test_rename_multiindex_with_duplicates(self):
  126. # GH 38015
  127. mi = MultiIndex.from_tuples([("A", "cat"), ("B", "cat"), ("B", "cat")])
  128. df = DataFrame(index=mi)
  129. df = df.rename(index={"A": "Apple"}, level=0)
  130. mi2 = MultiIndex.from_tuples([("Apple", "cat"), ("B", "cat"), ("B", "cat")])
  131. expected = DataFrame(index=mi2)
  132. tm.assert_frame_equal(df, expected)
  133. def test_series_align_multiindex_with_nan_overlap_only(self):
  134. # GH 38439
  135. mi1 = MultiIndex.from_arrays([[81.0, np.nan], [np.nan, np.nan]])
  136. mi2 = MultiIndex.from_arrays([[np.nan, 82.0], [np.nan, np.nan]])
  137. ser1 = Series([1, 2], index=mi1)
  138. ser2 = Series([1, 2], index=mi2)
  139. result1, result2 = ser1.align(ser2)
  140. mi = MultiIndex.from_arrays([[81.0, 82.0, np.nan], [np.nan, np.nan, np.nan]])
  141. expected1 = Series([1.0, np.nan, 2.0], index=mi)
  142. expected2 = Series([np.nan, 2.0, 1.0], index=mi)
  143. tm.assert_series_equal(result1, expected1)
  144. tm.assert_series_equal(result2, expected2)
  145. def test_series_align_multiindex_with_nan(self):
  146. # GH 38439
  147. mi1 = MultiIndex.from_arrays([[81.0, np.nan], [np.nan, np.nan]])
  148. mi2 = MultiIndex.from_arrays([[np.nan, 81.0], [np.nan, np.nan]])
  149. ser1 = Series([1, 2], index=mi1)
  150. ser2 = Series([1, 2], index=mi2)
  151. result1, result2 = ser1.align(ser2)
  152. mi = MultiIndex.from_arrays([[81.0, np.nan], [np.nan, np.nan]])
  153. expected1 = Series([1, 2], index=mi)
  154. expected2 = Series([2, 1], index=mi)
  155. tm.assert_series_equal(result1, expected1)
  156. tm.assert_series_equal(result2, expected2)
  157. def test_nunique_smoke(self):
  158. # GH 34019
  159. n = DataFrame([[1, 2], [1, 2]]).set_index([0, 1]).index.nunique()
  160. assert n == 1
  161. def test_multiindex_repeated_keys(self):
  162. # GH19414
  163. tm.assert_series_equal(
  164. Series([1, 2], MultiIndex.from_arrays([["a", "b"]])).loc[
  165. ["a", "a", "b", "b"]
  166. ],
  167. Series([1, 1, 2, 2], MultiIndex.from_arrays([["a", "a", "b", "b"]])),
  168. )
  169. def test_multiindex_with_na_missing_key(self):
  170. # GH46173
  171. df = DataFrame.from_dict(
  172. {
  173. ("foo",): [1, 2, 3],
  174. ("bar",): [5, 6, 7],
  175. (None,): [8, 9, 0],
  176. }
  177. )
  178. with pytest.raises(KeyError, match="missing_key"):
  179. df[[("missing_key",)]]
  180. def test_multiindex_dtype_preservation(self):
  181. # GH51261
  182. columns = MultiIndex.from_tuples([("A", "B")], names=["lvl1", "lvl2"])
  183. df = DataFrame(["value"], columns=columns).astype("category")
  184. df_no_multiindex = df["A"]
  185. assert isinstance(df_no_multiindex["B"].dtype, CategoricalDtype)
  186. # geopandas 1763 analogue
  187. df = DataFrame(
  188. [[1, 0], [0, 1]],
  189. columns=[
  190. ["foo", "foo"],
  191. ["location", "location"],
  192. ["x", "y"],
  193. ],
  194. ).assign(bools=Series([True, False], dtype="boolean"))
  195. assert isinstance(df["bools"].dtype, BooleanDtype)
  196. def test_multiindex_from_tuples_with_nan(self):
  197. # GH#23578
  198. result = MultiIndex.from_tuples([("a", "b", "c"), np.nan, ("d", "", "")])
  199. expected = MultiIndex.from_tuples(
  200. [("a", "b", "c"), (np.nan, np.nan, np.nan), ("d", "", "")]
  201. )
  202. tm.assert_index_equal(result, expected)