test_integrity.py 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289
  1. import re
  2. import numpy as np
  3. import pytest
  4. from pandas._libs import index as libindex
  5. from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
  6. import pandas as pd
  7. from pandas import (
  8. Index,
  9. IntervalIndex,
  10. MultiIndex,
  11. RangeIndex,
  12. )
  13. import pandas._testing as tm
  14. def test_labels_dtypes():
  15. # GH 8456
  16. i = MultiIndex.from_tuples([("A", 1), ("A", 2)])
  17. assert i.codes[0].dtype == "int8"
  18. assert i.codes[1].dtype == "int8"
  19. i = MultiIndex.from_product([["a"], range(40)])
  20. assert i.codes[1].dtype == "int8"
  21. i = MultiIndex.from_product([["a"], range(400)])
  22. assert i.codes[1].dtype == "int16"
  23. i = MultiIndex.from_product([["a"], range(40000)])
  24. assert i.codes[1].dtype == "int32"
  25. i = MultiIndex.from_product([["a"], range(1000)])
  26. assert (i.codes[0] >= 0).all()
  27. assert (i.codes[1] >= 0).all()
  28. def test_values_boxed():
  29. tuples = [
  30. (1, pd.Timestamp("2000-01-01")),
  31. (2, pd.NaT),
  32. (3, pd.Timestamp("2000-01-03")),
  33. (1, pd.Timestamp("2000-01-04")),
  34. (2, pd.Timestamp("2000-01-02")),
  35. (3, pd.Timestamp("2000-01-03")),
  36. ]
  37. result = MultiIndex.from_tuples(tuples)
  38. expected = construct_1d_object_array_from_listlike(tuples)
  39. tm.assert_numpy_array_equal(result.values, expected)
  40. # Check that code branches for boxed values produce identical results
  41. tm.assert_numpy_array_equal(result.values[:4], result[:4].values)
  42. def test_values_multiindex_datetimeindex():
  43. # Test to ensure we hit the boxing / nobox part of MI.values
  44. ints = np.arange(10**18, 10**18 + 5)
  45. naive = pd.DatetimeIndex(ints)
  46. aware = pd.DatetimeIndex(ints, tz="US/Central")
  47. idx = MultiIndex.from_arrays([naive, aware])
  48. result = idx.values
  49. outer = pd.DatetimeIndex([x[0] for x in result])
  50. tm.assert_index_equal(outer, naive)
  51. inner = pd.DatetimeIndex([x[1] for x in result])
  52. tm.assert_index_equal(inner, aware)
  53. # n_lev > n_lab
  54. result = idx[:2].values
  55. outer = pd.DatetimeIndex([x[0] for x in result])
  56. tm.assert_index_equal(outer, naive[:2])
  57. inner = pd.DatetimeIndex([x[1] for x in result])
  58. tm.assert_index_equal(inner, aware[:2])
  59. def test_values_multiindex_periodindex():
  60. # Test to ensure we hit the boxing / nobox part of MI.values
  61. ints = np.arange(2007, 2012)
  62. pidx = pd.PeriodIndex(ints, freq="D")
  63. idx = MultiIndex.from_arrays([ints, pidx])
  64. result = idx.values
  65. outer = Index([x[0] for x in result])
  66. tm.assert_index_equal(outer, Index(ints, dtype=np.int64))
  67. inner = pd.PeriodIndex([x[1] for x in result])
  68. tm.assert_index_equal(inner, pidx)
  69. # n_lev > n_lab
  70. result = idx[:2].values
  71. outer = Index([x[0] for x in result])
  72. tm.assert_index_equal(outer, Index(ints[:2], dtype=np.int64))
  73. inner = pd.PeriodIndex([x[1] for x in result])
  74. tm.assert_index_equal(inner, pidx[:2])
  75. def test_consistency():
  76. # need to construct an overflow
  77. major_axis = list(range(70000))
  78. minor_axis = list(range(10))
  79. major_codes = np.arange(70000)
  80. minor_codes = np.repeat(range(10), 7000)
  81. # the fact that is works means it's consistent
  82. index = MultiIndex(
  83. levels=[major_axis, minor_axis], codes=[major_codes, minor_codes]
  84. )
  85. # inconsistent
  86. major_codes = np.array([0, 0, 1, 1, 1, 2, 2, 3, 3])
  87. minor_codes = np.array([0, 1, 0, 1, 1, 0, 1, 0, 1])
  88. index = MultiIndex(
  89. levels=[major_axis, minor_axis], codes=[major_codes, minor_codes]
  90. )
  91. assert index.is_unique is False
  92. @pytest.mark.slow
  93. def test_hash_collisions(monkeypatch):
  94. # non-smoke test that we don't get hash collisions
  95. size_cutoff = 50
  96. with monkeypatch.context() as m:
  97. m.setattr(libindex, "_SIZE_CUTOFF", size_cutoff)
  98. index = MultiIndex.from_product(
  99. [np.arange(8), np.arange(8)], names=["one", "two"]
  100. )
  101. result = index.get_indexer(index.values)
  102. tm.assert_numpy_array_equal(result, np.arange(len(index), dtype="intp"))
  103. for i in [0, 1, len(index) - 2, len(index) - 1]:
  104. result = index.get_loc(index[i])
  105. assert result == i
  106. def test_dims():
  107. pass
  108. def test_take_invalid_kwargs():
  109. vals = [["A", "B"], [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]]
  110. idx = MultiIndex.from_product(vals, names=["str", "dt"])
  111. indices = [1, 2]
  112. msg = r"take\(\) got an unexpected keyword argument 'foo'"
  113. with pytest.raises(TypeError, match=msg):
  114. idx.take(indices, foo=2)
  115. msg = "the 'out' parameter is not supported"
  116. with pytest.raises(ValueError, match=msg):
  117. idx.take(indices, out=indices)
  118. msg = "the 'mode' parameter is not supported"
  119. with pytest.raises(ValueError, match=msg):
  120. idx.take(indices, mode="clip")
  121. def test_isna_behavior(idx):
  122. # should not segfault GH5123
  123. # NOTE: if MI representation changes, may make sense to allow
  124. # isna(MI)
  125. msg = "isna is not defined for MultiIndex"
  126. with pytest.raises(NotImplementedError, match=msg):
  127. pd.isna(idx)
  128. def test_large_multiindex_error(monkeypatch):
  129. # GH12527
  130. size_cutoff = 50
  131. with monkeypatch.context() as m:
  132. m.setattr(libindex, "_SIZE_CUTOFF", size_cutoff)
  133. df_below_cutoff = pd.DataFrame(
  134. 1,
  135. index=MultiIndex.from_product([[1, 2], range(size_cutoff - 1)]),
  136. columns=["dest"],
  137. )
  138. with pytest.raises(KeyError, match=r"^\(-1, 0\)$"):
  139. df_below_cutoff.loc[(-1, 0), "dest"]
  140. with pytest.raises(KeyError, match=r"^\(3, 0\)$"):
  141. df_below_cutoff.loc[(3, 0), "dest"]
  142. df_above_cutoff = pd.DataFrame(
  143. 1,
  144. index=MultiIndex.from_product([[1, 2], range(size_cutoff + 1)]),
  145. columns=["dest"],
  146. )
  147. with pytest.raises(KeyError, match=r"^\(-1, 0\)$"):
  148. df_above_cutoff.loc[(-1, 0), "dest"]
  149. with pytest.raises(KeyError, match=r"^\(3, 0\)$"):
  150. df_above_cutoff.loc[(3, 0), "dest"]
  151. def test_mi_hashtable_populated_attribute_error(monkeypatch):
  152. # GH 18165
  153. monkeypatch.setattr(libindex, "_SIZE_CUTOFF", 50)
  154. r = range(50)
  155. df = pd.DataFrame({"a": r, "b": r}, index=MultiIndex.from_arrays([r, r]))
  156. msg = "'Series' object has no attribute 'foo'"
  157. with pytest.raises(AttributeError, match=msg):
  158. df["a"].foo()
  159. def test_can_hold_identifiers(idx):
  160. key = idx[0]
  161. assert idx._can_hold_identifiers_and_holds_name(key) is True
  162. def test_metadata_immutable(idx):
  163. levels, codes = idx.levels, idx.codes
  164. # shouldn't be able to set at either the top level or base level
  165. mutable_regex = re.compile("does not support mutable operations")
  166. with pytest.raises(TypeError, match=mutable_regex):
  167. levels[0] = levels[0]
  168. with pytest.raises(TypeError, match=mutable_regex):
  169. levels[0][0] = levels[0][0]
  170. # ditto for labels
  171. with pytest.raises(TypeError, match=mutable_regex):
  172. codes[0] = codes[0]
  173. with pytest.raises(ValueError, match="assignment destination is read-only"):
  174. codes[0][0] = codes[0][0]
  175. # and for names
  176. names = idx.names
  177. with pytest.raises(TypeError, match=mutable_regex):
  178. names[0] = names[0]
  179. def test_level_setting_resets_attributes():
  180. ind = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]])
  181. assert ind.is_monotonic_increasing
  182. ind = ind.set_levels([["A", "B"], [1, 3, 2]])
  183. # if this fails, probably didn't reset the cache correctly.
  184. assert not ind.is_monotonic_increasing
  185. def test_rangeindex_fallback_coercion_bug():
  186. # GH 12893
  187. df1 = pd.DataFrame(np.arange(100).reshape((10, 10)))
  188. df2 = pd.DataFrame(np.arange(100).reshape((10, 10)))
  189. df = pd.concat(
  190. {"df1": df1.stack(future_stack=True), "df2": df2.stack(future_stack=True)},
  191. axis=1,
  192. )
  193. df.index.names = ["fizz", "buzz"]
  194. expected = pd.DataFrame(
  195. {"df2": np.arange(100), "df1": np.arange(100)},
  196. index=MultiIndex.from_product([range(10), range(10)], names=["fizz", "buzz"]),
  197. )
  198. tm.assert_frame_equal(df, expected, check_like=True)
  199. result = df.index.get_level_values("fizz")
  200. expected = Index(np.arange(10, dtype=np.int64), name="fizz").repeat(10)
  201. tm.assert_index_equal(result, expected)
  202. result = df.index.get_level_values("buzz")
  203. expected = Index(np.tile(np.arange(10, dtype=np.int64), 10), name="buzz")
  204. tm.assert_index_equal(result, expected)
  205. def test_memory_usage(idx):
  206. result = idx.memory_usage()
  207. if len(idx):
  208. idx.get_loc(idx[0])
  209. result2 = idx.memory_usage()
  210. result3 = idx.memory_usage(deep=True)
  211. # RangeIndex, IntervalIndex
  212. # don't have engines
  213. if not isinstance(idx, (RangeIndex, IntervalIndex)):
  214. assert result2 > result
  215. if idx.inferred_type == "object":
  216. assert result3 > result2
  217. else:
  218. # we report 0 for no-length
  219. assert result == 0
  220. def test_nlevels(idx):
  221. assert idx.nlevels == 2