test_duplicates.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363
  1. from itertools import product
  2. import numpy as np
  3. import pytest
  4. from pandas._libs import (
  5. hashtable,
  6. index as libindex,
  7. )
  8. from pandas import (
  9. NA,
  10. DatetimeIndex,
  11. Index,
  12. MultiIndex,
  13. Series,
  14. )
  15. import pandas._testing as tm
  16. @pytest.fixture
  17. def idx_dup():
  18. # compare tests/indexes/multi/conftest.py
  19. major_axis = Index(["foo", "bar", "baz", "qux"])
  20. minor_axis = Index(["one", "two"])
  21. major_codes = np.array([0, 0, 1, 0, 1, 1])
  22. minor_codes = np.array([0, 1, 0, 1, 0, 1])
  23. index_names = ["first", "second"]
  24. mi = MultiIndex(
  25. levels=[major_axis, minor_axis],
  26. codes=[major_codes, minor_codes],
  27. names=index_names,
  28. verify_integrity=False,
  29. )
  30. return mi
  31. @pytest.mark.parametrize("names", [None, ["first", "second"]])
  32. def test_unique(names):
  33. mi = MultiIndex.from_arrays([[1, 2, 1, 2], [1, 1, 1, 2]], names=names)
  34. res = mi.unique()
  35. exp = MultiIndex.from_arrays([[1, 2, 2], [1, 1, 2]], names=mi.names)
  36. tm.assert_index_equal(res, exp)
  37. mi = MultiIndex.from_arrays([list("aaaa"), list("abab")], names=names)
  38. res = mi.unique()
  39. exp = MultiIndex.from_arrays([list("aa"), list("ab")], names=mi.names)
  40. tm.assert_index_equal(res, exp)
  41. mi = MultiIndex.from_arrays([list("aaaa"), list("aaaa")], names=names)
  42. res = mi.unique()
  43. exp = MultiIndex.from_arrays([["a"], ["a"]], names=mi.names)
  44. tm.assert_index_equal(res, exp)
  45. # GH #20568 - empty MI
  46. mi = MultiIndex.from_arrays([[], []], names=names)
  47. res = mi.unique()
  48. tm.assert_index_equal(mi, res)
  49. def test_unique_datetimelike():
  50. idx1 = DatetimeIndex(
  51. ["2015-01-01", "2015-01-01", "2015-01-01", "2015-01-01", "NaT", "NaT"]
  52. )
  53. idx2 = DatetimeIndex(
  54. ["2015-01-01", "2015-01-01", "2015-01-02", "2015-01-02", "NaT", "2015-01-01"],
  55. tz="Asia/Tokyo",
  56. )
  57. result = MultiIndex.from_arrays([idx1, idx2]).unique()
  58. eidx1 = DatetimeIndex(["2015-01-01", "2015-01-01", "NaT", "NaT"])
  59. eidx2 = DatetimeIndex(
  60. ["2015-01-01", "2015-01-02", "NaT", "2015-01-01"], tz="Asia/Tokyo"
  61. )
  62. exp = MultiIndex.from_arrays([eidx1, eidx2])
  63. tm.assert_index_equal(result, exp)
  64. @pytest.mark.parametrize("level", [0, "first", 1, "second"])
  65. def test_unique_level(idx, level):
  66. # GH #17896 - with level= argument
  67. result = idx.unique(level=level)
  68. expected = idx.get_level_values(level).unique()
  69. tm.assert_index_equal(result, expected)
  70. # With already unique level
  71. mi = MultiIndex.from_arrays([[1, 3, 2, 4], [1, 3, 2, 5]], names=["first", "second"])
  72. result = mi.unique(level=level)
  73. expected = mi.get_level_values(level)
  74. tm.assert_index_equal(result, expected)
  75. # With empty MI
  76. mi = MultiIndex.from_arrays([[], []], names=["first", "second"])
  77. result = mi.unique(level=level)
  78. expected = mi.get_level_values(level)
  79. tm.assert_index_equal(result, expected)
  80. def test_duplicate_multiindex_codes():
  81. # GH 17464
  82. # Make sure that a MultiIndex with duplicate levels throws a ValueError
  83. msg = r"Level values must be unique: \[[A', ]+\] on level 0"
  84. with pytest.raises(ValueError, match=msg):
  85. mi = MultiIndex([["A"] * 10, range(10)], [[0] * 10, range(10)])
  86. # And that using set_levels with duplicate levels fails
  87. mi = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]])
  88. msg = r"Level values must be unique: \[[AB', ]+\] on level 0"
  89. with pytest.raises(ValueError, match=msg):
  90. mi.set_levels([["A", "B", "A", "A", "B"], [2, 1, 3, -2, 5]])
  91. @pytest.mark.parametrize("names", [["a", "b", "a"], [1, 1, 2], [1, "a", 1]])
  92. def test_duplicate_level_names(names):
  93. # GH18872, GH19029
  94. mi = MultiIndex.from_product([[0, 1]] * 3, names=names)
  95. assert mi.names == names
  96. # With .rename()
  97. mi = MultiIndex.from_product([[0, 1]] * 3)
  98. mi = mi.rename(names)
  99. assert mi.names == names
  100. # With .rename(., level=)
  101. mi.rename(names[1], level=1, inplace=True)
  102. mi = mi.rename([names[0], names[2]], level=[0, 2])
  103. assert mi.names == names
  104. def test_duplicate_meta_data():
  105. # GH 10115
  106. mi = MultiIndex(
  107. levels=[[0, 1], [0, 1, 2]], codes=[[0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]]
  108. )
  109. for idx in [
  110. mi,
  111. mi.set_names([None, None]),
  112. mi.set_names([None, "Num"]),
  113. mi.set_names(["Upper", "Num"]),
  114. ]:
  115. assert idx.has_duplicates
  116. assert idx.drop_duplicates().names == idx.names
  117. def test_has_duplicates(idx, idx_dup):
  118. # see fixtures
  119. assert idx.is_unique is True
  120. assert idx.has_duplicates is False
  121. assert idx_dup.is_unique is False
  122. assert idx_dup.has_duplicates is True
  123. mi = MultiIndex(
  124. levels=[[0, 1], [0, 1, 2]], codes=[[0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]]
  125. )
  126. assert mi.is_unique is False
  127. assert mi.has_duplicates is True
  128. # single instance of NaN
  129. mi_nan = MultiIndex(
  130. levels=[["a", "b"], [0, 1]], codes=[[-1, 0, 0, 1, 1], [-1, 0, 1, 0, 1]]
  131. )
  132. assert mi_nan.is_unique is True
  133. assert mi_nan.has_duplicates is False
  134. # multiple instances of NaN
  135. mi_nan_dup = MultiIndex(
  136. levels=[["a", "b"], [0, 1]], codes=[[-1, -1, 0, 0, 1, 1], [-1, -1, 0, 1, 0, 1]]
  137. )
  138. assert mi_nan_dup.is_unique is False
  139. assert mi_nan_dup.has_duplicates is True
  140. def test_has_duplicates_from_tuples():
  141. # GH 9075
  142. t = [
  143. ("x", "out", "z", 5, "y", "in", "z", 169),
  144. ("x", "out", "z", 7, "y", "in", "z", 119),
  145. ("x", "out", "z", 9, "y", "in", "z", 135),
  146. ("x", "out", "z", 13, "y", "in", "z", 145),
  147. ("x", "out", "z", 14, "y", "in", "z", 158),
  148. ("x", "out", "z", 16, "y", "in", "z", 122),
  149. ("x", "out", "z", 17, "y", "in", "z", 160),
  150. ("x", "out", "z", 18, "y", "in", "z", 180),
  151. ("x", "out", "z", 20, "y", "in", "z", 143),
  152. ("x", "out", "z", 21, "y", "in", "z", 128),
  153. ("x", "out", "z", 22, "y", "in", "z", 129),
  154. ("x", "out", "z", 25, "y", "in", "z", 111),
  155. ("x", "out", "z", 28, "y", "in", "z", 114),
  156. ("x", "out", "z", 29, "y", "in", "z", 121),
  157. ("x", "out", "z", 31, "y", "in", "z", 126),
  158. ("x", "out", "z", 32, "y", "in", "z", 155),
  159. ("x", "out", "z", 33, "y", "in", "z", 123),
  160. ("x", "out", "z", 12, "y", "in", "z", 144),
  161. ]
  162. mi = MultiIndex.from_tuples(t)
  163. assert not mi.has_duplicates
  164. @pytest.mark.parametrize("nlevels", [4, 8])
  165. @pytest.mark.parametrize("with_nulls", [True, False])
  166. def test_has_duplicates_overflow(nlevels, with_nulls):
  167. # handle int64 overflow if possible
  168. # no overflow with 4
  169. # overflow possible with 8
  170. codes = np.tile(np.arange(500), 2)
  171. level = np.arange(500)
  172. if with_nulls: # inject some null values
  173. codes[500] = -1 # common nan value
  174. codes = [codes.copy() for i in range(nlevels)]
  175. for i in range(nlevels):
  176. codes[i][500 + i - nlevels // 2] = -1
  177. codes += [np.array([-1, 1]).repeat(500)]
  178. else:
  179. codes = [codes] * nlevels + [np.arange(2).repeat(500)]
  180. levels = [level] * nlevels + [[0, 1]]
  181. # no dups
  182. mi = MultiIndex(levels=levels, codes=codes)
  183. assert not mi.has_duplicates
  184. # with a dup
  185. if with_nulls:
  186. def f(a):
  187. return np.insert(a, 1000, a[0])
  188. codes = list(map(f, codes))
  189. mi = MultiIndex(levels=levels, codes=codes)
  190. else:
  191. values = mi.values.tolist()
  192. mi = MultiIndex.from_tuples(values + [values[0]])
  193. assert mi.has_duplicates
  194. @pytest.mark.parametrize(
  195. "keep, expected",
  196. [
  197. ("first", np.array([False, False, False, True, True, False])),
  198. ("last", np.array([False, True, True, False, False, False])),
  199. (False, np.array([False, True, True, True, True, False])),
  200. ],
  201. )
  202. def test_duplicated(idx_dup, keep, expected):
  203. result = idx_dup.duplicated(keep=keep)
  204. tm.assert_numpy_array_equal(result, expected)
  205. @pytest.mark.arm_slow
  206. def test_duplicated_hashtable_impl(keep, monkeypatch):
  207. # GH 9125
  208. n, k = 6, 10
  209. levels = [np.arange(n), [str(i) for i in range(n)], 1000 + np.arange(n)]
  210. codes = [np.random.default_rng(2).choice(n, k * n) for _ in levels]
  211. with monkeypatch.context() as m:
  212. m.setattr(libindex, "_SIZE_CUTOFF", 50)
  213. mi = MultiIndex(levels=levels, codes=codes)
  214. result = mi.duplicated(keep=keep)
  215. expected = hashtable.duplicated(mi.values, keep=keep)
  216. tm.assert_numpy_array_equal(result, expected)
  217. @pytest.mark.parametrize("val", [101, 102])
  218. def test_duplicated_with_nan(val):
  219. # GH5873
  220. mi = MultiIndex.from_arrays([[101, val], [3.5, np.nan]])
  221. assert not mi.has_duplicates
  222. tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(2, dtype="bool"))
  223. @pytest.mark.parametrize("n", range(1, 6))
  224. @pytest.mark.parametrize("m", range(1, 5))
  225. def test_duplicated_with_nan_multi_shape(n, m):
  226. # GH5873
  227. # all possible unique combinations, including nan
  228. codes = product(range(-1, n), range(-1, m))
  229. mi = MultiIndex(
  230. levels=[list("abcde")[:n], list("WXYZ")[:m]],
  231. codes=np.random.default_rng(2).permutation(list(codes)).T,
  232. )
  233. assert len(mi) == (n + 1) * (m + 1)
  234. assert not mi.has_duplicates
  235. tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(len(mi), dtype="bool"))
  236. def test_duplicated_drop_duplicates():
  237. # GH#4060
  238. idx = MultiIndex.from_arrays(([1, 2, 3, 1, 2, 3], [1, 1, 1, 1, 2, 2]))
  239. expected = np.array([False, False, False, True, False, False], dtype=bool)
  240. duplicated = idx.duplicated()
  241. tm.assert_numpy_array_equal(duplicated, expected)
  242. assert duplicated.dtype == bool
  243. expected = MultiIndex.from_arrays(([1, 2, 3, 2, 3], [1, 1, 1, 2, 2]))
  244. tm.assert_index_equal(idx.drop_duplicates(), expected)
  245. expected = np.array([True, False, False, False, False, False])
  246. duplicated = idx.duplicated(keep="last")
  247. tm.assert_numpy_array_equal(duplicated, expected)
  248. assert duplicated.dtype == bool
  249. expected = MultiIndex.from_arrays(([2, 3, 1, 2, 3], [1, 1, 1, 2, 2]))
  250. tm.assert_index_equal(idx.drop_duplicates(keep="last"), expected)
  251. expected = np.array([True, False, False, True, False, False])
  252. duplicated = idx.duplicated(keep=False)
  253. tm.assert_numpy_array_equal(duplicated, expected)
  254. assert duplicated.dtype == bool
  255. expected = MultiIndex.from_arrays(([2, 3, 2, 3], [1, 1, 2, 2]))
  256. tm.assert_index_equal(idx.drop_duplicates(keep=False), expected)
  257. @pytest.mark.parametrize(
  258. "dtype",
  259. [
  260. np.complex64,
  261. np.complex128,
  262. ],
  263. )
  264. def test_duplicated_series_complex_numbers(dtype):
  265. # GH 17927
  266. expected = Series(
  267. [False, False, False, True, False, False, False, True, False, True],
  268. dtype=bool,
  269. )
  270. result = Series(
  271. [
  272. np.nan + np.nan * 1j,
  273. 0,
  274. 1j,
  275. 1j,
  276. 1,
  277. 1 + 1j,
  278. 1 + 2j,
  279. 1 + 1j,
  280. np.nan,
  281. np.nan + np.nan * 1j,
  282. ],
  283. dtype=dtype,
  284. ).duplicated()
  285. tm.assert_series_equal(result, expected)
  286. def test_midx_unique_ea_dtype():
  287. # GH#48335
  288. vals_a = Series([1, 2, NA, NA], dtype="Int64")
  289. vals_b = np.array([1, 2, 3, 3])
  290. midx = MultiIndex.from_arrays([vals_a, vals_b], names=["a", "b"])
  291. result = midx.unique()
  292. exp_vals_a = Series([1, 2, NA], dtype="Int64")
  293. exp_vals_b = np.array([1, 2, 3])
  294. expected = MultiIndex.from_arrays([exp_vals_a, exp_vals_b], names=["a", "b"])
  295. tm.assert_index_equal(result, expected)