test_duplicate_labels.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413
"""Tests dealing with NDFrame.flags.allows_duplicate_labels."""
  2. import operator
  3. import numpy as np
  4. import pytest
  5. import pandas as pd
  6. import pandas._testing as tm
  7. not_implemented = pytest.mark.xfail(reason="Not implemented.")
  8. # ----------------------------------------------------------------------------
  9. # Preservation
  10. class TestPreserves:
  11. @pytest.mark.parametrize(
  12. "cls, data",
  13. [
  14. (pd.Series, np.array([])),
  15. (pd.Series, [1, 2]),
  16. (pd.DataFrame, {}),
  17. (pd.DataFrame, {"A": [1, 2]}),
  18. ],
  19. )
  20. def test_construction_ok(self, cls, data):
  21. result = cls(data)
  22. assert result.flags.allows_duplicate_labels is True
  23. result = cls(data).set_flags(allows_duplicate_labels=False)
  24. assert result.flags.allows_duplicate_labels is False
  25. @pytest.mark.parametrize(
  26. "func",
  27. [
  28. operator.itemgetter(["a"]),
  29. operator.methodcaller("add", 1),
  30. operator.methodcaller("rename", str.upper),
  31. operator.methodcaller("rename", "name"),
  32. operator.methodcaller("abs"),
  33. np.abs,
  34. ],
  35. )
  36. def test_preserved_series(self, func):
  37. s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False)
  38. assert func(s).flags.allows_duplicate_labels is False
  39. @pytest.mark.parametrize(
  40. "other", [pd.Series(0, index=["a", "b", "c"]), pd.Series(0, index=["a", "b"])]
  41. )
  42. # TODO: frame
  43. @not_implemented
  44. def test_align(self, other):
  45. s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False)
  46. a, b = s.align(other)
  47. assert a.flags.allows_duplicate_labels is False
  48. assert b.flags.allows_duplicate_labels is False
  49. def test_preserved_frame(self):
  50. df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags(
  51. allows_duplicate_labels=False
  52. )
  53. assert df.loc[["a"]].flags.allows_duplicate_labels is False
  54. assert df.loc[:, ["A", "B"]].flags.allows_duplicate_labels is False
  55. def test_to_frame(self):
  56. ser = pd.Series(dtype=float).set_flags(allows_duplicate_labels=False)
  57. assert ser.to_frame().flags.allows_duplicate_labels is False
  58. @pytest.mark.parametrize("func", ["add", "sub"])
  59. @pytest.mark.parametrize("frame", [False, True])
  60. @pytest.mark.parametrize("other", [1, pd.Series([1, 2], name="A")])
  61. def test_binops(self, func, other, frame):
  62. df = pd.Series([1, 2], name="A", index=["a", "b"]).set_flags(
  63. allows_duplicate_labels=False
  64. )
  65. if frame:
  66. df = df.to_frame()
  67. if isinstance(other, pd.Series) and frame:
  68. other = other.to_frame()
  69. func = operator.methodcaller(func, other)
  70. assert df.flags.allows_duplicate_labels is False
  71. assert func(df).flags.allows_duplicate_labels is False
  72. def test_preserve_getitem(self):
  73. df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False)
  74. assert df[["A"]].flags.allows_duplicate_labels is False
  75. assert df["A"].flags.allows_duplicate_labels is False
  76. assert df.loc[0].flags.allows_duplicate_labels is False
  77. assert df.loc[[0]].flags.allows_duplicate_labels is False
  78. assert df.loc[0, ["A"]].flags.allows_duplicate_labels is False
  79. def test_ndframe_getitem_caching_issue(
  80. self, request, using_copy_on_write, warn_copy_on_write
  81. ):
  82. if not (using_copy_on_write or warn_copy_on_write):
  83. request.applymarker(pytest.mark.xfail(reason="Unclear behavior."))
  84. # NDFrame.__getitem__ will cache the first df['A']. May need to
  85. # invalidate that cache? Update the cached entries?
  86. df = pd.DataFrame({"A": [0]}).set_flags(allows_duplicate_labels=False)
  87. assert df["A"].flags.allows_duplicate_labels is False
  88. df.flags.allows_duplicate_labels = True
  89. assert df["A"].flags.allows_duplicate_labels is True
  90. @pytest.mark.parametrize(
  91. "objs, kwargs",
  92. [
  93. # Series
  94. (
  95. [
  96. pd.Series(1, index=["a", "b"]),
  97. pd.Series(2, index=["c", "d"]),
  98. ],
  99. {},
  100. ),
  101. (
  102. [
  103. pd.Series(1, index=["a", "b"]),
  104. pd.Series(2, index=["a", "b"]),
  105. ],
  106. {"ignore_index": True},
  107. ),
  108. (
  109. [
  110. pd.Series(1, index=["a", "b"]),
  111. pd.Series(2, index=["a", "b"]),
  112. ],
  113. {"axis": 1},
  114. ),
  115. # Frame
  116. (
  117. [
  118. pd.DataFrame({"A": [1, 2]}, index=["a", "b"]),
  119. pd.DataFrame({"A": [1, 2]}, index=["c", "d"]),
  120. ],
  121. {},
  122. ),
  123. (
  124. [
  125. pd.DataFrame({"A": [1, 2]}, index=["a", "b"]),
  126. pd.DataFrame({"A": [1, 2]}, index=["a", "b"]),
  127. ],
  128. {"ignore_index": True},
  129. ),
  130. (
  131. [
  132. pd.DataFrame({"A": [1, 2]}, index=["a", "b"]),
  133. pd.DataFrame({"B": [1, 2]}, index=["a", "b"]),
  134. ],
  135. {"axis": 1},
  136. ),
  137. # Series / Frame
  138. (
  139. [
  140. pd.DataFrame({"A": [1, 2]}, index=["a", "b"]),
  141. pd.Series([1, 2], index=["a", "b"], name="B"),
  142. ],
  143. {"axis": 1},
  144. ),
  145. ],
  146. )
  147. def test_concat(self, objs, kwargs):
  148. objs = [x.set_flags(allows_duplicate_labels=False) for x in objs]
  149. result = pd.concat(objs, **kwargs)
  150. assert result.flags.allows_duplicate_labels is False
  151. @pytest.mark.parametrize(
  152. "left, right, expected",
  153. [
  154. # false false false
  155. pytest.param(
  156. pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags(
  157. allows_duplicate_labels=False
  158. ),
  159. pd.DataFrame({"B": [0, 1]}, index=["a", "d"]).set_flags(
  160. allows_duplicate_labels=False
  161. ),
  162. False,
  163. marks=not_implemented,
  164. ),
  165. # false true false
  166. pytest.param(
  167. pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags(
  168. allows_duplicate_labels=False
  169. ),
  170. pd.DataFrame({"B": [0, 1]}, index=["a", "d"]),
  171. False,
  172. marks=not_implemented,
  173. ),
  174. # true true true
  175. (
  176. pd.DataFrame({"A": [0, 1]}, index=["a", "b"]),
  177. pd.DataFrame({"B": [0, 1]}, index=["a", "d"]),
  178. True,
  179. ),
  180. ],
  181. )
  182. def test_merge(self, left, right, expected):
  183. result = pd.merge(left, right, left_index=True, right_index=True)
  184. assert result.flags.allows_duplicate_labels is expected
  185. @not_implemented
  186. def test_groupby(self):
  187. # XXX: This is under tested
  188. # TODO:
  189. # - apply
  190. # - transform
  191. # - Should passing a grouper that disallows duplicates propagate?
  192. df = pd.DataFrame({"A": [1, 2, 3]}).set_flags(allows_duplicate_labels=False)
  193. result = df.groupby([0, 0, 1]).agg("count")
  194. assert result.flags.allows_duplicate_labels is False
  195. @pytest.mark.parametrize("frame", [True, False])
  196. @not_implemented
  197. def test_window(self, frame):
  198. df = pd.Series(
  199. 1,
  200. index=pd.date_range("2000", periods=12),
  201. name="A",
  202. allows_duplicate_labels=False,
  203. )
  204. if frame:
  205. df = df.to_frame()
  206. assert df.rolling(3).mean().flags.allows_duplicate_labels is False
  207. assert df.ewm(3).mean().flags.allows_duplicate_labels is False
  208. assert df.expanding(3).mean().flags.allows_duplicate_labels is False
  209. # ----------------------------------------------------------------------------
  210. # Raises
  211. class TestRaises:
  212. @pytest.mark.parametrize(
  213. "cls, axes",
  214. [
  215. (pd.Series, {"index": ["a", "a"], "dtype": float}),
  216. (pd.DataFrame, {"index": ["a", "a"]}),
  217. (pd.DataFrame, {"index": ["a", "a"], "columns": ["b", "b"]}),
  218. (pd.DataFrame, {"columns": ["b", "b"]}),
  219. ],
  220. )
  221. def test_set_flags_with_duplicates(self, cls, axes):
  222. result = cls(**axes)
  223. assert result.flags.allows_duplicate_labels is True
  224. msg = "Index has duplicates."
  225. with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
  226. cls(**axes).set_flags(allows_duplicate_labels=False)
  227. @pytest.mark.parametrize(
  228. "data",
  229. [
  230. pd.Series(index=[0, 0], dtype=float),
  231. pd.DataFrame(index=[0, 0]),
  232. pd.DataFrame(columns=[0, 0]),
  233. ],
  234. )
  235. def test_setting_allows_duplicate_labels_raises(self, data):
  236. msg = "Index has duplicates."
  237. with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
  238. data.flags.allows_duplicate_labels = False
  239. assert data.flags.allows_duplicate_labels is True
  240. def test_series_raises(self):
  241. a = pd.Series(0, index=["a", "b"])
  242. b = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False)
  243. msg = "Index has duplicates."
  244. with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
  245. pd.concat([a, b])
  246. @pytest.mark.parametrize(
  247. "getter, target",
  248. [
  249. (operator.itemgetter(["A", "A"]), None),
  250. # loc
  251. (operator.itemgetter(["a", "a"]), "loc"),
  252. pytest.param(operator.itemgetter(("a", ["A", "A"])), "loc"),
  253. (operator.itemgetter((["a", "a"], "A")), "loc"),
  254. # iloc
  255. (operator.itemgetter([0, 0]), "iloc"),
  256. pytest.param(operator.itemgetter((0, [0, 0])), "iloc"),
  257. pytest.param(operator.itemgetter(([0, 0], 0)), "iloc"),
  258. ],
  259. )
  260. def test_getitem_raises(self, getter, target):
  261. df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags(
  262. allows_duplicate_labels=False
  263. )
  264. if target:
  265. # df, df.loc, or df.iloc
  266. target = getattr(df, target)
  267. else:
  268. target = df
  269. msg = "Index has duplicates."
  270. with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
  271. getter(target)
  272. @pytest.mark.parametrize(
  273. "objs, kwargs",
  274. [
  275. (
  276. [
  277. pd.Series(1, index=[0, 1], name="a"),
  278. pd.Series(2, index=[0, 1], name="a"),
  279. ],
  280. {"axis": 1},
  281. )
  282. ],
  283. )
  284. def test_concat_raises(self, objs, kwargs):
  285. objs = [x.set_flags(allows_duplicate_labels=False) for x in objs]
  286. msg = "Index has duplicates."
  287. with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
  288. pd.concat(objs, **kwargs)
  289. @not_implemented
  290. def test_merge_raises(self):
  291. a = pd.DataFrame({"A": [0, 1, 2]}, index=["a", "b", "c"]).set_flags(
  292. allows_duplicate_labels=False
  293. )
  294. b = pd.DataFrame({"B": [0, 1, 2]}, index=["a", "b", "b"])
  295. msg = "Index has duplicates."
  296. with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
  297. pd.merge(a, b, left_index=True, right_index=True)
  298. @pytest.mark.parametrize(
  299. "idx",
  300. [
  301. pd.Index([1, 1]),
  302. pd.Index(["a", "a"]),
  303. pd.Index([1.1, 1.1]),
  304. pd.PeriodIndex([pd.Period("2000", "D")] * 2),
  305. pd.DatetimeIndex([pd.Timestamp("2000")] * 2),
  306. pd.TimedeltaIndex([pd.Timedelta("1D")] * 2),
  307. pd.CategoricalIndex(["a", "a"]),
  308. pd.IntervalIndex([pd.Interval(0, 1)] * 2),
  309. pd.MultiIndex.from_tuples([("a", 1), ("a", 1)]),
  310. ],
  311. ids=lambda x: type(x).__name__,
  312. )
  313. def test_raises_basic(idx):
  314. msg = "Index has duplicates."
  315. with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
  316. pd.Series(1, index=idx).set_flags(allows_duplicate_labels=False)
  317. with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
  318. pd.DataFrame({"A": [1, 1]}, index=idx).set_flags(allows_duplicate_labels=False)
  319. with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
  320. pd.DataFrame([[1, 2]], columns=idx).set_flags(allows_duplicate_labels=False)
  321. def test_format_duplicate_labels_message():
  322. idx = pd.Index(["a", "b", "a", "b", "c"])
  323. result = idx._format_duplicate_message()
  324. expected = pd.DataFrame(
  325. {"positions": [[0, 2], [1, 3]]}, index=pd.Index(["a", "b"], name="label")
  326. )
  327. tm.assert_frame_equal(result, expected)
  328. def test_format_duplicate_labels_message_multi():
  329. idx = pd.MultiIndex.from_product([["A"], ["a", "b", "a", "b", "c"]])
  330. result = idx._format_duplicate_message()
  331. expected = pd.DataFrame(
  332. {"positions": [[0, 2], [1, 3]]},
  333. index=pd.MultiIndex.from_product([["A"], ["a", "b"]]),
  334. )
  335. tm.assert_frame_equal(result, expected)
  336. def test_dataframe_insert_raises():
  337. df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False)
  338. msg = "Cannot specify"
  339. with pytest.raises(ValueError, match=msg):
  340. df.insert(0, "A", [3, 4], allow_duplicates=True)
  341. @pytest.mark.parametrize(
  342. "method, frame_only",
  343. [
  344. (operator.methodcaller("set_index", "A", inplace=True), True),
  345. (operator.methodcaller("reset_index", inplace=True), True),
  346. (operator.methodcaller("rename", lambda x: x, inplace=True), False),
  347. ],
  348. )
  349. def test_inplace_raises(method, frame_only):
  350. df = pd.DataFrame({"A": [0, 0], "B": [1, 2]}).set_flags(
  351. allows_duplicate_labels=False
  352. )
  353. s = df["A"]
  354. s.flags.allows_duplicate_labels = False
  355. msg = "Cannot specify"
  356. with pytest.raises(ValueError, match=msg):
  357. method(df)
  358. if not frame_only:
  359. with pytest.raises(ValueError, match=msg):
  360. method(s)
  361. def test_pickle():
  362. a = pd.Series([1, 2]).set_flags(allows_duplicate_labels=False)
  363. b = tm.round_trip_pickle(a)
  364. tm.assert_series_equal(a, b)
  365. a = pd.DataFrame({"A": []}).set_flags(allows_duplicate_labels=False)
  366. b = tm.round_trip_pickle(a)
  367. tm.assert_frame_equal(a, b)