test_duplicated.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. import re
  2. import sys
  3. import numpy as np
  4. import pytest
  5. from pandas import (
  6. DataFrame,
  7. Series,
  8. date_range,
  9. )
  10. import pandas._testing as tm
  11. @pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]])
  12. def test_duplicated_with_misspelled_column_name(subset):
  13. # GH 19730
  14. df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
  15. msg = re.escape("Index(['a'], dtype=")
  16. with pytest.raises(KeyError, match=msg):
  17. df.duplicated(subset)
  18. def test_duplicated_implemented_no_recursion():
  19. # gh-21524
  20. # Ensure duplicated isn't implemented using recursion that
  21. # can fail on wide frames
  22. df = DataFrame(np.random.default_rng(2).integers(0, 1000, (10, 1000)))
  23. rec_limit = sys.getrecursionlimit()
  24. try:
  25. sys.setrecursionlimit(100)
  26. result = df.duplicated()
  27. finally:
  28. sys.setrecursionlimit(rec_limit)
  29. # Then duplicates produce the bool Series as a result and don't fail during
  30. # calculation. Actual values doesn't matter here, though usually it's all
  31. # False in this case
  32. assert isinstance(result, Series)
  33. assert result.dtype == np.bool_
  34. @pytest.mark.parametrize(
  35. "keep, expected",
  36. [
  37. ("first", Series([False, False, True, False, True])),
  38. ("last", Series([True, True, False, False, False])),
  39. (False, Series([True, True, True, False, True])),
  40. ],
  41. )
  42. def test_duplicated_keep(keep, expected):
  43. df = DataFrame({"A": [0, 1, 1, 2, 0], "B": ["a", "b", "b", "c", "a"]})
  44. result = df.duplicated(keep=keep)
  45. tm.assert_series_equal(result, expected)
  46. @pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal")
  47. @pytest.mark.parametrize(
  48. "keep, expected",
  49. [
  50. ("first", Series([False, False, True, False, True])),
  51. ("last", Series([True, True, False, False, False])),
  52. (False, Series([True, True, True, False, True])),
  53. ],
  54. )
  55. def test_duplicated_nan_none(keep, expected):
  56. df = DataFrame({"C": [np.nan, 3, 3, None, np.nan], "x": 1}, dtype=object)
  57. result = df.duplicated(keep=keep)
  58. tm.assert_series_equal(result, expected)
  59. @pytest.mark.parametrize("subset", [None, ["A", "B"], "A"])
  60. def test_duplicated_subset(subset, keep):
  61. df = DataFrame(
  62. {
  63. "A": [0, 1, 1, 2, 0],
  64. "B": ["a", "b", "b", "c", "a"],
  65. "C": [np.nan, 3, 3, None, np.nan],
  66. }
  67. )
  68. if subset is None:
  69. subset = list(df.columns)
  70. elif isinstance(subset, str):
  71. # need to have a DataFrame, not a Series
  72. # -> select columns with singleton list, not string
  73. subset = [subset]
  74. expected = df[subset].duplicated(keep=keep)
  75. result = df.duplicated(keep=keep, subset=subset)
  76. tm.assert_series_equal(result, expected)
  77. def test_duplicated_on_empty_frame():
  78. # GH 25184
  79. df = DataFrame(columns=["a", "b"])
  80. dupes = df.duplicated("a")
  81. result = df[dupes]
  82. expected = df.copy()
  83. tm.assert_frame_equal(result, expected)
  84. def test_frame_datetime64_duplicated():
  85. dates = date_range("2010-07-01", end="2010-08-05")
  86. tst = DataFrame({"symbol": "AAA", "date": dates})
  87. result = tst.duplicated(["date", "symbol"])
  88. assert (-result).all()
  89. tst = DataFrame({"date": dates})
  90. result = tst.date.duplicated()
  91. assert (-result).all()