test_unique.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. import pandas._testing as tm
  5. from pandas.tests.base.common import allow_na_ops
  6. @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
  7. def test_unique(index_or_series_obj):
  8. obj = index_or_series_obj
  9. obj = np.repeat(obj, range(1, len(obj) + 1))
  10. result = obj.unique()
  11. # dict.fromkeys preserves the order
  12. unique_values = list(dict.fromkeys(obj.values))
  13. if isinstance(obj, pd.MultiIndex):
  14. expected = pd.MultiIndex.from_tuples(unique_values)
  15. expected.names = obj.names
  16. tm.assert_index_equal(result, expected, exact=True)
  17. elif isinstance(obj, pd.Index):
  18. expected = pd.Index(unique_values, dtype=obj.dtype)
  19. if isinstance(obj.dtype, pd.DatetimeTZDtype):
  20. expected = expected.normalize()
  21. tm.assert_index_equal(result, expected, exact=True)
  22. else:
  23. expected = np.array(unique_values)
  24. tm.assert_numpy_array_equal(result, expected)
  25. @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
  26. @pytest.mark.parametrize("null_obj", [np.nan, None])
  27. def test_unique_null(null_obj, index_or_series_obj):
  28. obj = index_or_series_obj
  29. if not allow_na_ops(obj):
  30. pytest.skip("type doesn't allow for NA operations")
  31. elif len(obj) < 1:
  32. pytest.skip("Test doesn't make sense on empty data")
  33. elif isinstance(obj, pd.MultiIndex):
  34. pytest.skip(f"MultiIndex can't hold '{null_obj}'")
  35. values = obj._values
  36. values[0:2] = null_obj
  37. klass = type(obj)
  38. repeated_values = np.repeat(values, range(1, len(values) + 1))
  39. obj = klass(repeated_values, dtype=obj.dtype)
  40. result = obj.unique()
  41. unique_values_raw = dict.fromkeys(obj.values)
  42. # because np.nan == np.nan is False, but None == None is True
  43. # np.nan would be duplicated, whereas None wouldn't
  44. unique_values_not_null = [val for val in unique_values_raw if not pd.isnull(val)]
  45. unique_values = [null_obj] + unique_values_not_null
  46. if isinstance(obj, pd.Index):
  47. expected = pd.Index(unique_values, dtype=obj.dtype)
  48. if isinstance(obj.dtype, pd.DatetimeTZDtype):
  49. result = result.normalize()
  50. expected = expected.normalize()
  51. tm.assert_index_equal(result, expected, exact=True)
  52. else:
  53. expected = np.array(unique_values, dtype=obj.dtype)
  54. tm.assert_numpy_array_equal(result, expected)
  55. def test_nunique(index_or_series_obj):
  56. obj = index_or_series_obj
  57. obj = np.repeat(obj, range(1, len(obj) + 1))
  58. expected = len(obj.unique())
  59. assert obj.nunique(dropna=False) == expected
  60. @pytest.mark.parametrize("null_obj", [np.nan, None])
  61. def test_nunique_null(null_obj, index_or_series_obj):
  62. obj = index_or_series_obj
  63. if not allow_na_ops(obj):
  64. pytest.skip("type doesn't allow for NA operations")
  65. elif isinstance(obj, pd.MultiIndex):
  66. pytest.skip(f"MultiIndex can't hold '{null_obj}'")
  67. values = obj._values
  68. values[0:2] = null_obj
  69. klass = type(obj)
  70. repeated_values = np.repeat(values, range(1, len(values) + 1))
  71. obj = klass(repeated_values, dtype=obj.dtype)
  72. if isinstance(obj, pd.CategoricalIndex):
  73. assert obj.nunique() == len(obj.categories)
  74. assert obj.nunique(dropna=False) == len(obj.categories) + 1
  75. else:
  76. num_unique_values = len(obj.unique())
  77. assert obj.nunique() == max(0, num_unique_values - 1)
  78. assert obj.nunique(dropna=False) == max(0, num_unique_values)
  79. @pytest.mark.single_cpu
  80. def test_unique_bad_unicode(index_or_series):
  81. # regression test for #34550
  82. uval = "\ud83d" # smiley emoji
  83. obj = index_or_series([uval] * 2, dtype=object)
  84. result = obj.unique()
  85. if isinstance(obj, pd.Index):
  86. expected = pd.Index(["\ud83d"], dtype=object)
  87. tm.assert_index_equal(result, expected, exact=True)
  88. else:
  89. expected = np.array(["\ud83d"], dtype=object)
  90. tm.assert_numpy_array_equal(result, expected)
  91. @pytest.mark.parametrize("dropna", [True, False])
  92. def test_nunique_dropna(dropna):
  93. # GH37566
  94. ser = pd.Series(["yes", "yes", pd.NA, np.nan, None, pd.NaT])
  95. res = ser.nunique(dropna)
  96. assert res == 1 if dropna else 5