# test_isin.py 8.1 KB
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. from pandas import (
  5. Series,
  6. date_range,
  7. )
  8. import pandas._testing as tm
  9. from pandas.core import algorithms
  10. from pandas.core.arrays import PeriodArray
  11. class TestSeriesIsIn:
  12. def test_isin(self):
  13. s = Series(["A", "B", "C", "a", "B", "B", "A", "C"])
  14. result = s.isin(["A", "C"])
  15. expected = Series([True, False, True, False, False, False, True, True])
  16. tm.assert_series_equal(result, expected)
  17. # GH#16012
  18. # This specific issue has to have a series over 1e6 in len, but the
  19. # comparison array (in_list) must be large enough so that numpy doesn't
  20. # do a manual masking trick that will avoid this issue altogether
  21. s = Series(list("abcdefghijk" * 10**5))
  22. # If numpy doesn't do the manual comparison/mask, these
  23. # unorderable mixed types are what cause the exception in numpy
  24. in_list = [-1, "a", "b", "G", "Y", "Z", "E", "K", "E", "S", "I", "R", "R"] * 6
  25. assert s.isin(in_list).sum() == 200000
  26. def test_isin_with_string_scalar(self):
  27. # GH#4763
  28. s = Series(["A", "B", "C", "a", "B", "B", "A", "C"])
  29. msg = (
  30. r"only list-like objects are allowed to be passed to isin\(\), "
  31. r"you passed a `str`"
  32. )
  33. with pytest.raises(TypeError, match=msg):
  34. s.isin("a")
  35. s = Series(["aaa", "b", "c"])
  36. with pytest.raises(TypeError, match=msg):
  37. s.isin("aaa")
  38. def test_isin_datetimelike_mismatched_reso(self):
  39. expected = Series([True, True, False, False, False])
  40. ser = Series(date_range("jan-01-2013", "jan-05-2013"))
  41. # fails on dtype conversion in the first place
  42. day_values = np.asarray(ser[0:2].values).astype("datetime64[D]")
  43. result = ser.isin(day_values)
  44. tm.assert_series_equal(result, expected)
  45. dta = ser[:2]._values.astype("M8[s]")
  46. result = ser.isin(dta)
  47. tm.assert_series_equal(result, expected)
  48. def test_isin_datetimelike_mismatched_reso_list(self):
  49. expected = Series([True, True, False, False, False])
  50. ser = Series(date_range("jan-01-2013", "jan-05-2013"))
  51. dta = ser[:2]._values.astype("M8[s]")
  52. result = ser.isin(list(dta))
  53. tm.assert_series_equal(result, expected)
  54. def test_isin_with_i8(self):
  55. # GH#5021
  56. expected = Series([True, True, False, False, False])
  57. expected2 = Series([False, True, False, False, False])
  58. # datetime64[ns]
  59. s = Series(date_range("jan-01-2013", "jan-05-2013"))
  60. result = s.isin(s[0:2])
  61. tm.assert_series_equal(result, expected)
  62. result = s.isin(s[0:2].values)
  63. tm.assert_series_equal(result, expected)
  64. result = s.isin([s[1]])
  65. tm.assert_series_equal(result, expected2)
  66. result = s.isin([np.datetime64(s[1])])
  67. tm.assert_series_equal(result, expected2)
  68. result = s.isin(set(s[0:2]))
  69. tm.assert_series_equal(result, expected)
  70. # timedelta64[ns]
  71. s = Series(pd.to_timedelta(range(5), unit="d"))
  72. result = s.isin(s[0:2])
  73. tm.assert_series_equal(result, expected)
  74. @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])])
  75. def test_isin_empty(self, empty):
  76. # see GH#16991
  77. s = Series(["a", "b"])
  78. expected = Series([False, False])
  79. result = s.isin(empty)
  80. tm.assert_series_equal(expected, result)
  81. def test_isin_read_only(self):
  82. # https://github.com/pandas-dev/pandas/issues/37174
  83. arr = np.array([1, 2, 3])
  84. arr.setflags(write=False)
  85. s = Series([1, 2, 3])
  86. result = s.isin(arr)
  87. expected = Series([True, True, True])
  88. tm.assert_series_equal(result, expected)
  89. @pytest.mark.parametrize("dtype", [object, None])
  90. def test_isin_dt64_values_vs_ints(self, dtype):
  91. # GH#36621 dont cast integers to datetimes for isin
  92. dti = date_range("2013-01-01", "2013-01-05")
  93. ser = Series(dti)
  94. comps = np.asarray([1356998400000000000], dtype=dtype)
  95. res = dti.isin(comps)
  96. expected = np.array([False] * len(dti), dtype=bool)
  97. tm.assert_numpy_array_equal(res, expected)
  98. res = ser.isin(comps)
  99. tm.assert_series_equal(res, Series(expected))
  100. res = pd.core.algorithms.isin(ser, comps)
  101. tm.assert_numpy_array_equal(res, expected)
  102. def test_isin_tzawareness_mismatch(self):
  103. dti = date_range("2013-01-01", "2013-01-05")
  104. ser = Series(dti)
  105. other = dti.tz_localize("UTC")
  106. res = dti.isin(other)
  107. expected = np.array([False] * len(dti), dtype=bool)
  108. tm.assert_numpy_array_equal(res, expected)
  109. res = ser.isin(other)
  110. tm.assert_series_equal(res, Series(expected))
  111. res = pd.core.algorithms.isin(ser, other)
  112. tm.assert_numpy_array_equal(res, expected)
  113. def test_isin_period_freq_mismatch(self):
  114. dti = date_range("2013-01-01", "2013-01-05")
  115. pi = dti.to_period("M")
  116. ser = Series(pi)
  117. # We construct another PeriodIndex with the same i8 values
  118. # but different dtype
  119. dtype = dti.to_period("Y").dtype
  120. other = PeriodArray._simple_new(pi.asi8, dtype=dtype)
  121. res = pi.isin(other)
  122. expected = np.array([False] * len(pi), dtype=bool)
  123. tm.assert_numpy_array_equal(res, expected)
  124. res = ser.isin(other)
  125. tm.assert_series_equal(res, Series(expected))
  126. res = pd.core.algorithms.isin(ser, other)
  127. tm.assert_numpy_array_equal(res, expected)
  128. @pytest.mark.parametrize("values", [[-9.0, 0.0], [-9, 0]])
  129. def test_isin_float_in_int_series(self, values):
  130. # GH#19356 GH#21804
  131. ser = Series(values)
  132. result = ser.isin([-9, -0.5])
  133. expected = Series([True, False])
  134. tm.assert_series_equal(result, expected)
  135. @pytest.mark.parametrize("dtype", ["boolean", "Int64", "Float64"])
  136. @pytest.mark.parametrize(
  137. "data,values,expected",
  138. [
  139. ([0, 1, 0], [1], [False, True, False]),
  140. ([0, 1, 0], [1, pd.NA], [False, True, False]),
  141. ([0, pd.NA, 0], [1, 0], [True, False, True]),
  142. ([0, 1, pd.NA], [1, pd.NA], [False, True, True]),
  143. ([0, 1, pd.NA], [1, np.nan], [False, True, False]),
  144. ([0, pd.NA, pd.NA], [np.nan, pd.NaT, None], [False, False, False]),
  145. ],
  146. )
  147. def test_isin_masked_types(self, dtype, data, values, expected):
  148. # GH#42405
  149. ser = Series(data, dtype=dtype)
  150. result = ser.isin(values)
  151. expected = Series(expected, dtype="boolean")
  152. tm.assert_series_equal(result, expected)
  153. def test_isin_large_series_mixed_dtypes_and_nan(monkeypatch):
  154. # https://github.com/pandas-dev/pandas/issues/37094
  155. # combination of object dtype for the values
  156. # and > _MINIMUM_COMP_ARR_LEN elements
  157. min_isin_comp = 5
  158. ser = Series([1, 2, np.nan] * min_isin_comp)
  159. with monkeypatch.context() as m:
  160. m.setattr(algorithms, "_MINIMUM_COMP_ARR_LEN", min_isin_comp)
  161. result = ser.isin({"foo", "bar"})
  162. expected = Series([False] * 3 * min_isin_comp)
  163. tm.assert_series_equal(result, expected)
  164. @pytest.mark.parametrize(
  165. "array,expected",
  166. [
  167. (
  168. [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j],
  169. Series([False, True, True, False, True, True, True], dtype=bool),
  170. )
  171. ],
  172. )
  173. def test_isin_complex_numbers(array, expected):
  174. # GH 17927
  175. result = Series(array).isin([1j, 1 + 1j, 1 + 2j])
  176. tm.assert_series_equal(result, expected)
  177. @pytest.mark.parametrize(
  178. "data,is_in",
  179. [([1, [2]], [1]), (["simple str", [{"values": 3}]], ["simple str"])],
  180. )
  181. def test_isin_filtering_with_mixed_object_types(data, is_in):
  182. # GH 20883
  183. ser = Series(data)
  184. result = ser.isin(is_in)
  185. expected = Series([True, False])
  186. tm.assert_series_equal(result, expected)
  187. @pytest.mark.parametrize("data", [[1, 2, 3], [1.0, 2.0, 3.0]])
  188. @pytest.mark.parametrize("isin", [[1, 2], [1.0, 2.0]])
  189. def test_isin_filtering_on_iterable(data, isin):
  190. # GH 50234
  191. ser = Series(data)
  192. result = ser.isin(i for i in isin)
  193. expected_result = Series([True, True, False])
  194. tm.assert_series_equal(result, expected_result)