methods.py 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720
  1. import inspect
  2. import operator
  3. import numpy as np
  4. import pytest
  5. from pandas._typing import Dtype
  6. from pandas.core.dtypes.common import is_bool_dtype
  7. from pandas.core.dtypes.dtypes import NumpyEADtype
  8. from pandas.core.dtypes.missing import na_value_for_dtype
  9. import pandas as pd
  10. import pandas._testing as tm
  11. from pandas.core.sorting import nargsort
  12. class BaseMethodsTests:
  13. """Various Series and DataFrame methods."""
  14. def test_hash_pandas_object(self, data):
  15. # _hash_pandas_object should return a uint64 ndarray of the same length
  16. # as the data
  17. from pandas.core.util.hashing import _default_hash_key
  18. res = data._hash_pandas_object(
  19. encoding="utf-8", hash_key=_default_hash_key, categorize=False
  20. )
  21. assert res.dtype == np.uint64
  22. assert res.shape == data.shape
  23. def test_value_counts_default_dropna(self, data):
  24. # make sure we have consistent default dropna kwarg
  25. if not hasattr(data, "value_counts"):
  26. pytest.skip(f"value_counts is not implemented for {type(data)}")
  27. sig = inspect.signature(data.value_counts)
  28. kwarg = sig.parameters["dropna"]
  29. assert kwarg.default is True
  30. @pytest.mark.parametrize("dropna", [True, False])
  31. def test_value_counts(self, all_data, dropna):
  32. all_data = all_data[:10]
  33. if dropna:
  34. other = all_data[~all_data.isna()]
  35. else:
  36. other = all_data
  37. result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
  38. expected = pd.Series(other).value_counts(dropna=dropna).sort_index()
  39. tm.assert_series_equal(result, expected)
  40. def test_value_counts_with_normalize(self, data):
  41. # GH 33172
  42. data = data[:10].unique()
  43. values = np.array(data[~data.isna()])
  44. ser = pd.Series(data, dtype=data.dtype)
  45. result = ser.value_counts(normalize=True).sort_index()
  46. if not isinstance(data, pd.Categorical):
  47. expected = pd.Series(
  48. [1 / len(values)] * len(values), index=result.index, name="proportion"
  49. )
  50. else:
  51. expected = pd.Series(0.0, index=result.index, name="proportion")
  52. expected[result > 0] = 1 / len(values)
  53. if isinstance(data.dtype, pd.StringDtype) and data.dtype.na_value is np.nan:
  54. # TODO: avoid special-casing
  55. expected = expected.astype("float64")
  56. elif getattr(data.dtype, "storage", "") == "pyarrow" or isinstance(
  57. data.dtype, pd.ArrowDtype
  58. ):
  59. # TODO: avoid special-casing
  60. expected = expected.astype("double[pyarrow]")
  61. elif na_value_for_dtype(data.dtype) is pd.NA:
  62. # TODO(GH#44692): avoid special-casing
  63. expected = expected.astype("Float64")
  64. tm.assert_series_equal(result, expected)
  65. def test_count(self, data_missing):
  66. df = pd.DataFrame({"A": data_missing})
  67. result = df.count(axis="columns")
  68. expected = pd.Series([0, 1])
  69. tm.assert_series_equal(result, expected)
  70. def test_series_count(self, data_missing):
  71. # GH#26835
  72. ser = pd.Series(data_missing)
  73. result = ser.count()
  74. expected = 1
  75. assert result == expected
  76. def test_apply_simple_series(self, data):
  77. result = pd.Series(data).apply(id)
  78. assert isinstance(result, pd.Series)
  79. @pytest.mark.parametrize("na_action", [None, "ignore"])
  80. def test_map(self, data_missing, na_action):
  81. result = data_missing.map(lambda x: x, na_action=na_action)
  82. expected = data_missing.to_numpy()
  83. tm.assert_numpy_array_equal(result, expected)
  84. def test_argsort(self, data_for_sorting):
  85. result = pd.Series(data_for_sorting).argsort()
  86. # argsort result gets passed to take, so should be np.intp
  87. expected = pd.Series(np.array([2, 0, 1], dtype=np.intp))
  88. tm.assert_series_equal(result, expected)
  89. def test_argsort_missing_array(self, data_missing_for_sorting):
  90. result = data_missing_for_sorting.argsort()
  91. # argsort result gets passed to take, so should be np.intp
  92. expected = np.array([2, 0, 1], dtype=np.intp)
  93. tm.assert_numpy_array_equal(result, expected)
  94. def test_argsort_missing(self, data_missing_for_sorting):
  95. msg = "The behavior of Series.argsort in the presence of NA values"
  96. with tm.assert_produces_warning(FutureWarning, match=msg):
  97. result = pd.Series(data_missing_for_sorting).argsort()
  98. expected = pd.Series(np.array([1, -1, 0], dtype=np.intp))
  99. tm.assert_series_equal(result, expected)
  100. def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value):
  101. # GH 24382
  102. is_bool = data_for_sorting.dtype._is_boolean
  103. exp_argmax = 1
  104. exp_argmax_repeated = 3
  105. if is_bool:
  106. # See data_for_sorting docstring
  107. exp_argmax = 0
  108. exp_argmax_repeated = 1
  109. # data_for_sorting -> [B, C, A] with A < B < C
  110. assert data_for_sorting.argmax() == exp_argmax
  111. assert data_for_sorting.argmin() == 2
  112. # with repeated values -> first occurrence
  113. data = data_for_sorting.take([2, 0, 0, 1, 1, 2])
  114. assert data.argmax() == exp_argmax_repeated
  115. assert data.argmin() == 0
  116. # with missing values
  117. # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
  118. assert data_missing_for_sorting.argmax() == 0
  119. assert data_missing_for_sorting.argmin() == 2
  120. @pytest.mark.parametrize("method", ["argmax", "argmin"])
  121. def test_argmin_argmax_empty_array(self, method, data):
  122. # GH 24382
  123. err_msg = "attempt to get"
  124. with pytest.raises(ValueError, match=err_msg):
  125. getattr(data[:0], method)()
  126. @pytest.mark.parametrize("method", ["argmax", "argmin"])
  127. def test_argmin_argmax_all_na(self, method, data, na_value):
  128. # all missing with skipna=True is the same as empty
  129. err_msg = "attempt to get"
  130. data_na = type(data)._from_sequence([na_value, na_value], dtype=data.dtype)
  131. with pytest.raises(ValueError, match=err_msg):
  132. getattr(data_na, method)()
  133. @pytest.mark.parametrize(
  134. "op_name, skipna, expected",
  135. [
  136. ("idxmax", True, 0),
  137. ("idxmin", True, 2),
  138. ("argmax", True, 0),
  139. ("argmin", True, 2),
  140. ("idxmax", False, np.nan),
  141. ("idxmin", False, np.nan),
  142. ("argmax", False, -1),
  143. ("argmin", False, -1),
  144. ],
  145. )
  146. def test_argreduce_series(
  147. self, data_missing_for_sorting, op_name, skipna, expected
  148. ):
  149. # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
  150. warn = None
  151. msg = "The behavior of Series.argmax/argmin"
  152. if op_name.startswith("arg") and expected == -1:
  153. warn = FutureWarning
  154. if op_name.startswith("idx") and np.isnan(expected):
  155. warn = FutureWarning
  156. msg = f"The behavior of Series.{op_name}"
  157. ser = pd.Series(data_missing_for_sorting)
  158. with tm.assert_produces_warning(warn, match=msg):
  159. result = getattr(ser, op_name)(skipna=skipna)
  160. tm.assert_almost_equal(result, expected)
  161. def test_argmax_argmin_no_skipna_notimplemented(self, data_missing_for_sorting):
  162. # GH#38733
  163. data = data_missing_for_sorting
  164. with pytest.raises(NotImplementedError, match=""):
  165. data.argmin(skipna=False)
  166. with pytest.raises(NotImplementedError, match=""):
  167. data.argmax(skipna=False)
  168. @pytest.mark.parametrize(
  169. "na_position, expected",
  170. [
  171. ("last", np.array([2, 0, 1], dtype=np.dtype("intp"))),
  172. ("first", np.array([1, 2, 0], dtype=np.dtype("intp"))),
  173. ],
  174. )
  175. def test_nargsort(self, data_missing_for_sorting, na_position, expected):
  176. # GH 25439
  177. result = nargsort(data_missing_for_sorting, na_position=na_position)
  178. tm.assert_numpy_array_equal(result, expected)
  179. @pytest.mark.parametrize("ascending", [True, False])
  180. def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
  181. ser = pd.Series(data_for_sorting)
  182. result = ser.sort_values(ascending=ascending, key=sort_by_key)
  183. expected = ser.iloc[[2, 0, 1]]
  184. if not ascending:
  185. # GH 35922. Expect stable sort
  186. if ser.nunique() == 2:
  187. expected = ser.iloc[[0, 1, 2]]
  188. else:
  189. expected = ser.iloc[[1, 0, 2]]
  190. tm.assert_series_equal(result, expected)
  191. @pytest.mark.parametrize("ascending", [True, False])
  192. def test_sort_values_missing(
  193. self, data_missing_for_sorting, ascending, sort_by_key
  194. ):
  195. ser = pd.Series(data_missing_for_sorting)
  196. result = ser.sort_values(ascending=ascending, key=sort_by_key)
  197. if ascending:
  198. expected = ser.iloc[[2, 0, 1]]
  199. else:
  200. expected = ser.iloc[[0, 2, 1]]
  201. tm.assert_series_equal(result, expected)
  202. @pytest.mark.parametrize("ascending", [True, False])
  203. def test_sort_values_frame(self, data_for_sorting, ascending):
  204. df = pd.DataFrame({"A": [1, 2, 1], "B": data_for_sorting})
  205. result = df.sort_values(["A", "B"])
  206. expected = pd.DataFrame(
  207. {"A": [1, 1, 2], "B": data_for_sorting.take([2, 0, 1])}, index=[2, 0, 1]
  208. )
  209. tm.assert_frame_equal(result, expected)
  210. @pytest.mark.parametrize("keep", ["first", "last", False])
  211. def test_duplicated(self, data, keep):
  212. arr = data.take([0, 1, 0, 1])
  213. result = arr.duplicated(keep=keep)
  214. if keep == "first":
  215. expected = np.array([False, False, True, True])
  216. elif keep == "last":
  217. expected = np.array([True, True, False, False])
  218. else:
  219. expected = np.array([True, True, True, True])
  220. tm.assert_numpy_array_equal(result, expected)
  221. @pytest.mark.parametrize("box", [pd.Series, lambda x: x])
  222. @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique])
  223. def test_unique(self, data, box, method):
  224. duplicated = box(data._from_sequence([data[0], data[0]], dtype=data.dtype))
  225. result = method(duplicated)
  226. assert len(result) == 1
  227. assert isinstance(result, type(data))
  228. assert result[0] == duplicated[0]
  229. def test_factorize(self, data_for_grouping):
  230. codes, uniques = pd.factorize(data_for_grouping, use_na_sentinel=True)
  231. is_bool = data_for_grouping.dtype._is_boolean
  232. if is_bool:
  233. # only 2 unique values
  234. expected_codes = np.array([0, 0, -1, -1, 1, 1, 0, 0], dtype=np.intp)
  235. expected_uniques = data_for_grouping.take([0, 4])
  236. else:
  237. expected_codes = np.array([0, 0, -1, -1, 1, 1, 0, 2], dtype=np.intp)
  238. expected_uniques = data_for_grouping.take([0, 4, 7])
  239. tm.assert_numpy_array_equal(codes, expected_codes)
  240. tm.assert_extension_array_equal(uniques, expected_uniques)
  241. def test_factorize_equivalence(self, data_for_grouping):
  242. codes_1, uniques_1 = pd.factorize(data_for_grouping, use_na_sentinel=True)
  243. codes_2, uniques_2 = data_for_grouping.factorize(use_na_sentinel=True)
  244. tm.assert_numpy_array_equal(codes_1, codes_2)
  245. tm.assert_extension_array_equal(uniques_1, uniques_2)
  246. assert len(uniques_1) == len(pd.unique(uniques_1))
  247. assert uniques_1.dtype == data_for_grouping.dtype
  248. def test_factorize_empty(self, data):
  249. codes, uniques = pd.factorize(data[:0])
  250. expected_codes = np.array([], dtype=np.intp)
  251. expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype)
  252. tm.assert_numpy_array_equal(codes, expected_codes)
  253. tm.assert_extension_array_equal(uniques, expected_uniques)
  254. def test_fillna_copy_frame(self, data_missing):
  255. arr = data_missing.take([1, 1])
  256. df = pd.DataFrame({"A": arr})
  257. df_orig = df.copy()
  258. filled_val = df.iloc[0, 0]
  259. result = df.fillna(filled_val)
  260. result.iloc[0, 0] = filled_val
  261. tm.assert_frame_equal(df, df_orig)
  262. def test_fillna_copy_series(self, data_missing):
  263. arr = data_missing.take([1, 1])
  264. ser = pd.Series(arr, copy=False)
  265. ser_orig = ser.copy()
  266. filled_val = ser[0]
  267. result = ser.fillna(filled_val)
  268. result.iloc[0] = filled_val
  269. tm.assert_series_equal(ser, ser_orig)
  270. def test_fillna_length_mismatch(self, data_missing):
  271. msg = "Length of 'value' does not match."
  272. with pytest.raises(ValueError, match=msg):
  273. data_missing.fillna(data_missing.take([1]))
  274. # Subclasses can override if we expect e.g Sparse[bool], boolean, pyarrow[bool]
  275. _combine_le_expected_dtype: Dtype = NumpyEADtype("bool")
  276. def test_combine_le(self, data_repeated):
  277. # GH 20825
  278. # Test that combine works when doing a <= (le) comparison
  279. orig_data1, orig_data2 = data_repeated(2)
  280. s1 = pd.Series(orig_data1)
  281. s2 = pd.Series(orig_data2)
  282. result = s1.combine(s2, lambda x1, x2: x1 <= x2)
  283. expected = pd.Series(
  284. pd.array(
  285. [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))],
  286. dtype=self._combine_le_expected_dtype,
  287. )
  288. )
  289. tm.assert_series_equal(result, expected)
  290. val = s1.iloc[0]
  291. result = s1.combine(val, lambda x1, x2: x1 <= x2)
  292. expected = pd.Series(
  293. pd.array(
  294. [a <= val for a in list(orig_data1)],
  295. dtype=self._combine_le_expected_dtype,
  296. )
  297. )
  298. tm.assert_series_equal(result, expected)
  299. def test_combine_add(self, data_repeated):
  300. # GH 20825
  301. orig_data1, orig_data2 = data_repeated(2)
  302. s1 = pd.Series(orig_data1)
  303. s2 = pd.Series(orig_data2)
  304. # Check if the operation is supported pointwise for our scalars. If not,
  305. # we will expect Series.combine to raise as well.
  306. try:
  307. with np.errstate(over="ignore"):
  308. expected = pd.Series(
  309. orig_data1._from_sequence(
  310. [a + b for (a, b) in zip(list(orig_data1), list(orig_data2))]
  311. )
  312. )
  313. except TypeError:
  314. # If the operation is not supported pointwise for our scalars,
  315. # then Series.combine should also raise
  316. with pytest.raises(TypeError):
  317. s1.combine(s2, lambda x1, x2: x1 + x2)
  318. return
  319. result = s1.combine(s2, lambda x1, x2: x1 + x2)
  320. tm.assert_series_equal(result, expected)
  321. val = s1.iloc[0]
  322. result = s1.combine(val, lambda x1, x2: x1 + x2)
  323. expected = pd.Series(
  324. orig_data1._from_sequence([a + val for a in list(orig_data1)])
  325. )
  326. tm.assert_series_equal(result, expected)
  327. def test_combine_first(self, data):
  328. # https://github.com/pandas-dev/pandas/issues/24147
  329. a = pd.Series(data[:3])
  330. b = pd.Series(data[2:5], index=[2, 3, 4])
  331. result = a.combine_first(b)
  332. expected = pd.Series(data[:5])
  333. tm.assert_series_equal(result, expected)
  334. @pytest.mark.parametrize("frame", [True, False])
  335. @pytest.mark.parametrize(
  336. "periods, indices",
  337. [(-2, [2, 3, 4, -1, -1]), (0, [0, 1, 2, 3, 4]), (2, [-1, -1, 0, 1, 2])],
  338. )
  339. def test_container_shift(self, data, frame, periods, indices):
  340. # https://github.com/pandas-dev/pandas/issues/22386
  341. subset = data[:5]
  342. data = pd.Series(subset, name="A")
  343. expected = pd.Series(subset.take(indices, allow_fill=True), name="A")
  344. if frame:
  345. result = data.to_frame(name="A").assign(B=1).shift(periods)
  346. expected = pd.concat(
  347. [expected, pd.Series([1] * 5, name="B").shift(periods)], axis=1
  348. )
  349. compare = tm.assert_frame_equal
  350. else:
  351. result = data.shift(periods)
  352. compare = tm.assert_series_equal
  353. compare(result, expected)
  354. def test_shift_0_periods(self, data):
  355. # GH#33856 shifting with periods=0 should return a copy, not same obj
  356. result = data.shift(0)
  357. assert data[0] != data[1] # otherwise below is invalid
  358. data[0] = data[1]
  359. assert result[0] != result[1] # i.e. not the same object/view
  360. @pytest.mark.parametrize("periods", [1, -2])
  361. def test_diff(self, data, periods):
  362. data = data[:5]
  363. if is_bool_dtype(data.dtype):
  364. op = operator.xor
  365. else:
  366. op = operator.sub
  367. try:
  368. # does this array implement ops?
  369. op(data, data)
  370. except Exception:
  371. pytest.skip(f"{type(data)} does not support diff")
  372. s = pd.Series(data)
  373. result = s.diff(periods)
  374. expected = pd.Series(op(data, data.shift(periods)))
  375. tm.assert_series_equal(result, expected)
  376. df = pd.DataFrame({"A": data, "B": [1.0] * 5})
  377. result = df.diff(periods)
  378. if periods == 1:
  379. b = [np.nan, 0, 0, 0, 0]
  380. else:
  381. b = [0, 0, 0, np.nan, np.nan]
  382. expected = pd.DataFrame({"A": expected, "B": b})
  383. tm.assert_frame_equal(result, expected)
  384. @pytest.mark.parametrize(
  385. "periods, indices",
  386. [[-4, [-1, -1]], [-1, [1, -1]], [0, [0, 1]], [1, [-1, 0]], [4, [-1, -1]]],
  387. )
  388. def test_shift_non_empty_array(self, data, periods, indices):
  389. # https://github.com/pandas-dev/pandas/issues/23911
  390. subset = data[:2]
  391. result = subset.shift(periods)
  392. expected = subset.take(indices, allow_fill=True)
  393. tm.assert_extension_array_equal(result, expected)
  394. @pytest.mark.parametrize("periods", [-4, -1, 0, 1, 4])
  395. def test_shift_empty_array(self, data, periods):
  396. # https://github.com/pandas-dev/pandas/issues/23911
  397. empty = data[:0]
  398. result = empty.shift(periods)
  399. expected = empty
  400. tm.assert_extension_array_equal(result, expected)
  401. def test_shift_zero_copies(self, data):
  402. # GH#31502
  403. result = data.shift(0)
  404. assert result is not data
  405. result = data[:0].shift(2)
  406. assert result is not data
  407. def test_shift_fill_value(self, data):
  408. arr = data[:4]
  409. fill_value = data[0]
  410. result = arr.shift(1, fill_value=fill_value)
  411. expected = data.take([0, 0, 1, 2])
  412. tm.assert_extension_array_equal(result, expected)
  413. result = arr.shift(-2, fill_value=fill_value)
  414. expected = data.take([2, 3, 0, 0])
  415. tm.assert_extension_array_equal(result, expected)
  416. def test_not_hashable(self, data):
  417. # We are in general mutable, so not hashable
  418. with pytest.raises(TypeError, match="unhashable type"):
  419. hash(data)
  420. def test_hash_pandas_object_works(self, data, as_frame):
  421. # https://github.com/pandas-dev/pandas/issues/23066
  422. data = pd.Series(data)
  423. if as_frame:
  424. data = data.to_frame()
  425. a = pd.util.hash_pandas_object(data)
  426. b = pd.util.hash_pandas_object(data)
  427. tm.assert_equal(a, b)
  428. def test_searchsorted(self, data_for_sorting, as_series):
  429. if data_for_sorting.dtype._is_boolean:
  430. return self._test_searchsorted_bool_dtypes(data_for_sorting, as_series)
  431. b, c, a = data_for_sorting
  432. arr = data_for_sorting.take([2, 0, 1]) # to get [a, b, c]
  433. if as_series:
  434. arr = pd.Series(arr)
  435. assert arr.searchsorted(a) == 0
  436. assert arr.searchsorted(a, side="right") == 1
  437. assert arr.searchsorted(b) == 1
  438. assert arr.searchsorted(b, side="right") == 2
  439. assert arr.searchsorted(c) == 2
  440. assert arr.searchsorted(c, side="right") == 3
  441. result = arr.searchsorted(arr.take([0, 2]))
  442. expected = np.array([0, 2], dtype=np.intp)
  443. tm.assert_numpy_array_equal(result, expected)
  444. # sorter
  445. sorter = np.array([1, 2, 0])
  446. assert data_for_sorting.searchsorted(a, sorter=sorter) == 0
  447. def _test_searchsorted_bool_dtypes(self, data_for_sorting, as_series):
  448. # We call this from test_searchsorted in cases where we have a
  449. # boolean-like dtype. The non-bool test assumes we have more than 2
  450. # unique values.
  451. dtype = data_for_sorting.dtype
  452. data_for_sorting = pd.array([True, False], dtype=dtype)
  453. b, a = data_for_sorting
  454. arr = type(data_for_sorting)._from_sequence([a, b])
  455. if as_series:
  456. arr = pd.Series(arr)
  457. assert arr.searchsorted(a) == 0
  458. assert arr.searchsorted(a, side="right") == 1
  459. assert arr.searchsorted(b) == 1
  460. assert arr.searchsorted(b, side="right") == 2
  461. result = arr.searchsorted(arr.take([0, 1]))
  462. expected = np.array([0, 1], dtype=np.intp)
  463. tm.assert_numpy_array_equal(result, expected)
  464. # sorter
  465. sorter = np.array([1, 0])
  466. assert data_for_sorting.searchsorted(a, sorter=sorter) == 0
  467. def test_where_series(self, data, na_value, as_frame):
  468. assert data[0] != data[1]
  469. cls = type(data)
  470. a, b = data[:2]
  471. orig = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype))
  472. ser = orig.copy()
  473. cond = np.array([True, True, False, False])
  474. if as_frame:
  475. ser = ser.to_frame(name="a")
  476. cond = cond.reshape(-1, 1)
  477. result = ser.where(cond)
  478. expected = pd.Series(
  479. cls._from_sequence([a, a, na_value, na_value], dtype=data.dtype)
  480. )
  481. if as_frame:
  482. expected = expected.to_frame(name="a")
  483. tm.assert_equal(result, expected)
  484. ser.mask(~cond, inplace=True)
  485. tm.assert_equal(ser, expected)
  486. # array other
  487. ser = orig.copy()
  488. if as_frame:
  489. ser = ser.to_frame(name="a")
  490. cond = np.array([True, False, True, True])
  491. other = cls._from_sequence([a, b, a, b], dtype=data.dtype)
  492. if as_frame:
  493. other = pd.DataFrame({"a": other})
  494. cond = pd.DataFrame({"a": cond})
  495. result = ser.where(cond, other)
  496. expected = pd.Series(cls._from_sequence([a, b, b, b], dtype=data.dtype))
  497. if as_frame:
  498. expected = expected.to_frame(name="a")
  499. tm.assert_equal(result, expected)
  500. ser.mask(~cond, other, inplace=True)
  501. tm.assert_equal(ser, expected)
  502. @pytest.mark.parametrize("repeats", [0, 1, 2, [1, 2, 3]])
  503. def test_repeat(self, data, repeats, as_series, use_numpy):
  504. arr = type(data)._from_sequence(data[:3], dtype=data.dtype)
  505. if as_series:
  506. arr = pd.Series(arr)
  507. result = np.repeat(arr, repeats) if use_numpy else arr.repeat(repeats)
  508. repeats = [repeats] * 3 if isinstance(repeats, int) else repeats
  509. expected = [x for x, n in zip(arr, repeats) for _ in range(n)]
  510. expected = type(data)._from_sequence(expected, dtype=data.dtype)
  511. if as_series:
  512. expected = pd.Series(expected, index=arr.index.repeat(repeats))
  513. tm.assert_equal(result, expected)
  514. @pytest.mark.parametrize(
  515. "repeats, kwargs, error, msg",
  516. [
  517. (2, {"axis": 1}, ValueError, "axis"),
  518. (-1, {}, ValueError, "negative"),
  519. ([1, 2], {}, ValueError, "shape"),
  520. (2, {"foo": "bar"}, TypeError, "'foo'"),
  521. ],
  522. )
  523. def test_repeat_raises(self, data, repeats, kwargs, error, msg, use_numpy):
  524. with pytest.raises(error, match=msg):
  525. if use_numpy:
  526. np.repeat(data, repeats, **kwargs)
  527. else:
  528. data.repeat(repeats, **kwargs)
  529. def test_delete(self, data):
  530. result = data.delete(0)
  531. expected = data[1:]
  532. tm.assert_extension_array_equal(result, expected)
  533. result = data.delete([1, 3])
  534. expected = data._concat_same_type([data[[0]], data[[2]], data[4:]])
  535. tm.assert_extension_array_equal(result, expected)
  536. def test_insert(self, data):
  537. # insert at the beginning
  538. result = data[1:].insert(0, data[0])
  539. tm.assert_extension_array_equal(result, data)
  540. result = data[1:].insert(-len(data[1:]), data[0])
  541. tm.assert_extension_array_equal(result, data)
  542. # insert at the middle
  543. result = data[:-1].insert(4, data[-1])
  544. taker = np.arange(len(data))
  545. taker[5:] = taker[4:-1]
  546. taker[4] = len(data) - 1
  547. expected = data.take(taker)
  548. tm.assert_extension_array_equal(result, expected)
  549. def test_insert_invalid(self, data, invalid_scalar):
  550. item = invalid_scalar
  551. with pytest.raises((TypeError, ValueError)):
  552. data.insert(0, item)
  553. with pytest.raises((TypeError, ValueError)):
  554. data.insert(4, item)
  555. with pytest.raises((TypeError, ValueError)):
  556. data.insert(len(data) - 1, item)
  557. def test_insert_invalid_loc(self, data):
  558. ub = len(data)
  559. with pytest.raises(IndexError):
  560. data.insert(ub + 1, data[0])
  561. with pytest.raises(IndexError):
  562. data.insert(-ub - 1, data[0])
  563. with pytest.raises(TypeError):
  564. # we expect TypeError here instead of IndexError to match np.insert
  565. data.insert(1.5, data[0])
  566. @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame])
  567. def test_equals(self, data, na_value, as_series, box):
  568. data2 = type(data)._from_sequence([data[0]] * len(data), dtype=data.dtype)
  569. data_na = type(data)._from_sequence([na_value] * len(data), dtype=data.dtype)
  570. data = tm.box_expected(data, box, transpose=False)
  571. data2 = tm.box_expected(data2, box, transpose=False)
  572. data_na = tm.box_expected(data_na, box, transpose=False)
  573. # we are asserting with `is True/False` explicitly, to test that the
  574. # result is an actual Python bool, and not something "truthy"
  575. assert data.equals(data) is True
  576. assert data.equals(data.copy()) is True
  577. # unequal other data
  578. assert data.equals(data2) is False
  579. assert data.equals(data_na) is False
  580. # different length
  581. assert data[:2].equals(data[:3]) is False
  582. # empty are equal
  583. assert data[:0].equals(data[:0]) is True
  584. # other types
  585. assert data.equals(None) is False
  586. assert data[[0]].equals(data[0]) is False
  587. def test_equals_same_data_different_object(self, data):
  588. # https://github.com/pandas-dev/pandas/issues/34660
  589. assert pd.Series(data).equals(pd.Series(data))