test_masked.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417
  1. """
  2. This file contains a minimal set of tests for compliance with the extension
  3. array interface test suite, and should contain no other tests.
  4. The test suite for the full functionality of the array is located in
  5. `pandas/tests/arrays/`.
  6. The tests in this file are inherited from the BaseExtensionTests, and only
  7. minimal tweaks should be applied to get the tests passing (by overwriting a
  8. parent method).
  9. Additional tests should either be added to one of the BaseExtensionTests
  10. classes (if they are relevant for the extension interface for all dtypes), or
  11. be added to the array-specific tests in `pandas/tests/arrays/`.
  12. """
  13. import warnings
  14. import numpy as np
  15. import pytest
  16. from pandas.compat import (
  17. IS64,
  18. is_platform_windows,
  19. )
  20. from pandas.compat.numpy import np_version_gt2
  21. from pandas.core.dtypes.common import (
  22. is_float_dtype,
  23. is_signed_integer_dtype,
  24. is_unsigned_integer_dtype,
  25. )
  26. import pandas as pd
  27. import pandas._testing as tm
  28. from pandas.core.arrays.boolean import BooleanDtype
  29. from pandas.core.arrays.floating import (
  30. Float32Dtype,
  31. Float64Dtype,
  32. )
  33. from pandas.core.arrays.integer import (
  34. Int8Dtype,
  35. Int16Dtype,
  36. Int32Dtype,
  37. Int64Dtype,
  38. UInt8Dtype,
  39. UInt16Dtype,
  40. UInt32Dtype,
  41. UInt64Dtype,
  42. )
  43. from pandas.tests.extension import base
  44. is_windows_or_32bit = (is_platform_windows() and not np_version_gt2) or not IS64
  45. pytestmark = [
  46. pytest.mark.filterwarnings(
  47. "ignore:invalid value encountered in divide:RuntimeWarning"
  48. ),
  49. pytest.mark.filterwarnings("ignore:Mean of empty slice:RuntimeWarning"),
  50. # overflow only relevant for Floating dtype cases cases
  51. pytest.mark.filterwarnings("ignore:overflow encountered in reduce:RuntimeWarning"),
  52. ]
  53. def make_data():
  54. return list(range(1, 9)) + [pd.NA] + list(range(10, 98)) + [pd.NA] + [99, 100]
  55. def make_float_data():
  56. return (
  57. list(np.arange(0.1, 0.9, 0.1))
  58. + [pd.NA]
  59. + list(np.arange(1, 9.8, 0.1))
  60. + [pd.NA]
  61. + [9.9, 10.0]
  62. )
  63. def make_bool_data():
  64. return [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False]
  65. @pytest.fixture(
  66. params=[
  67. Int8Dtype,
  68. Int16Dtype,
  69. Int32Dtype,
  70. Int64Dtype,
  71. UInt8Dtype,
  72. UInt16Dtype,
  73. UInt32Dtype,
  74. UInt64Dtype,
  75. Float32Dtype,
  76. Float64Dtype,
  77. BooleanDtype,
  78. ]
  79. )
  80. def dtype(request):
  81. return request.param()
  82. @pytest.fixture
  83. def data(dtype):
  84. if dtype.kind == "f":
  85. data = make_float_data()
  86. elif dtype.kind == "b":
  87. data = make_bool_data()
  88. else:
  89. data = make_data()
  90. return pd.array(data, dtype=dtype)
  91. @pytest.fixture
  92. def data_for_twos(dtype):
  93. if dtype.kind == "b":
  94. return pd.array(np.ones(100), dtype=dtype)
  95. return pd.array(np.ones(100) * 2, dtype=dtype)
  96. @pytest.fixture
  97. def data_missing(dtype):
  98. if dtype.kind == "f":
  99. return pd.array([pd.NA, 0.1], dtype=dtype)
  100. elif dtype.kind == "b":
  101. return pd.array([np.nan, True], dtype=dtype)
  102. return pd.array([pd.NA, 1], dtype=dtype)
  103. @pytest.fixture
  104. def data_for_sorting(dtype):
  105. if dtype.kind == "f":
  106. return pd.array([0.1, 0.2, 0.0], dtype=dtype)
  107. elif dtype.kind == "b":
  108. return pd.array([True, True, False], dtype=dtype)
  109. return pd.array([1, 2, 0], dtype=dtype)
  110. @pytest.fixture
  111. def data_missing_for_sorting(dtype):
  112. if dtype.kind == "f":
  113. return pd.array([0.1, pd.NA, 0.0], dtype=dtype)
  114. elif dtype.kind == "b":
  115. return pd.array([True, np.nan, False], dtype=dtype)
  116. return pd.array([1, pd.NA, 0], dtype=dtype)
  117. @pytest.fixture
  118. def na_cmp():
  119. # we are pd.NA
  120. return lambda x, y: x is pd.NA and y is pd.NA
  121. @pytest.fixture
  122. def data_for_grouping(dtype):
  123. if dtype.kind == "f":
  124. b = 0.1
  125. a = 0.0
  126. c = 0.2
  127. elif dtype.kind == "b":
  128. b = True
  129. a = False
  130. c = b
  131. else:
  132. b = 1
  133. a = 0
  134. c = 2
  135. na = pd.NA
  136. return pd.array([b, b, na, na, a, a, b, c], dtype=dtype)
  137. class TestMaskedArrays(base.ExtensionTests):
  138. @pytest.mark.parametrize("na_action", [None, "ignore"])
  139. def test_map(self, data_missing, na_action):
  140. result = data_missing.map(lambda x: x, na_action=na_action)
  141. if data_missing.dtype == Float32Dtype():
  142. # map roundtrips through objects, which converts to float64
  143. expected = data_missing.to_numpy(dtype="float64", na_value=np.nan)
  144. else:
  145. expected = data_missing.to_numpy()
  146. tm.assert_numpy_array_equal(result, expected)
  147. def test_map_na_action_ignore(self, data_missing_for_sorting):
  148. zero = data_missing_for_sorting[2]
  149. result = data_missing_for_sorting.map(lambda x: zero, na_action="ignore")
  150. if data_missing_for_sorting.dtype.kind == "b":
  151. expected = np.array([False, pd.NA, False], dtype=object)
  152. else:
  153. expected = np.array([zero, np.nan, zero])
  154. tm.assert_numpy_array_equal(result, expected)
  155. def _get_expected_exception(self, op_name, obj, other):
  156. try:
  157. dtype = tm.get_dtype(obj)
  158. except AttributeError:
  159. # passed arguments reversed
  160. dtype = tm.get_dtype(other)
  161. if dtype.kind == "b":
  162. if op_name.strip("_").lstrip("r") in ["pow", "truediv", "floordiv"]:
  163. # match behavior with non-masked bool dtype
  164. return NotImplementedError
  165. elif op_name in ["__sub__", "__rsub__"]:
  166. # exception message would include "numpy boolean subtract""
  167. return TypeError
  168. return None
  169. return None
  170. def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
  171. sdtype = tm.get_dtype(obj)
  172. expected = pointwise_result
  173. if op_name in ("eq", "ne", "le", "ge", "lt", "gt"):
  174. return expected.astype("boolean")
  175. if sdtype.kind in "iu":
  176. if op_name in ("__rtruediv__", "__truediv__", "__div__"):
  177. with warnings.catch_warnings():
  178. warnings.filterwarnings(
  179. "ignore",
  180. "Downcasting object dtype arrays",
  181. category=FutureWarning,
  182. )
  183. filled = expected.fillna(np.nan)
  184. expected = filled.astype("Float64")
  185. else:
  186. # combine method result in 'biggest' (int64) dtype
  187. expected = expected.astype(sdtype)
  188. elif sdtype.kind == "b":
  189. if op_name in (
  190. "__floordiv__",
  191. "__rfloordiv__",
  192. "__pow__",
  193. "__rpow__",
  194. "__mod__",
  195. "__rmod__",
  196. ):
  197. # combine keeps boolean type
  198. expected = expected.astype("Int8")
  199. elif op_name in ("__truediv__", "__rtruediv__"):
  200. # combine with bools does not generate the correct result
  201. # (numpy behaviour for div is to regard the bools as numeric)
  202. op = self.get_op_from_name(op_name)
  203. expected = self._combine(obj.astype(float), other, op)
  204. expected = expected.astype("Float64")
  205. if op_name == "__rpow__":
  206. # for rpow, combine does not propagate NaN
  207. result = getattr(obj, op_name)(other)
  208. expected[result.isna()] = np.nan
  209. else:
  210. # combine method result in 'biggest' (float64) dtype
  211. expected = expected.astype(sdtype)
  212. return expected
  213. def test_divmod_series_array(self, data, data_for_twos, request):
  214. if data.dtype.kind == "b":
  215. mark = pytest.mark.xfail(
  216. reason="Inconsistency between floordiv and divmod; we raise for "
  217. "floordiv but not for divmod. This matches what we do for "
  218. "non-masked bool dtype."
  219. )
  220. request.applymarker(mark)
  221. super().test_divmod_series_array(data, data_for_twos)
  222. def test_combine_le(self, data_repeated):
  223. # TODO: patching self is a bad pattern here
  224. orig_data1, orig_data2 = data_repeated(2)
  225. if orig_data1.dtype.kind == "b":
  226. self._combine_le_expected_dtype = "boolean"
  227. else:
  228. # TODO: can we make this boolean?
  229. self._combine_le_expected_dtype = object
  230. super().test_combine_le(data_repeated)
  231. def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
  232. if op_name in ["any", "all"] and ser.dtype.kind != "b":
  233. pytest.skip(reason="Tested in tests/reductions/test_reductions.py")
  234. return True
  235. def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
  236. # overwrite to ensure pd.NA is tested instead of np.nan
  237. # https://github.com/pandas-dev/pandas/issues/30958
  238. cmp_dtype = "int64"
  239. if ser.dtype.kind == "f":
  240. # Item "dtype[Any]" of "Union[dtype[Any], ExtensionDtype]" has
  241. # no attribute "numpy_dtype"
  242. cmp_dtype = ser.dtype.numpy_dtype # type: ignore[union-attr]
  243. elif ser.dtype.kind == "b":
  244. if op_name in ["min", "max"]:
  245. cmp_dtype = "bool"
  246. # TODO: prod with integer dtypes does *not* match the result we would
  247. # get if we used object for cmp_dtype. In that cae the object result
  248. # is a large integer while the non-object case overflows and returns 0
  249. alt = ser.dropna().astype(cmp_dtype)
  250. if op_name == "count":
  251. result = getattr(ser, op_name)()
  252. expected = getattr(alt, op_name)()
  253. else:
  254. result = getattr(ser, op_name)(skipna=skipna)
  255. expected = getattr(alt, op_name)(skipna=skipna)
  256. if not skipna and ser.isna().any() and op_name not in ["any", "all"]:
  257. expected = pd.NA
  258. tm.assert_almost_equal(result, expected)
  259. def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
  260. if is_float_dtype(arr.dtype):
  261. cmp_dtype = arr.dtype.name
  262. elif op_name in ["mean", "median", "var", "std", "skew"]:
  263. cmp_dtype = "Float64"
  264. elif op_name in ["max", "min"]:
  265. cmp_dtype = arr.dtype.name
  266. elif arr.dtype in ["Int64", "UInt64"]:
  267. cmp_dtype = arr.dtype.name
  268. elif is_signed_integer_dtype(arr.dtype):
  269. # TODO: Why does Window Numpy 2.0 dtype depend on skipna?
  270. cmp_dtype = (
  271. "Int32"
  272. if (is_platform_windows() and (not np_version_gt2 or not skipna))
  273. or not IS64
  274. else "Int64"
  275. )
  276. elif is_unsigned_integer_dtype(arr.dtype):
  277. cmp_dtype = (
  278. "UInt32"
  279. if (is_platform_windows() and (not np_version_gt2 or not skipna))
  280. or not IS64
  281. else "UInt64"
  282. )
  283. elif arr.dtype.kind == "b":
  284. if op_name in ["mean", "median", "var", "std", "skew"]:
  285. cmp_dtype = "Float64"
  286. elif op_name in ["min", "max"]:
  287. cmp_dtype = "boolean"
  288. elif op_name in ["sum", "prod"]:
  289. cmp_dtype = (
  290. "Int32"
  291. if (is_platform_windows() and (not np_version_gt2 or not skipna))
  292. or not IS64
  293. else "Int64"
  294. )
  295. else:
  296. raise TypeError("not supposed to reach this")
  297. else:
  298. raise TypeError("not supposed to reach this")
  299. return cmp_dtype
  300. def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
  301. return True
  302. def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool):
  303. # overwrite to ensure pd.NA is tested instead of np.nan
  304. # https://github.com/pandas-dev/pandas/issues/30958
  305. length = 64
  306. if is_windows_or_32bit:
  307. # Item "ExtensionDtype" of "Union[dtype[Any], ExtensionDtype]" has
  308. # no attribute "itemsize"
  309. if not ser.dtype.itemsize == 8: # type: ignore[union-attr]
  310. length = 32
  311. if ser.dtype.name.startswith("U"):
  312. expected_dtype = f"UInt{length}"
  313. elif ser.dtype.name.startswith("I"):
  314. expected_dtype = f"Int{length}"
  315. elif ser.dtype.name.startswith("F"):
  316. # Incompatible types in assignment (expression has type
  317. # "Union[dtype[Any], ExtensionDtype]", variable has type "str")
  318. expected_dtype = ser.dtype # type: ignore[assignment]
  319. elif ser.dtype.kind == "b":
  320. if op_name in ("cummin", "cummax"):
  321. expected_dtype = "boolean"
  322. else:
  323. expected_dtype = f"Int{length}"
  324. if expected_dtype == "Float32" and op_name == "cumprod" and skipna:
  325. # TODO: xfail?
  326. pytest.skip(
  327. f"Float32 precision lead to large differences with op {op_name} "
  328. f"and skipna={skipna}"
  329. )
  330. if op_name == "cumsum":
  331. result = getattr(ser, op_name)(skipna=skipna)
  332. expected = pd.Series(
  333. pd.array(
  334. getattr(ser.astype("float64"), op_name)(skipna=skipna),
  335. dtype=expected_dtype,
  336. )
  337. )
  338. tm.assert_series_equal(result, expected)
  339. elif op_name in ["cummax", "cummin"]:
  340. result = getattr(ser, op_name)(skipna=skipna)
  341. expected = pd.Series(
  342. pd.array(
  343. getattr(ser.astype("float64"), op_name)(skipna=skipna),
  344. dtype=ser.dtype,
  345. )
  346. )
  347. tm.assert_series_equal(result, expected)
  348. elif op_name == "cumprod":
  349. result = getattr(ser[:12], op_name)(skipna=skipna)
  350. expected = pd.Series(
  351. pd.array(
  352. getattr(ser[:12].astype("float64"), op_name)(skipna=skipna),
  353. dtype=expected_dtype,
  354. )
  355. )
  356. tm.assert_series_equal(result, expected)
  357. else:
  358. raise NotImplementedError(f"{op_name} not supported")
  359. class Test2DCompat(base.Dim2CompatTests):
  360. pass