test_array.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511
  1. import re
  2. import numpy as np
  3. import pytest
  4. from pandas._libs.sparse import IntIndex
  5. from pandas.compat.numpy import np_version_gt2
  6. import pandas as pd
  7. from pandas import (
  8. SparseDtype,
  9. isna,
  10. )
  11. import pandas._testing as tm
  12. from pandas.core.arrays.sparse import SparseArray
  13. @pytest.fixture
  14. def arr_data():
  15. """Fixture returning numpy array with valid and missing entries"""
  16. return np.array([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6])
  17. @pytest.fixture
  18. def arr(arr_data):
  19. """Fixture returning SparseArray from 'arr_data'"""
  20. return SparseArray(arr_data)
  21. @pytest.fixture
  22. def zarr():
  23. """Fixture returning SparseArray with integer entries and 'fill_value=0'"""
  24. return SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0)
  25. class TestSparseArray:
  26. @pytest.mark.parametrize("fill_value", [0, None, np.nan])
  27. def test_shift_fill_value(self, fill_value):
  28. # GH #24128
  29. sparse = SparseArray(np.array([1, 0, 0, 3, 0]), fill_value=8.0)
  30. res = sparse.shift(1, fill_value=fill_value)
  31. if isna(fill_value):
  32. fill_value = res.dtype.na_value
  33. exp = SparseArray(np.array([fill_value, 1, 0, 0, 3]), fill_value=8.0)
  34. tm.assert_sp_array_equal(res, exp)
  35. def test_set_fill_value(self):
  36. arr = SparseArray([1.0, np.nan, 2.0], fill_value=np.nan)
  37. arr.fill_value = 2
  38. assert arr.fill_value == 2
  39. arr = SparseArray([1, 0, 2], fill_value=0, dtype=np.int64)
  40. arr.fill_value = 2
  41. assert arr.fill_value == 2
  42. msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated"
  43. with tm.assert_produces_warning(FutureWarning, match=msg):
  44. arr.fill_value = 3.1
  45. assert arr.fill_value == 3.1
  46. arr.fill_value = np.nan
  47. assert np.isnan(arr.fill_value)
  48. arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool_)
  49. arr.fill_value = True
  50. assert arr.fill_value is True
  51. with tm.assert_produces_warning(FutureWarning, match=msg):
  52. arr.fill_value = 0
  53. arr.fill_value = np.nan
  54. assert np.isnan(arr.fill_value)
  55. @pytest.mark.parametrize("val", [[1, 2, 3], np.array([1, 2]), (1, 2, 3)])
  56. def test_set_fill_invalid_non_scalar(self, val):
  57. arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool_)
  58. msg = "fill_value must be a scalar"
  59. with pytest.raises(ValueError, match=msg):
  60. arr.fill_value = val
  61. def test_copy(self, arr):
  62. arr2 = arr.copy()
  63. assert arr2.sp_values is not arr.sp_values
  64. assert arr2.sp_index is arr.sp_index
  65. def test_values_asarray(self, arr_data, arr):
  66. tm.assert_almost_equal(arr.to_dense(), arr_data)
  67. @pytest.mark.parametrize(
  68. "data,shape,dtype",
  69. [
  70. ([0, 0, 0, 0, 0], (5,), None),
  71. ([], (0,), None),
  72. ([0], (1,), None),
  73. (["A", "A", np.nan, "B"], (4,), object),
  74. ],
  75. )
  76. def test_shape(self, data, shape, dtype):
  77. # GH 21126
  78. out = SparseArray(data, dtype=dtype)
  79. assert out.shape == shape
  80. @pytest.mark.parametrize(
  81. "vals",
  82. [
  83. [np.nan, np.nan, np.nan, np.nan, np.nan],
  84. [1, np.nan, np.nan, 3, np.nan],
  85. [1, np.nan, 0, 3, 0],
  86. ],
  87. )
  88. @pytest.mark.parametrize("fill_value", [None, 0])
  89. def test_dense_repr(self, vals, fill_value):
  90. vals = np.array(vals)
  91. arr = SparseArray(vals, fill_value=fill_value)
  92. res = arr.to_dense()
  93. tm.assert_numpy_array_equal(res, vals)
  94. @pytest.mark.parametrize("fix", ["arr", "zarr"])
  95. def test_pickle(self, fix, request):
  96. obj = request.getfixturevalue(fix)
  97. unpickled = tm.round_trip_pickle(obj)
  98. tm.assert_sp_array_equal(unpickled, obj)
  99. def test_generator_warnings(self):
  100. sp_arr = SparseArray([1, 2, 3])
  101. with tm.assert_produces_warning(None):
  102. for _ in sp_arr:
  103. pass
  104. def test_where_retain_fill_value(self):
  105. # GH#45691 don't lose fill_value on _where
  106. arr = SparseArray([np.nan, 1.0], fill_value=0)
  107. mask = np.array([True, False])
  108. res = arr._where(~mask, 1)
  109. exp = SparseArray([1, 1.0], fill_value=0)
  110. tm.assert_sp_array_equal(res, exp)
  111. ser = pd.Series(arr)
  112. res = ser.where(~mask, 1)
  113. tm.assert_series_equal(res, pd.Series(exp))
  114. def test_fillna(self):
  115. s = SparseArray([1, np.nan, np.nan, 3, np.nan])
  116. res = s.fillna(-1)
  117. exp = SparseArray([1, -1, -1, 3, -1], fill_value=-1, dtype=np.float64)
  118. tm.assert_sp_array_equal(res, exp)
  119. s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0)
  120. res = s.fillna(-1)
  121. exp = SparseArray([1, -1, -1, 3, -1], fill_value=0, dtype=np.float64)
  122. tm.assert_sp_array_equal(res, exp)
  123. s = SparseArray([1, np.nan, 0, 3, 0])
  124. res = s.fillna(-1)
  125. exp = SparseArray([1, -1, 0, 3, 0], fill_value=-1, dtype=np.float64)
  126. tm.assert_sp_array_equal(res, exp)
  127. s = SparseArray([1, np.nan, 0, 3, 0], fill_value=0)
  128. res = s.fillna(-1)
  129. exp = SparseArray([1, -1, 0, 3, 0], fill_value=0, dtype=np.float64)
  130. tm.assert_sp_array_equal(res, exp)
  131. s = SparseArray([np.nan, np.nan, np.nan, np.nan])
  132. res = s.fillna(-1)
  133. exp = SparseArray([-1, -1, -1, -1], fill_value=-1, dtype=np.float64)
  134. tm.assert_sp_array_equal(res, exp)
  135. s = SparseArray([np.nan, np.nan, np.nan, np.nan], fill_value=0)
  136. res = s.fillna(-1)
  137. exp = SparseArray([-1, -1, -1, -1], fill_value=0, dtype=np.float64)
  138. tm.assert_sp_array_equal(res, exp)
  139. # float dtype's fill_value is np.nan, replaced by -1
  140. s = SparseArray([0.0, 0.0, 0.0, 0.0])
  141. res = s.fillna(-1)
  142. exp = SparseArray([0.0, 0.0, 0.0, 0.0], fill_value=-1)
  143. tm.assert_sp_array_equal(res, exp)
  144. # int dtype shouldn't have missing. No changes.
  145. s = SparseArray([0, 0, 0, 0])
  146. assert s.dtype == SparseDtype(np.int64)
  147. assert s.fill_value == 0
  148. res = s.fillna(-1)
  149. tm.assert_sp_array_equal(res, s)
  150. s = SparseArray([0, 0, 0, 0], fill_value=0)
  151. assert s.dtype == SparseDtype(np.int64)
  152. assert s.fill_value == 0
  153. res = s.fillna(-1)
  154. exp = SparseArray([0, 0, 0, 0], fill_value=0)
  155. tm.assert_sp_array_equal(res, exp)
  156. # fill_value can be nan if there is no missing hole.
  157. # only fill_value will be changed
  158. s = SparseArray([0, 0, 0, 0], fill_value=np.nan)
  159. assert s.dtype == SparseDtype(np.int64, fill_value=np.nan)
  160. assert np.isnan(s.fill_value)
  161. res = s.fillna(-1)
  162. exp = SparseArray([0, 0, 0, 0], fill_value=-1)
  163. tm.assert_sp_array_equal(res, exp)
  164. def test_fillna_overlap(self):
  165. s = SparseArray([1, np.nan, np.nan, 3, np.nan])
  166. # filling with existing value doesn't replace existing value with
  167. # fill_value, i.e. existing 3 remains in sp_values
  168. res = s.fillna(3)
  169. exp = np.array([1, 3, 3, 3, 3], dtype=np.float64)
  170. tm.assert_numpy_array_equal(res.to_dense(), exp)
  171. s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0)
  172. res = s.fillna(3)
  173. exp = SparseArray([1, 3, 3, 3, 3], fill_value=0, dtype=np.float64)
  174. tm.assert_sp_array_equal(res, exp)
  175. def test_nonzero(self):
  176. # Tests regression #21172.
  177. sa = SparseArray([float("nan"), float("nan"), 1, 0, 0, 2, 0, 0, 0, 3, 0, 0])
  178. expected = np.array([2, 5, 9], dtype=np.int32)
  179. (result,) = sa.nonzero()
  180. tm.assert_numpy_array_equal(expected, result)
  181. sa = SparseArray([0, 0, 1, 0, 0, 2, 0, 0, 0, 3, 0, 0])
  182. (result,) = sa.nonzero()
  183. tm.assert_numpy_array_equal(expected, result)
  184. class TestSparseArrayAnalytics:
  185. @pytest.mark.parametrize(
  186. "data,expected",
  187. [
  188. (
  189. np.array([1, 2, 3, 4, 5], dtype=float), # non-null data
  190. SparseArray(np.array([1.0, 3.0, 6.0, 10.0, 15.0])),
  191. ),
  192. (
  193. np.array([1, 2, np.nan, 4, 5], dtype=float), # null data
  194. SparseArray(np.array([1.0, 3.0, np.nan, 7.0, 12.0])),
  195. ),
  196. ],
  197. )
  198. @pytest.mark.parametrize("numpy", [True, False])
  199. def test_cumsum(self, data, expected, numpy):
  200. cumsum = np.cumsum if numpy else lambda s: s.cumsum()
  201. out = cumsum(SparseArray(data))
  202. tm.assert_sp_array_equal(out, expected)
  203. out = cumsum(SparseArray(data, fill_value=np.nan))
  204. tm.assert_sp_array_equal(out, expected)
  205. out = cumsum(SparseArray(data, fill_value=2))
  206. tm.assert_sp_array_equal(out, expected)
  207. if numpy: # numpy compatibility checks.
  208. msg = "the 'dtype' parameter is not supported"
  209. with pytest.raises(ValueError, match=msg):
  210. np.cumsum(SparseArray(data), dtype=np.int64)
  211. msg = "the 'out' parameter is not supported"
  212. with pytest.raises(ValueError, match=msg):
  213. np.cumsum(SparseArray(data), out=out)
  214. else:
  215. axis = 1 # SparseArray currently 1-D, so only axis = 0 is valid.
  216. msg = re.escape(f"axis(={axis}) out of bounds")
  217. with pytest.raises(ValueError, match=msg):
  218. SparseArray(data).cumsum(axis=axis)
  219. def test_ufunc(self):
  220. # GH 13853 make sure ufunc is applied to fill_value
  221. sparse = SparseArray([1, np.nan, 2, np.nan, -2])
  222. result = SparseArray([1, np.nan, 2, np.nan, 2])
  223. tm.assert_sp_array_equal(abs(sparse), result)
  224. tm.assert_sp_array_equal(np.abs(sparse), result)
  225. sparse = SparseArray([1, -1, 2, -2], fill_value=1)
  226. result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, fill_value=1)
  227. tm.assert_sp_array_equal(abs(sparse), result)
  228. tm.assert_sp_array_equal(np.abs(sparse), result)
  229. sparse = SparseArray([1, -1, 2, -2], fill_value=-1)
  230. exp = SparseArray([1, 1, 2, 2], fill_value=1)
  231. tm.assert_sp_array_equal(abs(sparse), exp)
  232. tm.assert_sp_array_equal(np.abs(sparse), exp)
  233. sparse = SparseArray([1, np.nan, 2, np.nan, -2])
  234. result = SparseArray(np.sin([1, np.nan, 2, np.nan, -2]))
  235. tm.assert_sp_array_equal(np.sin(sparse), result)
  236. sparse = SparseArray([1, -1, 2, -2], fill_value=1)
  237. result = SparseArray(np.sin([1, -1, 2, -2]), fill_value=np.sin(1))
  238. tm.assert_sp_array_equal(np.sin(sparse), result)
  239. sparse = SparseArray([1, -1, 0, -2], fill_value=0)
  240. result = SparseArray(np.sin([1, -1, 0, -2]), fill_value=np.sin(0))
  241. tm.assert_sp_array_equal(np.sin(sparse), result)
  242. def test_ufunc_args(self):
  243. # GH 13853 make sure ufunc is applied to fill_value, including its arg
  244. sparse = SparseArray([1, np.nan, 2, np.nan, -2])
  245. result = SparseArray([2, np.nan, 3, np.nan, -1])
  246. tm.assert_sp_array_equal(np.add(sparse, 1), result)
  247. sparse = SparseArray([1, -1, 2, -2], fill_value=1)
  248. result = SparseArray([2, 0, 3, -1], fill_value=2)
  249. tm.assert_sp_array_equal(np.add(sparse, 1), result)
  250. sparse = SparseArray([1, -1, 0, -2], fill_value=0)
  251. result = SparseArray([2, 0, 1, -1], fill_value=1)
  252. tm.assert_sp_array_equal(np.add(sparse, 1), result)
  253. @pytest.mark.parametrize("fill_value", [0.0, np.nan])
  254. def test_modf(self, fill_value):
  255. # https://github.com/pandas-dev/pandas/issues/26946
  256. sparse = SparseArray([fill_value] * 10 + [1.1, 2.2], fill_value=fill_value)
  257. r1, r2 = np.modf(sparse)
  258. e1, e2 = np.modf(np.asarray(sparse))
  259. tm.assert_sp_array_equal(r1, SparseArray(e1, fill_value=fill_value))
  260. tm.assert_sp_array_equal(r2, SparseArray(e2, fill_value=fill_value))
  261. def test_nbytes_integer(self):
  262. arr = SparseArray([1, 0, 0, 0, 2], kind="integer")
  263. result = arr.nbytes
  264. # (2 * 8) + 2 * 4
  265. assert result == 24
  266. def test_nbytes_block(self):
  267. arr = SparseArray([1, 2, 0, 0, 0], kind="block")
  268. result = arr.nbytes
  269. # (2 * 8) + 4 + 4
  270. # sp_values, blocs, blengths
  271. assert result == 24
  272. def test_asarray_datetime64(self):
  273. s = SparseArray(pd.to_datetime(["2012", None, None, "2013"]))
  274. np.asarray(s)
  275. def test_density(self):
  276. arr = SparseArray([0, 1])
  277. assert arr.density == 0.5
  278. def test_npoints(self):
  279. arr = SparseArray([0, 1])
  280. assert arr.npoints == 1
  281. def test_setting_fill_value_fillna_still_works():
  282. # This is why letting users update fill_value / dtype is bad
  283. # astype has the same problem.
  284. arr = SparseArray([1.0, np.nan, 1.0], fill_value=0.0)
  285. arr.fill_value = np.nan
  286. result = arr.isna()
  287. # Can't do direct comparison, since the sp_index will be different
  288. # So let's convert to ndarray and check there.
  289. result = np.asarray(result)
  290. expected = np.array([False, True, False])
  291. tm.assert_numpy_array_equal(result, expected)
  292. def test_setting_fill_value_updates():
  293. arr = SparseArray([0.0, np.nan], fill_value=0)
  294. arr.fill_value = np.nan
  295. # use private constructor to get the index right
  296. # otherwise both nans would be un-stored.
  297. expected = SparseArray._simple_new(
  298. sparse_array=np.array([np.nan]),
  299. sparse_index=IntIndex(2, [1]),
  300. dtype=SparseDtype(float, np.nan),
  301. )
  302. tm.assert_sp_array_equal(arr, expected)
  303. @pytest.mark.parametrize(
  304. "arr,fill_value,loc",
  305. [
  306. ([None, 1, 2], None, 0),
  307. ([0, None, 2], None, 1),
  308. ([0, 1, None], None, 2),
  309. ([0, 1, 1, None, None], None, 3),
  310. ([1, 1, 1, 2], None, -1),
  311. ([], None, -1),
  312. ([None, 1, 0, 0, None, 2], None, 0),
  313. ([None, 1, 0, 0, None, 2], 1, 1),
  314. ([None, 1, 0, 0, None, 2], 2, 5),
  315. ([None, 1, 0, 0, None, 2], 3, -1),
  316. ([None, 0, 0, 1, 2, 1], 0, 1),
  317. ([None, 0, 0, 1, 2, 1], 1, 3),
  318. ],
  319. )
  320. def test_first_fill_value_loc(arr, fill_value, loc):
  321. result = SparseArray(arr, fill_value=fill_value)._first_fill_value_loc()
  322. assert result == loc
  323. @pytest.mark.parametrize(
  324. "arr",
  325. [
  326. [1, 2, np.nan, np.nan],
  327. [1, np.nan, 2, np.nan],
  328. [1, 2, np.nan],
  329. [np.nan, 1, 0, 0, np.nan, 2],
  330. [np.nan, 0, 0, 1, 2, 1],
  331. ],
  332. )
  333. @pytest.mark.parametrize("fill_value", [np.nan, 0, 1])
  334. def test_unique_na_fill(arr, fill_value):
  335. a = SparseArray(arr, fill_value=fill_value).unique()
  336. b = pd.Series(arr).unique()
  337. assert isinstance(a, SparseArray)
  338. a = np.asarray(a)
  339. tm.assert_numpy_array_equal(a, b)
  340. def test_unique_all_sparse():
  341. # https://github.com/pandas-dev/pandas/issues/23168
  342. arr = SparseArray([0, 0])
  343. result = arr.unique()
  344. expected = SparseArray([0])
  345. tm.assert_sp_array_equal(result, expected)
  346. def test_map():
  347. arr = SparseArray([0, 1, 2])
  348. expected = SparseArray([10, 11, 12], fill_value=10)
  349. # dict
  350. result = arr.map({0: 10, 1: 11, 2: 12})
  351. tm.assert_sp_array_equal(result, expected)
  352. # series
  353. result = arr.map(pd.Series({0: 10, 1: 11, 2: 12}))
  354. tm.assert_sp_array_equal(result, expected)
  355. # function
  356. result = arr.map(pd.Series({0: 10, 1: 11, 2: 12}))
  357. expected = SparseArray([10, 11, 12], fill_value=10)
  358. tm.assert_sp_array_equal(result, expected)
  359. def test_map_missing():
  360. arr = SparseArray([0, 1, 2])
  361. expected = SparseArray([10, 11, None], fill_value=10)
  362. result = arr.map({0: 10, 1: 11})
  363. tm.assert_sp_array_equal(result, expected)
  364. @pytest.mark.parametrize("fill_value", [np.nan, 1])
  365. def test_dropna(fill_value):
  366. # GH-28287
  367. arr = SparseArray([np.nan, 1], fill_value=fill_value)
  368. exp = SparseArray([1.0], fill_value=fill_value)
  369. tm.assert_sp_array_equal(arr.dropna(), exp)
  370. df = pd.DataFrame({"a": [0, 1], "b": arr})
  371. expected_df = pd.DataFrame({"a": [1], "b": exp}, index=pd.Index([1]))
  372. tm.assert_equal(df.dropna(), expected_df)
  373. def test_drop_duplicates_fill_value():
  374. # GH 11726
  375. df = pd.DataFrame(np.zeros((5, 5))).apply(lambda x: SparseArray(x, fill_value=0))
  376. result = df.drop_duplicates()
  377. expected = pd.DataFrame({i: SparseArray([0.0], fill_value=0) for i in range(5)})
  378. tm.assert_frame_equal(result, expected)
  379. def test_zero_sparse_column():
  380. # GH 27781
  381. df1 = pd.DataFrame({"A": SparseArray([0, 0, 0]), "B": [1, 2, 3]})
  382. df2 = pd.DataFrame({"A": SparseArray([0, 1, 0]), "B": [1, 2, 3]})
  383. result = df1.loc[df1["B"] != 2]
  384. expected = df2.loc[df2["B"] != 2]
  385. tm.assert_frame_equal(result, expected)
  386. expected = pd.DataFrame({"A": SparseArray([0, 0]), "B": [1, 3]}, index=[0, 2])
  387. tm.assert_frame_equal(result, expected)
  388. def test_array_interface(arr_data, arr):
  389. # https://github.com/pandas-dev/pandas/pull/60046
  390. result = np.asarray(arr)
  391. tm.assert_numpy_array_equal(result, arr_data)
  392. # it always gives a copy by default
  393. result_copy1 = np.asarray(arr)
  394. result_copy2 = np.asarray(arr)
  395. assert not np.may_share_memory(result_copy1, result_copy2)
  396. # or with explicit copy=True
  397. result_copy1 = np.array(arr, copy=True)
  398. result_copy2 = np.array(arr, copy=True)
  399. assert not np.may_share_memory(result_copy1, result_copy2)
  400. if not np_version_gt2:
  401. # copy=False semantics are only supported in NumPy>=2.
  402. return
  403. msg = "Starting with NumPy 2.0, the behavior of the 'copy' keyword has changed"
  404. with tm.assert_produces_warning(FutureWarning, match=msg):
  405. np.array(arr, copy=False)
  406. # except when there are actually no sparse filled values
  407. arr2 = SparseArray(np.array([1, 2, 3]))
  408. result_nocopy1 = np.array(arr2, copy=False)
  409. result_nocopy2 = np.array(arr2, copy=False)
  410. assert np.may_share_memory(result_nocopy1, result_nocopy2)