test_libsparse.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551
  1. import operator
  2. import numpy as np
  3. import pytest
  4. import pandas._libs.sparse as splib
  5. import pandas.util._test_decorators as td
  6. from pandas import Series
  7. import pandas._testing as tm
  8. from pandas.core.arrays.sparse import (
  9. BlockIndex,
  10. IntIndex,
  11. make_sparse_index,
  12. )
  13. @pytest.fixture
  14. def test_length():
  15. return 20
  16. @pytest.fixture(
  17. params=[
  18. [
  19. [0, 7, 15],
  20. [3, 5, 5],
  21. [2, 9, 14],
  22. [2, 3, 5],
  23. [2, 9, 15],
  24. [1, 3, 4],
  25. ],
  26. [
  27. [0, 5],
  28. [4, 4],
  29. [1],
  30. [4],
  31. [1],
  32. [3],
  33. ],
  34. [
  35. [0],
  36. [10],
  37. [0, 5],
  38. [3, 7],
  39. [0, 5],
  40. [3, 5],
  41. ],
  42. [
  43. [10],
  44. [5],
  45. [0, 12],
  46. [5, 3],
  47. [12],
  48. [3],
  49. ],
  50. [
  51. [0, 10],
  52. [4, 6],
  53. [5, 17],
  54. [4, 2],
  55. [],
  56. [],
  57. ],
  58. [
  59. [0],
  60. [5],
  61. [],
  62. [],
  63. [],
  64. [],
  65. ],
  66. ],
  67. ids=[
  68. "plain_case",
  69. "delete_blocks",
  70. "split_blocks",
  71. "skip_block",
  72. "no_intersect",
  73. "one_empty",
  74. ],
  75. )
  76. def cases(request):
  77. return request.param
  78. class TestSparseIndexUnion:
  79. @pytest.mark.parametrize(
  80. "xloc, xlen, yloc, ylen, eloc, elen",
  81. [
  82. [[0], [5], [5], [4], [0], [9]],
  83. [[0, 10], [5, 5], [2, 17], [5, 2], [0, 10, 17], [7, 5, 2]],
  84. [[1], [5], [3], [5], [1], [7]],
  85. [[2, 10], [4, 4], [4], [8], [2], [12]],
  86. [[0, 5], [3, 5], [0], [7], [0], [10]],
  87. [[2, 10], [4, 4], [4, 13], [8, 4], [2], [15]],
  88. [[2], [15], [4, 9, 14], [3, 2, 2], [2], [15]],
  89. [[0, 10], [3, 3], [5, 15], [2, 2], [0, 5, 10, 15], [3, 2, 3, 2]],
  90. ],
  91. )
  92. def test_index_make_union(self, xloc, xlen, yloc, ylen, eloc, elen, test_length):
  93. # Case 1
  94. # x: ----
  95. # y: ----
  96. # r: --------
  97. # Case 2
  98. # x: ----- -----
  99. # y: ----- --
  100. # Case 3
  101. # x: ------
  102. # y: -------
  103. # r: ----------
  104. # Case 4
  105. # x: ------ -----
  106. # y: -------
  107. # r: -------------
  108. # Case 5
  109. # x: --- -----
  110. # y: -------
  111. # r: -------------
  112. # Case 6
  113. # x: ------ -----
  114. # y: ------- ---
  115. # r: -------------
  116. # Case 7
  117. # x: ----------------------
  118. # y: ---- ---- ---
  119. # r: ----------------------
  120. # Case 8
  121. # x: ---- ---
  122. # y: --- ---
  123. xindex = BlockIndex(test_length, xloc, xlen)
  124. yindex = BlockIndex(test_length, yloc, ylen)
  125. bresult = xindex.make_union(yindex)
  126. assert isinstance(bresult, BlockIndex)
  127. tm.assert_numpy_array_equal(bresult.blocs, np.array(eloc, dtype=np.int32))
  128. tm.assert_numpy_array_equal(bresult.blengths, np.array(elen, dtype=np.int32))
  129. ixindex = xindex.to_int_index()
  130. iyindex = yindex.to_int_index()
  131. iresult = ixindex.make_union(iyindex)
  132. assert isinstance(iresult, IntIndex)
  133. tm.assert_numpy_array_equal(iresult.indices, bresult.to_int_index().indices)
  134. def test_int_index_make_union(self):
  135. a = IntIndex(5, np.array([0, 3, 4], dtype=np.int32))
  136. b = IntIndex(5, np.array([0, 2], dtype=np.int32))
  137. res = a.make_union(b)
  138. exp = IntIndex(5, np.array([0, 2, 3, 4], np.int32))
  139. assert res.equals(exp)
  140. a = IntIndex(5, np.array([], dtype=np.int32))
  141. b = IntIndex(5, np.array([0, 2], dtype=np.int32))
  142. res = a.make_union(b)
  143. exp = IntIndex(5, np.array([0, 2], np.int32))
  144. assert res.equals(exp)
  145. a = IntIndex(5, np.array([], dtype=np.int32))
  146. b = IntIndex(5, np.array([], dtype=np.int32))
  147. res = a.make_union(b)
  148. exp = IntIndex(5, np.array([], np.int32))
  149. assert res.equals(exp)
  150. a = IntIndex(5, np.array([0, 1, 2, 3, 4], dtype=np.int32))
  151. b = IntIndex(5, np.array([0, 1, 2, 3, 4], dtype=np.int32))
  152. res = a.make_union(b)
  153. exp = IntIndex(5, np.array([0, 1, 2, 3, 4], np.int32))
  154. assert res.equals(exp)
  155. a = IntIndex(5, np.array([0, 1], dtype=np.int32))
  156. b = IntIndex(4, np.array([0, 1], dtype=np.int32))
  157. msg = "Indices must reference same underlying length"
  158. with pytest.raises(ValueError, match=msg):
  159. a.make_union(b)
  160. class TestSparseIndexIntersect:
  161. @td.skip_if_windows
  162. def test_intersect(self, cases, test_length):
  163. xloc, xlen, yloc, ylen, eloc, elen = cases
  164. xindex = BlockIndex(test_length, xloc, xlen)
  165. yindex = BlockIndex(test_length, yloc, ylen)
  166. expected = BlockIndex(test_length, eloc, elen)
  167. longer_index = BlockIndex(test_length + 1, yloc, ylen)
  168. result = xindex.intersect(yindex)
  169. assert result.equals(expected)
  170. result = xindex.to_int_index().intersect(yindex.to_int_index())
  171. assert result.equals(expected.to_int_index())
  172. msg = "Indices must reference same underlying length"
  173. with pytest.raises(Exception, match=msg):
  174. xindex.intersect(longer_index)
  175. with pytest.raises(Exception, match=msg):
  176. xindex.to_int_index().intersect(longer_index.to_int_index())
  177. def test_intersect_empty(self):
  178. xindex = IntIndex(4, np.array([], dtype=np.int32))
  179. yindex = IntIndex(4, np.array([2, 3], dtype=np.int32))
  180. assert xindex.intersect(yindex).equals(xindex)
  181. assert yindex.intersect(xindex).equals(xindex)
  182. xindex = xindex.to_block_index()
  183. yindex = yindex.to_block_index()
  184. assert xindex.intersect(yindex).equals(xindex)
  185. assert yindex.intersect(xindex).equals(xindex)
  186. @pytest.mark.parametrize(
  187. "case",
  188. [
  189. # Argument 2 to "IntIndex" has incompatible type "ndarray[Any,
  190. # dtype[signedinteger[_32Bit]]]"; expected "Sequence[int]"
  191. IntIndex(5, np.array([1, 2], dtype=np.int32)), # type: ignore[arg-type]
  192. IntIndex(5, np.array([0, 2, 4], dtype=np.int32)), # type: ignore[arg-type]
  193. IntIndex(0, np.array([], dtype=np.int32)), # type: ignore[arg-type]
  194. IntIndex(5, np.array([], dtype=np.int32)), # type: ignore[arg-type]
  195. ],
  196. )
  197. def test_intersect_identical(self, case):
  198. assert case.intersect(case).equals(case)
  199. case = case.to_block_index()
  200. assert case.intersect(case).equals(case)
  201. class TestSparseIndexCommon:
  202. def test_int_internal(self):
  203. idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="integer")
  204. assert isinstance(idx, IntIndex)
  205. assert idx.npoints == 2
  206. tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32))
  207. idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="integer")
  208. assert isinstance(idx, IntIndex)
  209. assert idx.npoints == 0
  210. tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32))
  211. idx = make_sparse_index(
  212. 4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer"
  213. )
  214. assert isinstance(idx, IntIndex)
  215. assert idx.npoints == 4
  216. tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32))
  217. def test_block_internal(self):
  218. idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="block")
  219. assert isinstance(idx, BlockIndex)
  220. assert idx.npoints == 2
  221. tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32))
  222. tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32))
  223. idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="block")
  224. assert isinstance(idx, BlockIndex)
  225. assert idx.npoints == 0
  226. tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32))
  227. tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32))
  228. idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block")
  229. assert isinstance(idx, BlockIndex)
  230. assert idx.npoints == 4
  231. tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32))
  232. tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32))
  233. idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block")
  234. assert isinstance(idx, BlockIndex)
  235. assert idx.npoints == 3
  236. tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], dtype=np.int32))
  237. tm.assert_numpy_array_equal(idx.blengths, np.array([1, 2], dtype=np.int32))
  238. @pytest.mark.parametrize("kind", ["integer", "block"])
  239. def test_lookup(self, kind):
  240. idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind=kind)
  241. assert idx.lookup(-1) == -1
  242. assert idx.lookup(0) == -1
  243. assert idx.lookup(1) == -1
  244. assert idx.lookup(2) == 0
  245. assert idx.lookup(3) == 1
  246. assert idx.lookup(4) == -1
  247. idx = make_sparse_index(4, np.array([], dtype=np.int32), kind=kind)
  248. for i in range(-1, 5):
  249. assert idx.lookup(i) == -1
  250. idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind)
  251. assert idx.lookup(-1) == -1
  252. assert idx.lookup(0) == 0
  253. assert idx.lookup(1) == 1
  254. assert idx.lookup(2) == 2
  255. assert idx.lookup(3) == 3
  256. assert idx.lookup(4) == -1
  257. idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind)
  258. assert idx.lookup(-1) == -1
  259. assert idx.lookup(0) == 0
  260. assert idx.lookup(1) == -1
  261. assert idx.lookup(2) == 1
  262. assert idx.lookup(3) == 2
  263. assert idx.lookup(4) == -1
  264. @pytest.mark.parametrize("kind", ["integer", "block"])
  265. def test_lookup_array(self, kind):
  266. idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind=kind)
  267. res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32))
  268. exp = np.array([-1, -1, 0], dtype=np.int32)
  269. tm.assert_numpy_array_equal(res, exp)
  270. res = idx.lookup_array(np.array([4, 2, 1, 3], dtype=np.int32))
  271. exp = np.array([-1, 0, -1, 1], dtype=np.int32)
  272. tm.assert_numpy_array_equal(res, exp)
  273. idx = make_sparse_index(4, np.array([], dtype=np.int32), kind=kind)
  274. res = idx.lookup_array(np.array([-1, 0, 2, 4], dtype=np.int32))
  275. exp = np.array([-1, -1, -1, -1], dtype=np.int32)
  276. tm.assert_numpy_array_equal(res, exp)
  277. idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind)
  278. res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32))
  279. exp = np.array([-1, 0, 2], dtype=np.int32)
  280. tm.assert_numpy_array_equal(res, exp)
  281. res = idx.lookup_array(np.array([4, 2, 1, 3], dtype=np.int32))
  282. exp = np.array([-1, 2, 1, 3], dtype=np.int32)
  283. tm.assert_numpy_array_equal(res, exp)
  284. idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind)
  285. res = idx.lookup_array(np.array([2, 1, 3, 0], dtype=np.int32))
  286. exp = np.array([1, -1, 2, 0], dtype=np.int32)
  287. tm.assert_numpy_array_equal(res, exp)
  288. res = idx.lookup_array(np.array([1, 4, 2, 5], dtype=np.int32))
  289. exp = np.array([-1, -1, 1, -1], dtype=np.int32)
  290. tm.assert_numpy_array_equal(res, exp)
  291. @pytest.mark.parametrize(
  292. "idx, expected",
  293. [
  294. [0, -1],
  295. [5, 0],
  296. [7, 2],
  297. [8, -1],
  298. [9, -1],
  299. [10, -1],
  300. [11, -1],
  301. [12, 3],
  302. [17, 8],
  303. [18, -1],
  304. ],
  305. )
  306. def test_lookup_basics(self, idx, expected):
  307. bindex = BlockIndex(20, [5, 12], [3, 6])
  308. assert bindex.lookup(idx) == expected
  309. iindex = bindex.to_int_index()
  310. assert iindex.lookup(idx) == expected
  311. class TestBlockIndex:
  312. def test_block_internal(self):
  313. idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="block")
  314. assert isinstance(idx, BlockIndex)
  315. assert idx.npoints == 2
  316. tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32))
  317. tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32))
  318. idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="block")
  319. assert isinstance(idx, BlockIndex)
  320. assert idx.npoints == 0
  321. tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32))
  322. tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32))
  323. idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block")
  324. assert isinstance(idx, BlockIndex)
  325. assert idx.npoints == 4
  326. tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32))
  327. tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32))
  328. idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block")
  329. assert isinstance(idx, BlockIndex)
  330. assert idx.npoints == 3
  331. tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], dtype=np.int32))
  332. tm.assert_numpy_array_equal(idx.blengths, np.array([1, 2], dtype=np.int32))
  333. @pytest.mark.parametrize("i", [5, 10, 100, 101])
  334. def test_make_block_boundary(self, i):
  335. idx = make_sparse_index(i, np.arange(0, i, 2, dtype=np.int32), kind="block")
  336. exp = np.arange(0, i, 2, dtype=np.int32)
  337. tm.assert_numpy_array_equal(idx.blocs, exp)
  338. tm.assert_numpy_array_equal(idx.blengths, np.ones(len(exp), dtype=np.int32))
  339. def test_equals(self):
  340. index = BlockIndex(10, [0, 4], [2, 5])
  341. assert index.equals(index)
  342. assert not index.equals(BlockIndex(10, [0, 4], [2, 6]))
  343. def test_check_integrity(self):
  344. locs = []
  345. lengths = []
  346. # 0-length OK
  347. BlockIndex(0, locs, lengths)
  348. # also OK even though empty
  349. BlockIndex(1, locs, lengths)
  350. msg = "Block 0 extends beyond end"
  351. with pytest.raises(ValueError, match=msg):
  352. BlockIndex(10, [5], [10])
  353. msg = "Block 0 overlaps"
  354. with pytest.raises(ValueError, match=msg):
  355. BlockIndex(10, [2, 5], [5, 3])
  356. def test_to_int_index(self):
  357. locs = [0, 10]
  358. lengths = [4, 6]
  359. exp_inds = [0, 1, 2, 3, 10, 11, 12, 13, 14, 15]
  360. block = BlockIndex(20, locs, lengths)
  361. dense = block.to_int_index()
  362. tm.assert_numpy_array_equal(dense.indices, np.array(exp_inds, dtype=np.int32))
  363. def test_to_block_index(self):
  364. index = BlockIndex(10, [0, 5], [4, 5])
  365. assert index.to_block_index() is index
  366. class TestIntIndex:
  367. def test_check_integrity(self):
  368. # Too many indices than specified in self.length
  369. msg = "Too many indices"
  370. with pytest.raises(ValueError, match=msg):
  371. IntIndex(length=1, indices=[1, 2, 3])
  372. # No index can be negative.
  373. msg = "No index can be less than zero"
  374. with pytest.raises(ValueError, match=msg):
  375. IntIndex(length=5, indices=[1, -2, 3])
  376. # No index can be negative.
  377. msg = "No index can be less than zero"
  378. with pytest.raises(ValueError, match=msg):
  379. IntIndex(length=5, indices=[1, -2, 3])
  380. # All indices must be less than the length.
  381. msg = "All indices must be less than the length"
  382. with pytest.raises(ValueError, match=msg):
  383. IntIndex(length=5, indices=[1, 2, 5])
  384. with pytest.raises(ValueError, match=msg):
  385. IntIndex(length=5, indices=[1, 2, 6])
  386. # Indices must be strictly ascending.
  387. msg = "Indices must be strictly increasing"
  388. with pytest.raises(ValueError, match=msg):
  389. IntIndex(length=5, indices=[1, 3, 2])
  390. with pytest.raises(ValueError, match=msg):
  391. IntIndex(length=5, indices=[1, 3, 3])
  392. def test_int_internal(self):
  393. idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="integer")
  394. assert isinstance(idx, IntIndex)
  395. assert idx.npoints == 2
  396. tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32))
  397. idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="integer")
  398. assert isinstance(idx, IntIndex)
  399. assert idx.npoints == 0
  400. tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32))
  401. idx = make_sparse_index(
  402. 4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer"
  403. )
  404. assert isinstance(idx, IntIndex)
  405. assert idx.npoints == 4
  406. tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32))
  407. def test_equals(self):
  408. index = IntIndex(10, [0, 1, 2, 3, 4])
  409. assert index.equals(index)
  410. assert not index.equals(IntIndex(10, [0, 1, 2, 3]))
  411. def test_to_block_index(self, cases, test_length):
  412. xloc, xlen, yloc, ylen, _, _ = cases
  413. xindex = BlockIndex(test_length, xloc, xlen)
  414. yindex = BlockIndex(test_length, yloc, ylen)
  415. # see if survive the round trip
  416. xbindex = xindex.to_int_index().to_block_index()
  417. ybindex = yindex.to_int_index().to_block_index()
  418. assert isinstance(xbindex, BlockIndex)
  419. assert xbindex.equals(xindex)
  420. assert ybindex.equals(yindex)
  421. def test_to_int_index(self):
  422. index = IntIndex(10, [2, 3, 4, 5, 6])
  423. assert index.to_int_index() is index
  424. class TestSparseOperators:
  425. @pytest.mark.parametrize("opname", ["add", "sub", "mul", "truediv", "floordiv"])
  426. def test_op(self, opname, cases, test_length):
  427. xloc, xlen, yloc, ylen, _, _ = cases
  428. sparse_op = getattr(splib, f"sparse_{opname}_float64")
  429. python_op = getattr(operator, opname)
  430. xindex = BlockIndex(test_length, xloc, xlen)
  431. yindex = BlockIndex(test_length, yloc, ylen)
  432. xdindex = xindex.to_int_index()
  433. ydindex = yindex.to_int_index()
  434. x = np.arange(xindex.npoints) * 10.0 + 1
  435. y = np.arange(yindex.npoints) * 100.0 + 1
  436. xfill = 0
  437. yfill = 2
  438. result_block_vals, rb_index, bfill = sparse_op(
  439. x, xindex, xfill, y, yindex, yfill
  440. )
  441. result_int_vals, ri_index, ifill = sparse_op(
  442. x, xdindex, xfill, y, ydindex, yfill
  443. )
  444. assert rb_index.to_int_index().equals(ri_index)
  445. tm.assert_numpy_array_equal(result_block_vals, result_int_vals)
  446. assert bfill == ifill
  447. # check versus Series...
  448. xseries = Series(x, xdindex.indices)
  449. xseries = xseries.reindex(np.arange(test_length)).fillna(xfill)
  450. yseries = Series(y, ydindex.indices)
  451. yseries = yseries.reindex(np.arange(test_length)).fillna(yfill)
  452. series_result = python_op(xseries, yseries)
  453. series_result = series_result.reindex(ri_index.indices)
  454. tm.assert_numpy_array_equal(result_block_vals, series_result.values)
  455. tm.assert_numpy_array_equal(result_int_vals, series_result.values)