test_indexing.py 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001
  1. from datetime import timedelta
  2. import re
  3. import numpy as np
  4. import pytest
  5. from pandas._libs import index as libindex
  6. from pandas.errors import (
  7. InvalidIndexError,
  8. PerformanceWarning,
  9. )
  10. import pandas as pd
  11. from pandas import (
  12. Categorical,
  13. DataFrame,
  14. Index,
  15. MultiIndex,
  16. date_range,
  17. )
  18. import pandas._testing as tm
  19. class TestSliceLocs:
  20. def test_slice_locs_partial(self, idx):
  21. sorted_idx, _ = idx.sortlevel(0)
  22. result = sorted_idx.slice_locs(("foo", "two"), ("qux", "one"))
  23. assert result == (1, 5)
  24. result = sorted_idx.slice_locs(None, ("qux", "one"))
  25. assert result == (0, 5)
  26. result = sorted_idx.slice_locs(("foo", "two"), None)
  27. assert result == (1, len(sorted_idx))
  28. result = sorted_idx.slice_locs("bar", "baz")
  29. assert result == (2, 4)
  30. def test_slice_locs(self):
  31. df = DataFrame(
  32. np.random.default_rng(2).standard_normal((50, 4)),
  33. columns=Index(list("ABCD"), dtype=object),
  34. index=date_range("2000-01-01", periods=50, freq="B"),
  35. )
  36. stacked = df.stack(future_stack=True)
  37. idx = stacked.index
  38. slob = slice(*idx.slice_locs(df.index[5], df.index[15]))
  39. sliced = stacked[slob]
  40. expected = df[5:16].stack(future_stack=True)
  41. tm.assert_almost_equal(sliced.values, expected.values)
  42. slob = slice(
  43. *idx.slice_locs(
  44. df.index[5] + timedelta(seconds=30),
  45. df.index[15] - timedelta(seconds=30),
  46. )
  47. )
  48. sliced = stacked[slob]
  49. expected = df[6:15].stack(future_stack=True)
  50. tm.assert_almost_equal(sliced.values, expected.values)
  51. def test_slice_locs_with_type_mismatch(self):
  52. df = DataFrame(
  53. np.random.default_rng(2).standard_normal((10, 4)),
  54. columns=Index(list("ABCD"), dtype=object),
  55. index=date_range("2000-01-01", periods=10, freq="B"),
  56. )
  57. stacked = df.stack(future_stack=True)
  58. idx = stacked.index
  59. with pytest.raises(TypeError, match="^Level type mismatch"):
  60. idx.slice_locs((1, 3))
  61. with pytest.raises(TypeError, match="^Level type mismatch"):
  62. idx.slice_locs(df.index[5] + timedelta(seconds=30), (5, 2))
  63. df = DataFrame(
  64. np.ones((5, 5)),
  65. index=Index([f"i-{i}" for i in range(5)], name="a"),
  66. columns=Index([f"i-{i}" for i in range(5)], name="a"),
  67. )
  68. stacked = df.stack(future_stack=True)
  69. idx = stacked.index
  70. with pytest.raises(TypeError, match="^Level type mismatch"):
  71. idx.slice_locs(timedelta(seconds=30))
  72. # TODO: Try creating a UnicodeDecodeError in exception message
  73. with pytest.raises(TypeError, match="^Level type mismatch"):
  74. idx.slice_locs(df.index[1], (16, "a"))
  75. def test_slice_locs_not_sorted(self):
  76. index = MultiIndex(
  77. levels=[Index(np.arange(4)), Index(np.arange(4)), Index(np.arange(4))],
  78. codes=[
  79. np.array([0, 0, 1, 2, 2, 2, 3, 3]),
  80. np.array([0, 1, 0, 0, 0, 1, 0, 1]),
  81. np.array([1, 0, 1, 1, 0, 0, 1, 0]),
  82. ],
  83. )
  84. msg = "[Kk]ey length.*greater than MultiIndex lexsort depth"
  85. with pytest.raises(KeyError, match=msg):
  86. index.slice_locs((1, 0, 1), (2, 1, 0))
  87. # works
  88. sorted_index, _ = index.sortlevel(0)
  89. # should there be a test case here???
  90. sorted_index.slice_locs((1, 0, 1), (2, 1, 0))
  91. def test_slice_locs_not_contained(self):
  92. # some searchsorted action
  93. index = MultiIndex(
  94. levels=[[0, 2, 4, 6], [0, 2, 4]],
  95. codes=[[0, 0, 0, 1, 1, 2, 3, 3, 3], [0, 1, 2, 1, 2, 2, 0, 1, 2]],
  96. )
  97. result = index.slice_locs((1, 0), (5, 2))
  98. assert result == (3, 6)
  99. result = index.slice_locs(1, 5)
  100. assert result == (3, 6)
  101. result = index.slice_locs((2, 2), (5, 2))
  102. assert result == (3, 6)
  103. result = index.slice_locs(2, 5)
  104. assert result == (3, 6)
  105. result = index.slice_locs((1, 0), (6, 3))
  106. assert result == (3, 8)
  107. result = index.slice_locs(-1, 10)
  108. assert result == (0, len(index))
  109. @pytest.mark.parametrize(
  110. "index_arr,expected,start_idx,end_idx",
  111. [
  112. ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, None),
  113. ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, "b"),
  114. ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, ("b", "e")),
  115. ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), None),
  116. ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), "c"),
  117. ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), ("c", "e")),
  118. ],
  119. )
  120. def test_slice_locs_with_missing_value(
  121. self, index_arr, expected, start_idx, end_idx
  122. ):
  123. # issue 19132
  124. idx = MultiIndex.from_arrays(index_arr)
  125. result = idx.slice_locs(start=start_idx, end=end_idx)
  126. assert result == expected
  127. class TestPutmask:
  128. def test_putmask_with_wrong_mask(self, idx):
  129. # GH18368
  130. msg = "putmask: mask and data must be the same size"
  131. with pytest.raises(ValueError, match=msg):
  132. idx.putmask(np.ones(len(idx) + 1, np.bool_), 1)
  133. with pytest.raises(ValueError, match=msg):
  134. idx.putmask(np.ones(len(idx) - 1, np.bool_), 1)
  135. with pytest.raises(ValueError, match=msg):
  136. idx.putmask("foo", 1)
  137. def test_putmask_multiindex_other(self):
  138. # GH#43212 `value` is also a MultiIndex
  139. left = MultiIndex.from_tuples([(np.nan, 6), (np.nan, 6), ("a", 4)])
  140. right = MultiIndex.from_tuples([("a", 1), ("a", 1), ("d", 1)])
  141. mask = np.array([True, True, False])
  142. result = left.putmask(mask, right)
  143. expected = MultiIndex.from_tuples([right[0], right[1], left[2]])
  144. tm.assert_index_equal(result, expected)
  145. def test_putmask_keep_dtype(self, any_numeric_ea_dtype):
  146. # GH#49830
  147. midx = MultiIndex.from_arrays(
  148. [pd.Series([1, 2, 3], dtype=any_numeric_ea_dtype), [10, 11, 12]]
  149. )
  150. midx2 = MultiIndex.from_arrays(
  151. [pd.Series([5, 6, 7], dtype=any_numeric_ea_dtype), [-1, -2, -3]]
  152. )
  153. result = midx.putmask([True, False, False], midx2)
  154. expected = MultiIndex.from_arrays(
  155. [pd.Series([5, 2, 3], dtype=any_numeric_ea_dtype), [-1, 11, 12]]
  156. )
  157. tm.assert_index_equal(result, expected)
  158. def test_putmask_keep_dtype_shorter_value(self, any_numeric_ea_dtype):
  159. # GH#49830
  160. midx = MultiIndex.from_arrays(
  161. [pd.Series([1, 2, 3], dtype=any_numeric_ea_dtype), [10, 11, 12]]
  162. )
  163. midx2 = MultiIndex.from_arrays(
  164. [pd.Series([5], dtype=any_numeric_ea_dtype), [-1]]
  165. )
  166. result = midx.putmask([True, False, False], midx2)
  167. expected = MultiIndex.from_arrays(
  168. [pd.Series([5, 2, 3], dtype=any_numeric_ea_dtype), [-1, 11, 12]]
  169. )
  170. tm.assert_index_equal(result, expected)
  171. class TestGetIndexer:
  172. def test_get_indexer(self):
  173. major_axis = Index(np.arange(4))
  174. minor_axis = Index(np.arange(2))
  175. major_codes = np.array([0, 0, 1, 2, 2, 3, 3], dtype=np.intp)
  176. minor_codes = np.array([0, 1, 0, 0, 1, 0, 1], dtype=np.intp)
  177. index = MultiIndex(
  178. levels=[major_axis, minor_axis], codes=[major_codes, minor_codes]
  179. )
  180. idx1 = index[:5]
  181. idx2 = index[[1, 3, 5]]
  182. r1 = idx1.get_indexer(idx2)
  183. tm.assert_almost_equal(r1, np.array([1, 3, -1], dtype=np.intp))
  184. r1 = idx2.get_indexer(idx1, method="pad")
  185. e1 = np.array([-1, 0, 0, 1, 1], dtype=np.intp)
  186. tm.assert_almost_equal(r1, e1)
  187. r2 = idx2.get_indexer(idx1[::-1], method="pad")
  188. tm.assert_almost_equal(r2, e1[::-1])
  189. rffill1 = idx2.get_indexer(idx1, method="ffill")
  190. tm.assert_almost_equal(r1, rffill1)
  191. r1 = idx2.get_indexer(idx1, method="backfill")
  192. e1 = np.array([0, 0, 1, 1, 2], dtype=np.intp)
  193. tm.assert_almost_equal(r1, e1)
  194. r2 = idx2.get_indexer(idx1[::-1], method="backfill")
  195. tm.assert_almost_equal(r2, e1[::-1])
  196. rbfill1 = idx2.get_indexer(idx1, method="bfill")
  197. tm.assert_almost_equal(r1, rbfill1)
  198. # pass non-MultiIndex
  199. r1 = idx1.get_indexer(idx2.values)
  200. rexp1 = idx1.get_indexer(idx2)
  201. tm.assert_almost_equal(r1, rexp1)
  202. r1 = idx1.get_indexer([1, 2, 3])
  203. assert (r1 == [-1, -1, -1]).all()
  204. # create index with duplicates
  205. idx1 = Index(list(range(10)) + list(range(10)))
  206. idx2 = Index(list(range(20)))
  207. msg = "Reindexing only valid with uniquely valued Index objects"
  208. with pytest.raises(InvalidIndexError, match=msg):
  209. idx1.get_indexer(idx2)
  210. def test_get_indexer_nearest(self):
  211. midx = MultiIndex.from_tuples([("a", 1), ("b", 2)])
  212. msg = (
  213. "method='nearest' not implemented yet for MultiIndex; "
  214. "see GitHub issue 9365"
  215. )
  216. with pytest.raises(NotImplementedError, match=msg):
  217. midx.get_indexer(["a"], method="nearest")
  218. msg = "tolerance not implemented yet for MultiIndex"
  219. with pytest.raises(NotImplementedError, match=msg):
  220. midx.get_indexer(["a"], method="pad", tolerance=2)
  221. def test_get_indexer_categorical_time(self):
  222. # https://github.com/pandas-dev/pandas/issues/21390
  223. midx = MultiIndex.from_product(
  224. [
  225. Categorical(["a", "b", "c"]),
  226. Categorical(date_range("2012-01-01", periods=3, freq="h")),
  227. ]
  228. )
  229. result = midx.get_indexer(midx)
  230. tm.assert_numpy_array_equal(result, np.arange(9, dtype=np.intp))
  231. @pytest.mark.parametrize(
  232. "index_arr,labels,expected",
  233. [
  234. (
  235. [[1, np.nan, 2], [3, 4, 5]],
  236. [1, np.nan, 2],
  237. np.array([-1, -1, -1], dtype=np.intp),
  238. ),
  239. ([[1, np.nan, 2], [3, 4, 5]], [(np.nan, 4)], np.array([1], dtype=np.intp)),
  240. ([[1, 2, 3], [np.nan, 4, 5]], [(1, np.nan)], np.array([0], dtype=np.intp)),
  241. (
  242. [[1, 2, 3], [np.nan, 4, 5]],
  243. [np.nan, 4, 5],
  244. np.array([-1, -1, -1], dtype=np.intp),
  245. ),
  246. ],
  247. )
  248. def test_get_indexer_with_missing_value(self, index_arr, labels, expected):
  249. # issue 19132
  250. idx = MultiIndex.from_arrays(index_arr)
  251. result = idx.get_indexer(labels)
  252. tm.assert_numpy_array_equal(result, expected)
  253. def test_get_indexer_methods(self):
  254. # https://github.com/pandas-dev/pandas/issues/29896
  255. # test getting an indexer for another index with different methods
  256. # confirms that getting an indexer without a filling method, getting an
  257. # indexer and backfilling, and getting an indexer and padding all behave
  258. # correctly in the case where all of the target values fall in between
  259. # several levels in the MultiIndex into which they are getting an indexer
  260. #
  261. # visually, the MultiIndexes used in this test are:
  262. # mult_idx_1:
  263. # 0: -1 0
  264. # 1: 2
  265. # 2: 3
  266. # 3: 4
  267. # 4: 0 0
  268. # 5: 2
  269. # 6: 3
  270. # 7: 4
  271. # 8: 1 0
  272. # 9: 2
  273. # 10: 3
  274. # 11: 4
  275. #
  276. # mult_idx_2:
  277. # 0: 0 1
  278. # 1: 3
  279. # 2: 4
  280. mult_idx_1 = MultiIndex.from_product([[-1, 0, 1], [0, 2, 3, 4]])
  281. mult_idx_2 = MultiIndex.from_product([[0], [1, 3, 4]])
  282. indexer = mult_idx_1.get_indexer(mult_idx_2)
  283. expected = np.array([-1, 6, 7], dtype=indexer.dtype)
  284. tm.assert_almost_equal(expected, indexer)
  285. backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="backfill")
  286. expected = np.array([5, 6, 7], dtype=backfill_indexer.dtype)
  287. tm.assert_almost_equal(expected, backfill_indexer)
  288. # ensure the legacy "bfill" option functions identically to "backfill"
  289. backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="bfill")
  290. expected = np.array([5, 6, 7], dtype=backfill_indexer.dtype)
  291. tm.assert_almost_equal(expected, backfill_indexer)
  292. pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="pad")
  293. expected = np.array([4, 6, 7], dtype=pad_indexer.dtype)
  294. tm.assert_almost_equal(expected, pad_indexer)
  295. # ensure the legacy "ffill" option functions identically to "pad"
  296. pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="ffill")
  297. expected = np.array([4, 6, 7], dtype=pad_indexer.dtype)
  298. tm.assert_almost_equal(expected, pad_indexer)
  299. @pytest.mark.parametrize("method", ["pad", "ffill", "backfill", "bfill", "nearest"])
  300. def test_get_indexer_methods_raise_for_non_monotonic(self, method):
  301. # 53452
  302. mi = MultiIndex.from_arrays([[0, 4, 2], [0, 4, 2]])
  303. if method == "nearest":
  304. err = NotImplementedError
  305. msg = "not implemented yet for MultiIndex"
  306. else:
  307. err = ValueError
  308. msg = "index must be monotonic increasing or decreasing"
  309. with pytest.raises(err, match=msg):
  310. mi.get_indexer([(1, 1)], method=method)
  311. def test_get_indexer_three_or_more_levels(self):
  312. # https://github.com/pandas-dev/pandas/issues/29896
  313. # tests get_indexer() on MultiIndexes with 3+ levels
  314. # visually, these are
  315. # mult_idx_1:
  316. # 0: 1 2 5
  317. # 1: 7
  318. # 2: 4 5
  319. # 3: 7
  320. # 4: 6 5
  321. # 5: 7
  322. # 6: 3 2 5
  323. # 7: 7
  324. # 8: 4 5
  325. # 9: 7
  326. # 10: 6 5
  327. # 11: 7
  328. #
  329. # mult_idx_2:
  330. # 0: 1 1 8
  331. # 1: 1 5 9
  332. # 2: 1 6 7
  333. # 3: 2 1 6
  334. # 4: 2 7 6
  335. # 5: 2 7 8
  336. # 6: 3 6 8
  337. mult_idx_1 = MultiIndex.from_product([[1, 3], [2, 4, 6], [5, 7]])
  338. mult_idx_2 = MultiIndex.from_tuples(
  339. [
  340. (1, 1, 8),
  341. (1, 5, 9),
  342. (1, 6, 7),
  343. (2, 1, 6),
  344. (2, 7, 7),
  345. (2, 7, 8),
  346. (3, 6, 8),
  347. ]
  348. )
  349. # sanity check
  350. assert mult_idx_1.is_monotonic_increasing
  351. assert mult_idx_1.is_unique
  352. assert mult_idx_2.is_monotonic_increasing
  353. assert mult_idx_2.is_unique
  354. # show the relationships between the two
  355. assert mult_idx_2[0] < mult_idx_1[0]
  356. assert mult_idx_1[3] < mult_idx_2[1] < mult_idx_1[4]
  357. assert mult_idx_1[5] == mult_idx_2[2]
  358. assert mult_idx_1[5] < mult_idx_2[3] < mult_idx_1[6]
  359. assert mult_idx_1[5] < mult_idx_2[4] < mult_idx_1[6]
  360. assert mult_idx_1[5] < mult_idx_2[5] < mult_idx_1[6]
  361. assert mult_idx_1[-1] < mult_idx_2[6]
  362. indexer_no_fill = mult_idx_1.get_indexer(mult_idx_2)
  363. expected = np.array([-1, -1, 5, -1, -1, -1, -1], dtype=indexer_no_fill.dtype)
  364. tm.assert_almost_equal(expected, indexer_no_fill)
  365. # test with backfilling
  366. indexer_backfilled = mult_idx_1.get_indexer(mult_idx_2, method="backfill")
  367. expected = np.array([0, 4, 5, 6, 6, 6, -1], dtype=indexer_backfilled.dtype)
  368. tm.assert_almost_equal(expected, indexer_backfilled)
  369. # now, the same thing, but forward-filled (aka "padded")
  370. indexer_padded = mult_idx_1.get_indexer(mult_idx_2, method="pad")
  371. expected = np.array([-1, 3, 5, 5, 5, 5, 11], dtype=indexer_padded.dtype)
  372. tm.assert_almost_equal(expected, indexer_padded)
  373. # now, do the indexing in the other direction
  374. assert mult_idx_2[0] < mult_idx_1[0] < mult_idx_2[1]
  375. assert mult_idx_2[0] < mult_idx_1[1] < mult_idx_2[1]
  376. assert mult_idx_2[0] < mult_idx_1[2] < mult_idx_2[1]
  377. assert mult_idx_2[0] < mult_idx_1[3] < mult_idx_2[1]
  378. assert mult_idx_2[1] < mult_idx_1[4] < mult_idx_2[2]
  379. assert mult_idx_2[2] == mult_idx_1[5]
  380. assert mult_idx_2[5] < mult_idx_1[6] < mult_idx_2[6]
  381. assert mult_idx_2[5] < mult_idx_1[7] < mult_idx_2[6]
  382. assert mult_idx_2[5] < mult_idx_1[8] < mult_idx_2[6]
  383. assert mult_idx_2[5] < mult_idx_1[9] < mult_idx_2[6]
  384. assert mult_idx_2[5] < mult_idx_1[10] < mult_idx_2[6]
  385. assert mult_idx_2[5] < mult_idx_1[11] < mult_idx_2[6]
  386. indexer = mult_idx_2.get_indexer(mult_idx_1)
  387. expected = np.array(
  388. [-1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1], dtype=indexer.dtype
  389. )
  390. tm.assert_almost_equal(expected, indexer)
  391. backfill_indexer = mult_idx_2.get_indexer(mult_idx_1, method="bfill")
  392. expected = np.array(
  393. [1, 1, 1, 1, 2, 2, 6, 6, 6, 6, 6, 6], dtype=backfill_indexer.dtype
  394. )
  395. tm.assert_almost_equal(expected, backfill_indexer)
  396. pad_indexer = mult_idx_2.get_indexer(mult_idx_1, method="pad")
  397. expected = np.array(
  398. [0, 0, 0, 0, 1, 2, 5, 5, 5, 5, 5, 5], dtype=pad_indexer.dtype
  399. )
  400. tm.assert_almost_equal(expected, pad_indexer)
  401. def test_get_indexer_crossing_levels(self):
  402. # https://github.com/pandas-dev/pandas/issues/29896
  403. # tests a corner case with get_indexer() with MultiIndexes where, when we
  404. # need to "carry" across levels, proper tuple ordering is respected
  405. #
  406. # the MultiIndexes used in this test, visually, are:
  407. # mult_idx_1:
  408. # 0: 1 1 1 1
  409. # 1: 2
  410. # 2: 2 1
  411. # 3: 2
  412. # 4: 1 2 1 1
  413. # 5: 2
  414. # 6: 2 1
  415. # 7: 2
  416. # 8: 2 1 1 1
  417. # 9: 2
  418. # 10: 2 1
  419. # 11: 2
  420. # 12: 2 2 1 1
  421. # 13: 2
  422. # 14: 2 1
  423. # 15: 2
  424. #
  425. # mult_idx_2:
  426. # 0: 1 3 2 2
  427. # 1: 2 3 2 2
  428. mult_idx_1 = MultiIndex.from_product([[1, 2]] * 4)
  429. mult_idx_2 = MultiIndex.from_tuples([(1, 3, 2, 2), (2, 3, 2, 2)])
  430. # show the tuple orderings, which get_indexer() should respect
  431. assert mult_idx_1[7] < mult_idx_2[0] < mult_idx_1[8]
  432. assert mult_idx_1[-1] < mult_idx_2[1]
  433. indexer = mult_idx_1.get_indexer(mult_idx_2)
  434. expected = np.array([-1, -1], dtype=indexer.dtype)
  435. tm.assert_almost_equal(expected, indexer)
  436. backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="bfill")
  437. expected = np.array([8, -1], dtype=backfill_indexer.dtype)
  438. tm.assert_almost_equal(expected, backfill_indexer)
  439. pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="ffill")
  440. expected = np.array([7, 15], dtype=pad_indexer.dtype)
  441. tm.assert_almost_equal(expected, pad_indexer)
  442. def test_get_indexer_kwarg_validation(self):
  443. # GH#41918
  444. mi = MultiIndex.from_product([range(3), ["A", "B"]])
  445. msg = "limit argument only valid if doing pad, backfill or nearest"
  446. with pytest.raises(ValueError, match=msg):
  447. mi.get_indexer(mi[:-1], limit=4)
  448. msg = "tolerance argument only valid if doing pad, backfill or nearest"
  449. with pytest.raises(ValueError, match=msg):
  450. mi.get_indexer(mi[:-1], tolerance="piano")
  451. def test_get_indexer_nan(self):
  452. # GH#37222
  453. idx1 = MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"])
  454. idx2 = MultiIndex.from_product([["A"], [np.nan, 2.0]], names=["id1", "id2"])
  455. expected = np.array([-1, 1])
  456. result = idx2.get_indexer(idx1)
  457. tm.assert_numpy_array_equal(result, expected, check_dtype=False)
  458. result = idx1.get_indexer(idx2)
  459. tm.assert_numpy_array_equal(result, expected, check_dtype=False)
  460. def test_getitem(idx):
  461. # scalar
  462. assert idx[2] == ("bar", "one")
  463. # slice
  464. result = idx[2:5]
  465. expected = idx[[2, 3, 4]]
  466. assert result.equals(expected)
  467. # boolean
  468. result = idx[[True, False, True, False, True, True]]
  469. result2 = idx[np.array([True, False, True, False, True, True])]
  470. expected = idx[[0, 2, 4, 5]]
  471. assert result.equals(expected)
  472. assert result2.equals(expected)
  473. def test_getitem_group_select(idx):
  474. sorted_idx, _ = idx.sortlevel(0)
  475. assert sorted_idx.get_loc("baz") == slice(3, 4)
  476. assert sorted_idx.get_loc("foo") == slice(0, 2)
  477. @pytest.mark.parametrize("ind1", [[True] * 5, Index([True] * 5)])
  478. @pytest.mark.parametrize(
  479. "ind2",
  480. [[True, False, True, False, False], Index([True, False, True, False, False])],
  481. )
  482. def test_getitem_bool_index_all(ind1, ind2):
  483. # GH#22533
  484. idx = MultiIndex.from_tuples([(10, 1), (20, 2), (30, 3), (40, 4), (50, 5)])
  485. tm.assert_index_equal(idx[ind1], idx)
  486. expected = MultiIndex.from_tuples([(10, 1), (30, 3)])
  487. tm.assert_index_equal(idx[ind2], expected)
  488. @pytest.mark.parametrize("ind1", [[True], Index([True])])
  489. @pytest.mark.parametrize("ind2", [[False], Index([False])])
  490. def test_getitem_bool_index_single(ind1, ind2):
  491. # GH#22533
  492. idx = MultiIndex.from_tuples([(10, 1)])
  493. tm.assert_index_equal(idx[ind1], idx)
  494. expected = MultiIndex(
  495. levels=[np.array([], dtype=np.int64), np.array([], dtype=np.int64)],
  496. codes=[[], []],
  497. )
  498. tm.assert_index_equal(idx[ind2], expected)
  499. class TestGetLoc:
  500. def test_get_loc(self, idx):
  501. assert idx.get_loc(("foo", "two")) == 1
  502. assert idx.get_loc(("baz", "two")) == 3
  503. with pytest.raises(KeyError, match=r"^\('bar', 'two'\)$"):
  504. idx.get_loc(("bar", "two"))
  505. with pytest.raises(KeyError, match=r"^'quux'$"):
  506. idx.get_loc("quux")
  507. # 3 levels
  508. index = MultiIndex(
  509. levels=[Index(np.arange(4)), Index(np.arange(4)), Index(np.arange(4))],
  510. codes=[
  511. np.array([0, 0, 1, 2, 2, 2, 3, 3]),
  512. np.array([0, 1, 0, 0, 0, 1, 0, 1]),
  513. np.array([1, 0, 1, 1, 0, 0, 1, 0]),
  514. ],
  515. )
  516. with pytest.raises(KeyError, match=r"^\(1, 1\)$"):
  517. index.get_loc((1, 1))
  518. assert index.get_loc((2, 0)) == slice(3, 5)
  519. def test_get_loc_duplicates(self):
  520. index = Index([2, 2, 2, 2])
  521. result = index.get_loc(2)
  522. expected = slice(0, 4)
  523. assert result == expected
  524. index = Index(["c", "a", "a", "b", "b"])
  525. rs = index.get_loc("c")
  526. xp = 0
  527. assert rs == xp
  528. with pytest.raises(KeyError, match="2"):
  529. index.get_loc(2)
  530. def test_get_loc_level(self):
  531. index = MultiIndex(
  532. levels=[Index(np.arange(4)), Index(np.arange(4)), Index(np.arange(4))],
  533. codes=[
  534. np.array([0, 0, 1, 2, 2, 2, 3, 3]),
  535. np.array([0, 1, 0, 0, 0, 1, 0, 1]),
  536. np.array([1, 0, 1, 1, 0, 0, 1, 0]),
  537. ],
  538. )
  539. loc, new_index = index.get_loc_level((0, 1))
  540. expected = slice(1, 2)
  541. exp_index = index[expected].droplevel(0).droplevel(0)
  542. assert loc == expected
  543. assert new_index.equals(exp_index)
  544. loc, new_index = index.get_loc_level((0, 1, 0))
  545. expected = 1
  546. assert loc == expected
  547. assert new_index is None
  548. with pytest.raises(KeyError, match=r"^\(2, 2\)$"):
  549. index.get_loc_level((2, 2))
  550. # GH 22221: unused label
  551. with pytest.raises(KeyError, match=r"^2$"):
  552. index.drop(2).get_loc_level(2)
  553. # Unused label on unsorted level:
  554. with pytest.raises(KeyError, match=r"^2$"):
  555. index.drop(1, level=2).get_loc_level(2, level=2)
  556. index = MultiIndex(
  557. levels=[[2000], list(range(4))],
  558. codes=[np.array([0, 0, 0, 0]), np.array([0, 1, 2, 3])],
  559. )
  560. result, new_index = index.get_loc_level((2000, slice(None, None)))
  561. expected = slice(None, None)
  562. assert result == expected
  563. assert new_index.equals(index.droplevel(0))
  564. @pytest.mark.parametrize("dtype1", [int, float, bool, str])
  565. @pytest.mark.parametrize("dtype2", [int, float, bool, str])
  566. def test_get_loc_multiple_dtypes(self, dtype1, dtype2):
  567. # GH 18520
  568. levels = [np.array([0, 1]).astype(dtype1), np.array([0, 1]).astype(dtype2)]
  569. idx = MultiIndex.from_product(levels)
  570. assert idx.get_loc(idx[2]) == 2
  571. @pytest.mark.parametrize("level", [0, 1])
  572. @pytest.mark.parametrize("dtypes", [[int, float], [float, int]])
  573. def test_get_loc_implicit_cast(self, level, dtypes):
  574. # GH 18818, GH 15994 : as flat index, cast int to float and vice-versa
  575. levels = [["a", "b"], ["c", "d"]]
  576. key = ["b", "d"]
  577. lev_dtype, key_dtype = dtypes
  578. levels[level] = np.array([0, 1], dtype=lev_dtype)
  579. key[level] = key_dtype(1)
  580. idx = MultiIndex.from_product(levels)
  581. assert idx.get_loc(tuple(key)) == 3
  582. @pytest.mark.parametrize("dtype", [bool, object])
  583. def test_get_loc_cast_bool(self, dtype):
  584. # GH 19086 : int is casted to bool, but not vice-versa (for object dtype)
  585. # With bool dtype, we don't cast in either direction.
  586. levels = [Index([False, True], dtype=dtype), np.arange(2, dtype="int64")]
  587. idx = MultiIndex.from_product(levels)
  588. if dtype is bool:
  589. with pytest.raises(KeyError, match=r"^\(0, 1\)$"):
  590. assert idx.get_loc((0, 1)) == 1
  591. with pytest.raises(KeyError, match=r"^\(1, 0\)$"):
  592. assert idx.get_loc((1, 0)) == 2
  593. else:
  594. # We use python object comparisons, which treat 0 == False and 1 == True
  595. assert idx.get_loc((0, 1)) == 1
  596. assert idx.get_loc((1, 0)) == 2
  597. with pytest.raises(KeyError, match=r"^\(False, True\)$"):
  598. idx.get_loc((False, True))
  599. with pytest.raises(KeyError, match=r"^\(True, False\)$"):
  600. idx.get_loc((True, False))
  601. @pytest.mark.parametrize("level", [0, 1])
  602. def test_get_loc_nan(self, level, nulls_fixture):
  603. # GH 18485 : NaN in MultiIndex
  604. levels = [["a", "b"], ["c", "d"]]
  605. key = ["b", "d"]
  606. levels[level] = np.array([0, nulls_fixture], dtype=type(nulls_fixture))
  607. key[level] = nulls_fixture
  608. idx = MultiIndex.from_product(levels)
  609. assert idx.get_loc(tuple(key)) == 3
  610. def test_get_loc_missing_nan(self):
  611. # GH 8569
  612. idx = MultiIndex.from_arrays([[1.0, 2.0], [3.0, 4.0]])
  613. assert isinstance(idx.get_loc(1), slice)
  614. with pytest.raises(KeyError, match=r"^3$"):
  615. idx.get_loc(3)
  616. with pytest.raises(KeyError, match=r"^nan$"):
  617. idx.get_loc(np.nan)
  618. with pytest.raises(InvalidIndexError, match=r"\[nan\]"):
  619. # listlike/non-hashable raises TypeError
  620. idx.get_loc([np.nan])
  621. def test_get_loc_with_values_including_missing_values(self):
  622. # issue 19132
  623. idx = MultiIndex.from_product([[np.nan, 1]] * 2)
  624. expected = slice(0, 2, None)
  625. assert idx.get_loc(np.nan) == expected
  626. idx = MultiIndex.from_arrays([[np.nan, 1, 2, np.nan]])
  627. expected = np.array([True, False, False, True])
  628. tm.assert_numpy_array_equal(idx.get_loc(np.nan), expected)
  629. idx = MultiIndex.from_product([[np.nan, 1]] * 3)
  630. expected = slice(2, 4, None)
  631. assert idx.get_loc((np.nan, 1)) == expected
  632. def test_get_loc_duplicates2(self):
  633. # TODO: de-duplicate with test_get_loc_duplicates above?
  634. index = MultiIndex(
  635. levels=[["D", "B", "C"], [0, 26, 27, 37, 57, 67, 75, 82]],
  636. codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]],
  637. names=["tag", "day"],
  638. )
  639. assert index.get_loc("D") == slice(0, 3)
  640. def test_get_loc_past_lexsort_depth(self):
  641. # GH#30053
  642. idx = MultiIndex(
  643. levels=[["a"], [0, 7], [1]],
  644. codes=[[0, 0], [1, 0], [0, 0]],
  645. names=["x", "y", "z"],
  646. sortorder=0,
  647. )
  648. key = ("a", 7)
  649. with tm.assert_produces_warning(PerformanceWarning):
  650. # PerformanceWarning: indexing past lexsort depth may impact performance
  651. result = idx.get_loc(key)
  652. assert result == slice(0, 1, None)
  653. def test_multiindex_get_loc_list_raises(self):
  654. # GH#35878
  655. idx = MultiIndex.from_tuples([("a", 1), ("b", 2)])
  656. msg = r"\[\]"
  657. with pytest.raises(InvalidIndexError, match=msg):
  658. idx.get_loc([])
  659. def test_get_loc_nested_tuple_raises_keyerror(self):
  660. # raise KeyError, not TypeError
  661. mi = MultiIndex.from_product([range(3), range(4), range(5), range(6)])
  662. key = ((2, 3, 4), "foo")
  663. with pytest.raises(KeyError, match=re.escape(str(key))):
  664. mi.get_loc(key)
  665. class TestWhere:
  666. def test_where(self):
  667. i = MultiIndex.from_tuples([("A", 1), ("A", 2)])
  668. msg = r"\.where is not supported for MultiIndex operations"
  669. with pytest.raises(NotImplementedError, match=msg):
  670. i.where(True)
  671. def test_where_array_like(self, listlike_box):
  672. mi = MultiIndex.from_tuples([("A", 1), ("A", 2)])
  673. cond = [False, True]
  674. msg = r"\.where is not supported for MultiIndex operations"
  675. with pytest.raises(NotImplementedError, match=msg):
  676. mi.where(listlike_box(cond))
  class TestContains:
      """Membership (``in``) checks against a ``MultiIndex``."""

      def test_contains_top_level(self):
          """A first-level label is ``in`` the index but not in its ``_engine``."""
          label = "A"
          mi = MultiIndex.from_product([["A", "B"], [1, 2]])
          assert label in mi
          assert label not in mi._engine

      def test_contains_with_nat(self):
          """Every value of an index that holds a NaT entry is found by ``in``."""
          # MI with a NaT
          dates = date_range("2012-01-01", periods=5)
          mi = MultiIndex(
              levels=[["C"], dates],
              codes=[[0] * 6, [-1, 0, 1, 2, 3, 4]],
              names=[None, "B"],
          )
          first_entry = ("C", pd.Timestamp("2012-01-01"))
          assert first_entry in mi
          for entry in mi.values:
              assert entry in mi

      def test_contains(self, idx):
          """Existing tuples are found; an absent tuple and ``None`` are not."""
          present, absent = ("foo", "two"), ("bar", "two")
          assert present in idx
          assert absent not in idx
          assert None not in idx

      def test_contains_with_missing_value(self):
          """NaN membership for single-level and two-level indexes (GH#19132)."""
          single_level = MultiIndex.from_arrays([[1, np.nan, 2]])
          assert np.nan in single_level

          two_level = MultiIndex.from_arrays([[1, 2], [np.nan, 3]])
          assert np.nan not in two_level
          assert (1, np.nan) in two_level

      def test_multiindex_contains_dropped(self):
          """A dropped label stays in ``levels`` but is no longer ``in`` the index.

          Regression test for GH#19027; exercised for integer and string labels.
          """
          cases = (
              ([1, 2], [3, 4], 2),
              # also applies to strings
              (["a", "b"], ["c", "d"], "a"),
          )
          for outer, inner, dropped in cases:
              mi = MultiIndex.from_product([outer, inner])
              assert dropped in mi
              mi = mi.drop(dropped)
              # drop implementation keeps the label in the levels
              assert dropped in mi.levels[0]
              # but it should no longer be in the index itself
              assert dropped not in mi

      def test_contains_td64_level(self):
          """Membership on an index with a timedelta64 first level (GH#24570)."""
          tdi = pd.timedelta_range("09:30:00", "16:00:00", freq="30 min")
          positions = np.arange(len(tdi))
          mi = MultiIndex.from_arrays([tdi, positions])
          assert tdi[0] in mi
          assert "element_not_exit" not in mi
          assert "0 day 09:30:00" in mi

      def test_large_mi_contains(self, monkeypatch):
          """An absent tuple is not found once ``_SIZE_CUTOFF`` is lowered (GH#10645)."""
          with monkeypatch.context():
              # Patch the cutoff down to 10, the length of the index built below.
              monkeypatch.setattr(libindex, "_SIZE_CUTOFF", 10)
              mi = MultiIndex.from_arrays([range(10), range(10)])
              absent = (10, 0)
              assert absent not in mi
  def test_timestamp_multiindex_indexer():
      """Partial-string date slicing on the first level of a three-level index.

      See https://github.com/pandas-dev/pandas/issues/26944
      """
      all_dates = date_range("2019-01-01T00:15:33", periods=100, freq="h", name="date")
      idx = MultiIndex.from_product([all_dates, ["x"], [3]])
      df = DataFrame({"foo": np.arange(len(idx))}, idx)

      selector = pd.IndexSlice["2019-1-2":, "x", :]
      result = df.loc[selector, "foo"]

      selected_dates = date_range(
          start="2019-01-02T00:15:33",
          end="2019-01-05T03:15:33",
          freq="h",
          name="date",
      )
      qidx = MultiIndex.from_product([selected_dates, ["x"], [3]])
      # The slice starts 24 hourly rows into the frame, so values begin at 24.
      values = np.arange(24, len(qidx) + 24)
      should_be = pd.Series(data=values, index=qidx, name="foo")
      tm.assert_series_equal(result, should_be)
  @pytest.mark.parametrize(
      "index_arr,expected,target,algo",
      [
          ([[np.nan, "a", "b"], ["c", "d", "e"]], 0, np.nan, "left"),
          ([[np.nan, "a", "b"], ["c", "d", "e"]], 1, (np.nan, "c"), "right"),
          ([["a", "b", "c"], ["d", np.nan, "d"]], 1, ("b", np.nan), "left"),
      ],
  )
  def test_get_slice_bound_with_missing_value(index_arr, expected, target, algo):
      """``get_slice_bound`` locates a target that contains NaN (issue 19132).

      ``algo`` is passed through as the ``side`` argument of ``get_slice_bound``.
      """
      mi = MultiIndex.from_arrays(index_arr)
      bound = mi.get_slice_bound(target, side=algo)
      assert bound == expected
  @pytest.mark.parametrize(
      "index_arr,expected,start_idx,end_idx",
      [
          ([[np.nan, 1, 2], [3, 4, 5]], slice(0, 2, None), np.nan, 1),
          ([[np.nan, 1, 2], [3, 4, 5]], slice(0, 3, None), np.nan, (2, 5)),
          ([[1, 2, 3], [4, np.nan, 5]], slice(1, 3, None), (2, np.nan), 3),
          ([[1, 2, 3], [4, np.nan, 5]], slice(1, 3, None), (2, np.nan), (3, 5)),
      ],
  )
  def test_slice_indexer_with_missing_value(index_arr, expected, start_idx, end_idx):
      """``slice_indexer`` accepts NaN-bearing start/end bounds (issue 19132)."""
      mi = MultiIndex.from_arrays(index_arr)
      indexer = mi.slice_indexer(start=start_idx, end=end_idx)
      assert indexer == expected
  def test_pyint_engine():
      """Lookups stay exact when the code combinations exceed 64 bits.

      GH#18519 : when combinations of codes cannot be represented in 64
      bits, the index underlying the MultiIndex engine works with Python
      integers, rather than uint64.
      """
      reps = 5
      patterns = [
          [0] * 10 * reps,
          [1] * 10 * reps,
          [2] * 10 * reps,
          [np.nan] * reps + [2] * 9 * reps,
          [0] * reps + [2] * 9 * reps,
          [np.nan] * reps + [2] * 8 * reps + [0] * reps,
      ]
      keys = list(map(tuple, patterns))

      # Each level contains 4 elements (including NaN), so it is represented
      # in 2 bits, for a total of 2*reps*10 = 100 > 64 bits. If we were using a
      # 64 bit engine and truncating the first levels, the fourth and fifth
      # keys would collide; if truncating the last levels, the fifth and
      # sixth; if rotating bits rather than shifting, the third and fifth.
      for position, key in enumerate(keys):
          # The index is rebuilt for every key, exactly as the lookup is repeated.
          index = MultiIndex.from_tuples(keys)
          assert index.get_loc(key) == position

          expected = np.arange(position + 1, dtype=np.intp)
          result = index.get_indexer(keys[: position + 1])
          tm.assert_numpy_array_equal(result, expected)

      # With missing key: it maps to -1, the real keys keep their positions.
      missing = tuple([0, 1] * 5 * reps)
      expected = np.array([-1, *range(len(keys))], dtype=np.intp)
      result = index.get_indexer([missing, *keys])
      tm.assert_numpy_array_equal(result, expected)
  @pytest.mark.parametrize(
      "keys,expected",
      [
          ((slice(None), [5, 4]), [1, 0]),
          ((slice(None), [4, 5]), [0, 1]),
          (([True, False, True], [4, 6]), [0, 2]),
          (([True, False, True], [6, 4]), [0, 2]),
          ((2, [4, 5]), [0, 1]),
          ((2, [5, 4]), [1, 0]),
          (([2], [4, 5]), [0, 1]),
          (([2], [5, 4]), [1, 0]),
      ],
  )
  def test_get_locs_reordering(keys, expected):
      """``get_locs`` returns the positions listed in ``expected`` (GH48384)."""
      first_level = [2, 2, 1]
      second_level = [4, 5, 6]
      mi = MultiIndex.from_arrays([first_level, second_level])
      locs = mi.get_locs(keys)
      expected_locs = np.array(expected, dtype=np.intp)
      tm.assert_numpy_array_equal(locs, expected_locs)
  def test_get_indexer_for_multiindex_with_nans(nulls_fixture):
      """``get_indexer`` between a null-bearing and a null-free index (GH37222).

      In both directions only the shared ``("A", 2.0)`` entry is matched, at
      position 1; the other entry maps to -1.
      """
      level_names = ["id1", "id2"]
      idx1 = MultiIndex.from_product([["A"], [1.0, 2.0]], names=level_names)
      idx2 = MultiIndex.from_product([["A"], [nulls_fixture, 2.0]], names=level_names)

      for source, target in ((idx2, idx1), (idx1, idx2)):
          result = source.get_indexer(target)
          expected = np.array([-1, 1], dtype=np.intp)
          tm.assert_numpy_array_equal(result, expected)