# test_getitem.py
import numpy as np
import pytest

from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    Series,
)
import pandas._testing as tm
from pandas.core.indexing import IndexingError
# ----------------------------------------------------------------------------
# test indexing of Series with multi-level Index
# ----------------------------------------------------------------------------
  14. @pytest.mark.parametrize(
  15. "access_method",
  16. [lambda s, x: s[:, x], lambda s, x: s.loc[:, x], lambda s, x: s.xs(x, level=1)],
  17. )
  18. @pytest.mark.parametrize(
  19. "level1_value, expected",
  20. [(0, Series([1], index=[0])), (1, Series([2, 3], index=[1, 2]))],
  21. )
  22. def test_series_getitem_multiindex(access_method, level1_value, expected):
  23. # GH 6018
  24. # series regression getitem with a multi-index
  25. mi = MultiIndex.from_tuples([(0, 0), (1, 1), (2, 1)], names=["A", "B"])
  26. ser = Series([1, 2, 3], index=mi)
  27. expected.index.name = "A"
  28. result = access_method(ser, level1_value)
  29. tm.assert_series_equal(result, expected)
  30. @pytest.mark.parametrize("level0_value", ["D", "A"])
  31. def test_series_getitem_duplicates_multiindex(level0_value):
  32. # GH 5725 the 'A' happens to be a valid Timestamp so the doesn't raise
  33. # the appropriate error, only in PY3 of course!
  34. index = MultiIndex(
  35. levels=[[level0_value, "B", "C"], [0, 26, 27, 37, 57, 67, 75, 82]],
  36. codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]],
  37. names=["tag", "day"],
  38. )
  39. arr = np.random.default_rng(2).standard_normal((len(index), 1))
  40. df = DataFrame(arr, index=index, columns=["val"])
  41. # confirm indexing on missing value raises KeyError
  42. if level0_value != "A":
  43. with pytest.raises(KeyError, match=r"^'A'$"):
  44. df.val["A"]
  45. with pytest.raises(KeyError, match=r"^'X'$"):
  46. df.val["X"]
  47. result = df.val[level0_value]
  48. expected = Series(
  49. arr.ravel()[0:3], name="val", index=Index([26, 37, 57], name="day")
  50. )
  51. tm.assert_series_equal(result, expected)
  52. def test_series_getitem(multiindex_year_month_day_dataframe_random_data, indexer_sl):
  53. s = multiindex_year_month_day_dataframe_random_data["A"]
  54. expected = s.reindex(s.index[42:65])
  55. expected.index = expected.index.droplevel(0).droplevel(0)
  56. result = indexer_sl(s)[2000, 3]
  57. tm.assert_series_equal(result, expected)
  58. def test_series_getitem_returns_scalar(
  59. multiindex_year_month_day_dataframe_random_data, indexer_sl
  60. ):
  61. s = multiindex_year_month_day_dataframe_random_data["A"]
  62. expected = s.iloc[49]
  63. result = indexer_sl(s)[2000, 3, 10]
  64. assert result == expected
  65. @pytest.mark.parametrize(
  66. "indexer,expected_error,expected_error_msg",
  67. [
  68. (lambda s: s.__getitem__((2000, 3, 4)), KeyError, r"^\(2000, 3, 4\)$"),
  69. (lambda s: s[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"),
  70. (lambda s: s.loc[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"),
  71. (lambda s: s.loc[(2000, 3, 4, 5)], IndexingError, "Too many indexers"),
  72. (lambda s: s.__getitem__(len(s)), KeyError, ""), # match should include len(s)
  73. (lambda s: s[len(s)], KeyError, ""), # match should include len(s)
  74. (
  75. lambda s: s.iloc[len(s)],
  76. IndexError,
  77. "single positional indexer is out-of-bounds",
  78. ),
  79. ],
  80. )
  81. def test_series_getitem_indexing_errors(
  82. multiindex_year_month_day_dataframe_random_data,
  83. indexer,
  84. expected_error,
  85. expected_error_msg,
  86. ):
  87. s = multiindex_year_month_day_dataframe_random_data["A"]
  88. with pytest.raises(expected_error, match=expected_error_msg):
  89. indexer(s)
  90. def test_series_getitem_corner_generator(
  91. multiindex_year_month_day_dataframe_random_data,
  92. ):
  93. s = multiindex_year_month_day_dataframe_random_data["A"]
  94. result = s[(x > 0 for x in s)]
  95. expected = s[s > 0]
  96. tm.assert_series_equal(result, expected)
# ----------------------------------------------------------------------------
# test indexing of DataFrame with multi-level Index
# ----------------------------------------------------------------------------
  100. def test_getitem_simple(multiindex_dataframe_random_data):
  101. df = multiindex_dataframe_random_data.T
  102. expected = df.values[:, 0]
  103. result = df["foo", "one"].values
  104. tm.assert_almost_equal(result, expected)
  105. @pytest.mark.parametrize(
  106. "indexer,expected_error_msg",
  107. [
  108. (lambda df: df[("foo", "four")], r"^\('foo', 'four'\)$"),
  109. (lambda df: df["foobar"], r"^'foobar'$"),
  110. ],
  111. )
  112. def test_frame_getitem_simple_key_error(
  113. multiindex_dataframe_random_data, indexer, expected_error_msg
  114. ):
  115. df = multiindex_dataframe_random_data.T
  116. with pytest.raises(KeyError, match=expected_error_msg):
  117. indexer(df)
  118. def test_tuple_string_column_names():
  119. # GH#50372
  120. mi = MultiIndex.from_tuples([("a", "aa"), ("a", "ab"), ("b", "ba"), ("b", "bb")])
  121. df = DataFrame([range(4), range(1, 5), range(2, 6)], columns=mi)
  122. df["single_index"] = 0
  123. df_flat = df.copy()
  124. df_flat.columns = df_flat.columns.to_flat_index()
  125. df_flat["new_single_index"] = 0
  126. result = df_flat[[("a", "aa"), "new_single_index"]]
  127. expected = DataFrame(
  128. [[0, 0], [1, 0], [2, 0]], columns=Index([("a", "aa"), "new_single_index"])
  129. )
  130. tm.assert_frame_equal(result, expected)
  131. def test_frame_getitem_multicolumn_empty_level():
  132. df = DataFrame({"a": ["1", "2", "3"], "b": ["2", "3", "4"]})
  133. df.columns = [
  134. ["level1 item1", "level1 item2"],
  135. ["", "level2 item2"],
  136. ["level3 item1", "level3 item2"],
  137. ]
  138. result = df["level1 item1"]
  139. expected = DataFrame(
  140. [["1"], ["2"], ["3"]], index=df.index, columns=["level3 item1"]
  141. )
  142. tm.assert_frame_equal(result, expected)
  143. @pytest.mark.parametrize(
  144. "indexer,expected_slice",
  145. [
  146. (lambda df: df["foo"], slice(3)),
  147. (lambda df: df["bar"], slice(3, 5)),
  148. (lambda df: df.loc[:, "bar"], slice(3, 5)),
  149. ],
  150. )
  151. def test_frame_getitem_toplevel(
  152. multiindex_dataframe_random_data, indexer, expected_slice
  153. ):
  154. df = multiindex_dataframe_random_data.T
  155. expected = df.reindex(columns=df.columns[expected_slice])
  156. expected.columns = expected.columns.droplevel(0)
  157. result = indexer(df)
  158. tm.assert_frame_equal(result, expected)
  159. def test_frame_mixed_depth_get():
  160. arrays = [
  161. ["a", "top", "top", "routine1", "routine1", "routine2"],
  162. ["", "OD", "OD", "result1", "result2", "result1"],
  163. ["", "wx", "wy", "", "", ""],
  164. ]
  165. tuples = sorted(zip(*arrays))
  166. index = MultiIndex.from_tuples(tuples)
  167. df = DataFrame(np.random.default_rng(2).standard_normal((4, 6)), columns=index)
  168. result = df["a"]
  169. expected = df["a", "", ""].rename("a")
  170. tm.assert_series_equal(result, expected)
  171. result = df["routine1", "result1"]
  172. expected = df["routine1", "result1", ""]
  173. expected = expected.rename(("routine1", "result1"))
  174. tm.assert_series_equal(result, expected)
  175. def test_frame_getitem_nan_multiindex(nulls_fixture):
  176. # GH#29751
  177. # loc on a multiindex containing nan values
  178. n = nulls_fixture # for code readability
  179. cols = ["a", "b", "c"]
  180. df = DataFrame(
  181. [[11, n, 13], [21, n, 23], [31, n, 33], [41, n, 43]],
  182. columns=cols,
  183. ).set_index(["a", "b"])
  184. df["c"] = df["c"].astype("int64")
  185. idx = (21, n)
  186. result = df.loc[:idx]
  187. expected = DataFrame([[11, n, 13], [21, n, 23]], columns=cols).set_index(["a", "b"])
  188. expected["c"] = expected["c"].astype("int64")
  189. tm.assert_frame_equal(result, expected)
  190. result = df.loc[idx:]
  191. expected = DataFrame(
  192. [[21, n, 23], [31, n, 33], [41, n, 43]], columns=cols
  193. ).set_index(["a", "b"])
  194. expected["c"] = expected["c"].astype("int64")
  195. tm.assert_frame_equal(result, expected)
  196. idx1, idx2 = (21, n), (31, n)
  197. result = df.loc[idx1:idx2]
  198. expected = DataFrame([[21, n, 23], [31, n, 33]], columns=cols).set_index(["a", "b"])
  199. expected["c"] = expected["c"].astype("int64")
  200. tm.assert_frame_equal(result, expected)
  201. @pytest.mark.parametrize(
  202. "indexer,expected",
  203. [
  204. (
  205. (["b"], ["bar", np.nan]),
  206. (
  207. DataFrame(
  208. [[2, 3], [5, 6]],
  209. columns=MultiIndex.from_tuples([("b", "bar"), ("b", np.nan)]),
  210. dtype="int64",
  211. )
  212. ),
  213. ),
  214. (
  215. (["a", "b"]),
  216. (
  217. DataFrame(
  218. [[1, 2, 3], [4, 5, 6]],
  219. columns=MultiIndex.from_tuples(
  220. [("a", "foo"), ("b", "bar"), ("b", np.nan)]
  221. ),
  222. dtype="int64",
  223. )
  224. ),
  225. ),
  226. (
  227. (["b"]),
  228. (
  229. DataFrame(
  230. [[2, 3], [5, 6]],
  231. columns=MultiIndex.from_tuples([("b", "bar"), ("b", np.nan)]),
  232. dtype="int64",
  233. )
  234. ),
  235. ),
  236. (
  237. (["b"], ["bar"]),
  238. (
  239. DataFrame(
  240. [[2], [5]],
  241. columns=MultiIndex.from_tuples([("b", "bar")]),
  242. dtype="int64",
  243. )
  244. ),
  245. ),
  246. (
  247. (["b"], [np.nan]),
  248. (
  249. DataFrame(
  250. [[3], [6]],
  251. columns=MultiIndex(
  252. codes=[[1], [-1]], levels=[["a", "b"], ["bar", "foo"]]
  253. ),
  254. dtype="int64",
  255. )
  256. ),
  257. ),
  258. (("b", np.nan), Series([3, 6], dtype="int64", name=("b", np.nan))),
  259. ],
  260. )
  261. def test_frame_getitem_nan_cols_multiindex(
  262. indexer,
  263. expected,
  264. nulls_fixture,
  265. ):
  266. # Slicing MultiIndex including levels with nan values, for more information
  267. # see GH#25154
  268. df = DataFrame(
  269. [[1, 2, 3], [4, 5, 6]],
  270. columns=MultiIndex.from_tuples(
  271. [("a", "foo"), ("b", "bar"), ("b", nulls_fixture)]
  272. ),
  273. dtype="int64",
  274. )
  275. result = df.loc[:, indexer]
  276. tm.assert_equal(result, expected)
# ----------------------------------------------------------------------------
# test indexing of DataFrame with multi-level Index with duplicates
# ----------------------------------------------------------------------------
  280. @pytest.fixture
  281. def dataframe_with_duplicate_index():
  282. """Fixture for DataFrame used in tests for gh-4145 and gh-4146"""
  283. data = [["a", "d", "e", "c", "f", "b"], [1, 4, 5, 3, 6, 2], [1, 4, 5, 3, 6, 2]]
  284. index = ["h1", "h3", "h5"]
  285. columns = MultiIndex(
  286. levels=[["A", "B"], ["A1", "A2", "B1", "B2"]],
  287. codes=[[0, 0, 0, 1, 1, 1], [0, 3, 3, 0, 1, 2]],
  288. names=["main", "sub"],
  289. )
  290. return DataFrame(data, index=index, columns=columns)
  291. @pytest.mark.parametrize(
  292. "indexer", [lambda df: df[("A", "A1")], lambda df: df.loc[:, ("A", "A1")]]
  293. )
  294. def test_frame_mi_access(dataframe_with_duplicate_index, indexer):
  295. # GH 4145
  296. df = dataframe_with_duplicate_index
  297. index = Index(["h1", "h3", "h5"])
  298. columns = MultiIndex.from_tuples([("A", "A1")], names=["main", "sub"])
  299. expected = DataFrame([["a", 1, 1]], index=columns, columns=index).T
  300. result = indexer(df)
  301. tm.assert_frame_equal(result, expected)
  302. def test_frame_mi_access_returns_series(dataframe_with_duplicate_index):
  303. # GH 4146, not returning a block manager when selecting a unique index
  304. # from a duplicate index
  305. # as of 4879, this returns a Series (which is similar to what happens
  306. # with a non-unique)
  307. df = dataframe_with_duplicate_index
  308. expected = Series(["a", 1, 1], index=["h1", "h3", "h5"], name="A1")
  309. result = df["A"]["A1"]
  310. tm.assert_series_equal(result, expected)
  311. def test_frame_mi_access_returns_frame(dataframe_with_duplicate_index):
  312. # selecting a non_unique from the 2nd level
  313. df = dataframe_with_duplicate_index
  314. expected = DataFrame(
  315. [["d", 4, 4], ["e", 5, 5]],
  316. index=Index(["B2", "B2"], name="sub"),
  317. columns=["h1", "h3", "h5"],
  318. ).T
  319. result = df["A"]["B2"]
  320. tm.assert_frame_equal(result, expected)
  321. def test_frame_mi_empty_slice():
  322. # GH 15454
  323. df = DataFrame(0, index=range(2), columns=MultiIndex.from_product([[1], [2]]))
  324. result = df[[]]
  325. expected = DataFrame(
  326. index=[0, 1], columns=MultiIndex(levels=[[1], [2]], codes=[[], []])
  327. )
  328. tm.assert_frame_equal(result, expected)
  329. def test_loc_empty_multiindex():
  330. # GH#36936
  331. arrays = [["a", "a", "b", "a"], ["a", "a", "b", "b"]]
  332. index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2"))
  333. df = DataFrame([1, 2, 3, 4], index=index, columns=["value"])
  334. # loc on empty multiindex == loc with False mask
  335. empty_multiindex = df.loc[df.loc[:, "value"] == 0, :].index
  336. result = df.loc[empty_multiindex, :]
  337. expected = df.loc[[False] * len(df.index), :]
  338. tm.assert_frame_equal(result, expected)
  339. # replacing value with loc on empty multiindex
  340. df.loc[df.loc[df.loc[:, "value"] == 0].index, "value"] = 5
  341. result = df
  342. expected = DataFrame([1, 2, 3, 4], index=index, columns=["value"])
  343. tm.assert_frame_equal(result, expected)