getitem.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. import pandas._testing as tm
  5. class BaseGetitemTests:
  6. """Tests for ExtensionArray.__getitem__."""
  7. def test_iloc_series(self, data):
  8. ser = pd.Series(data)
  9. result = ser.iloc[:4]
  10. expected = pd.Series(data[:4])
  11. tm.assert_series_equal(result, expected)
  12. result = ser.iloc[[0, 1, 2, 3]]
  13. tm.assert_series_equal(result, expected)
  14. def test_iloc_frame(self, data):
  15. df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")})
  16. expected = pd.DataFrame({"A": data[:4]})
  17. # slice -> frame
  18. result = df.iloc[:4, [0]]
  19. tm.assert_frame_equal(result, expected)
  20. # sequence -> frame
  21. result = df.iloc[[0, 1, 2, 3], [0]]
  22. tm.assert_frame_equal(result, expected)
  23. expected = pd.Series(data[:4], name="A")
  24. # slice -> series
  25. result = df.iloc[:4, 0]
  26. tm.assert_series_equal(result, expected)
  27. # sequence -> series
  28. result = df.iloc[:4, 0]
  29. tm.assert_series_equal(result, expected)
  30. # GH#32959 slice columns with step
  31. result = df.iloc[:, ::2]
  32. tm.assert_frame_equal(result, df[["A"]])
  33. result = df[["B", "A"]].iloc[:, ::2]
  34. tm.assert_frame_equal(result, df[["B"]])
  35. def test_iloc_frame_single_block(self, data):
  36. # GH#32959 null slice along index, slice along columns with single-block
  37. df = pd.DataFrame({"A": data})
  38. result = df.iloc[:, :]
  39. tm.assert_frame_equal(result, df)
  40. result = df.iloc[:, :1]
  41. tm.assert_frame_equal(result, df)
  42. result = df.iloc[:, :2]
  43. tm.assert_frame_equal(result, df)
  44. result = df.iloc[:, ::2]
  45. tm.assert_frame_equal(result, df)
  46. result = df.iloc[:, 1:2]
  47. tm.assert_frame_equal(result, df.iloc[:, :0])
  48. result = df.iloc[:, -1:]
  49. tm.assert_frame_equal(result, df)
  50. def test_loc_series(self, data):
  51. ser = pd.Series(data)
  52. result = ser.loc[:3]
  53. expected = pd.Series(data[:4])
  54. tm.assert_series_equal(result, expected)
  55. result = ser.loc[[0, 1, 2, 3]]
  56. tm.assert_series_equal(result, expected)
  57. def test_loc_frame(self, data):
  58. df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")})
  59. expected = pd.DataFrame({"A": data[:4]})
  60. # slice -> frame
  61. result = df.loc[:3, ["A"]]
  62. tm.assert_frame_equal(result, expected)
  63. # sequence -> frame
  64. result = df.loc[[0, 1, 2, 3], ["A"]]
  65. tm.assert_frame_equal(result, expected)
  66. expected = pd.Series(data[:4], name="A")
  67. # slice -> series
  68. result = df.loc[:3, "A"]
  69. tm.assert_series_equal(result, expected)
  70. # sequence -> series
  71. result = df.loc[:3, "A"]
  72. tm.assert_series_equal(result, expected)
  73. def test_loc_iloc_frame_single_dtype(self, data):
  74. # GH#27110 bug in ExtensionBlock.iget caused df.iloc[n] to incorrectly
  75. # return a scalar
  76. df = pd.DataFrame({"A": data})
  77. expected = pd.Series([data[2]], index=["A"], name=2, dtype=data.dtype)
  78. result = df.loc[2]
  79. tm.assert_series_equal(result, expected)
  80. expected = pd.Series(
  81. [data[-1]], index=["A"], name=len(data) - 1, dtype=data.dtype
  82. )
  83. result = df.iloc[-1]
  84. tm.assert_series_equal(result, expected)
  85. def test_getitem_scalar(self, data):
  86. result = data[0]
  87. assert isinstance(result, data.dtype.type)
  88. result = pd.Series(data)[0]
  89. assert isinstance(result, data.dtype.type)
  90. def test_getitem_invalid(self, data):
  91. # TODO: box over scalar, [scalar], (scalar,)?
  92. msg = (
  93. r"only integers, slices \(`:`\), ellipsis \(`...`\), numpy.newaxis "
  94. r"\(`None`\) and integer or boolean arrays are valid indices"
  95. )
  96. with pytest.raises(IndexError, match=msg):
  97. data["foo"]
  98. with pytest.raises(IndexError, match=msg):
  99. data[2.5]
  100. ub = len(data)
  101. msg = "|".join(
  102. [
  103. "list index out of range", # json
  104. "index out of bounds", # pyarrow
  105. "Out of bounds access", # Sparse
  106. f"loc must be an integer between -{ub} and {ub}", # Sparse
  107. f"index {ub+1} is out of bounds for axis 0 with size {ub}",
  108. f"index -{ub+1} is out of bounds for axis 0 with size {ub}",
  109. ]
  110. )
  111. with pytest.raises(IndexError, match=msg):
  112. data[ub + 1]
  113. with pytest.raises(IndexError, match=msg):
  114. data[-ub - 1]
  115. def test_getitem_scalar_na(self, data_missing, na_cmp, na_value):
  116. result = data_missing[0]
  117. assert na_cmp(result, na_value)
  118. def test_getitem_empty(self, data):
  119. # Indexing with empty list
  120. result = data[[]]
  121. assert len(result) == 0
  122. assert isinstance(result, type(data))
  123. expected = data[np.array([], dtype="int64")]
  124. tm.assert_extension_array_equal(result, expected)
  125. def test_getitem_mask(self, data):
  126. # Empty mask, raw array
  127. mask = np.zeros(len(data), dtype=bool)
  128. result = data[mask]
  129. assert len(result) == 0
  130. assert isinstance(result, type(data))
  131. # Empty mask, in series
  132. mask = np.zeros(len(data), dtype=bool)
  133. result = pd.Series(data)[mask]
  134. assert len(result) == 0
  135. assert result.dtype == data.dtype
  136. # non-empty mask, raw array
  137. mask[0] = True
  138. result = data[mask]
  139. assert len(result) == 1
  140. assert isinstance(result, type(data))
  141. # non-empty mask, in series
  142. result = pd.Series(data)[mask]
  143. assert len(result) == 1
  144. assert result.dtype == data.dtype
  145. def test_getitem_mask_raises(self, data):
  146. mask = np.array([True, False])
  147. msg = f"Boolean index has wrong length: 2 instead of {len(data)}"
  148. with pytest.raises(IndexError, match=msg):
  149. data[mask]
  150. mask = pd.array(mask, dtype="boolean")
  151. with pytest.raises(IndexError, match=msg):
  152. data[mask]
  153. def test_getitem_boolean_array_mask(self, data):
  154. mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
  155. result = data[mask]
  156. assert len(result) == 0
  157. assert isinstance(result, type(data))
  158. result = pd.Series(data)[mask]
  159. assert len(result) == 0
  160. assert result.dtype == data.dtype
  161. mask[:5] = True
  162. expected = data.take([0, 1, 2, 3, 4])
  163. result = data[mask]
  164. tm.assert_extension_array_equal(result, expected)
  165. expected = pd.Series(expected)
  166. result = pd.Series(data)[mask]
  167. tm.assert_series_equal(result, expected)
  168. def test_getitem_boolean_na_treated_as_false(self, data):
  169. # https://github.com/pandas-dev/pandas/issues/31503
  170. mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
  171. mask[:2] = pd.NA
  172. mask[2:4] = True
  173. result = data[mask]
  174. expected = data[mask.fillna(False)]
  175. tm.assert_extension_array_equal(result, expected)
  176. s = pd.Series(data)
  177. result = s[mask]
  178. expected = s[mask.fillna(False)]
  179. tm.assert_series_equal(result, expected)
  180. @pytest.mark.parametrize(
  181. "idx",
  182. [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
  183. ids=["list", "integer-array", "numpy-array"],
  184. )
  185. def test_getitem_integer_array(self, data, idx):
  186. result = data[idx]
  187. assert len(result) == 3
  188. assert isinstance(result, type(data))
  189. expected = data.take([0, 1, 2])
  190. tm.assert_extension_array_equal(result, expected)
  191. expected = pd.Series(expected)
  192. result = pd.Series(data)[idx]
  193. tm.assert_series_equal(result, expected)
  194. @pytest.mark.parametrize(
  195. "idx",
  196. [[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")],
  197. ids=["list", "integer-array"],
  198. )
  199. def test_getitem_integer_with_missing_raises(self, data, idx):
  200. msg = "Cannot index with an integer indexer containing NA values"
  201. with pytest.raises(ValueError, match=msg):
  202. data[idx]
  203. @pytest.mark.xfail(
  204. reason="Tries label-based and raises KeyError; "
  205. "in some cases raises when calling np.asarray"
  206. )
  207. @pytest.mark.parametrize(
  208. "idx",
  209. [[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")],
  210. ids=["list", "integer-array"],
  211. )
  212. def test_getitem_series_integer_with_missing_raises(self, data, idx):
  213. msg = "Cannot index with an integer indexer containing NA values"
  214. # TODO: this raises KeyError about labels not found (it tries label-based)
  215. ser = pd.Series(data, index=[chr(100 + i) for i in range(len(data))])
  216. with pytest.raises(ValueError, match=msg):
  217. ser[idx]
  218. def test_getitem_slice(self, data):
  219. # getitem[slice] should return an array
  220. result = data[slice(0)] # empty
  221. assert isinstance(result, type(data))
  222. result = data[slice(1)] # scalar
  223. assert isinstance(result, type(data))
  224. def test_getitem_ellipsis_and_slice(self, data):
  225. # GH#40353 this is called from slice_block_rows
  226. result = data[..., :]
  227. tm.assert_extension_array_equal(result, data)
  228. result = data[:, ...]
  229. tm.assert_extension_array_equal(result, data)
  230. result = data[..., :3]
  231. tm.assert_extension_array_equal(result, data[:3])
  232. result = data[:3, ...]
  233. tm.assert_extension_array_equal(result, data[:3])
  234. result = data[..., ::2]
  235. tm.assert_extension_array_equal(result, data[::2])
  236. result = data[::2, ...]
  237. tm.assert_extension_array_equal(result, data[::2])
  238. def test_get(self, data):
  239. # GH 20882
  240. s = pd.Series(data, index=[2 * i for i in range(len(data))])
  241. assert s.get(4) == s.iloc[2]
  242. result = s.get([4, 6])
  243. expected = s.iloc[[2, 3]]
  244. tm.assert_series_equal(result, expected)
  245. result = s.get(slice(2))
  246. expected = s.iloc[[0, 1]]
  247. tm.assert_series_equal(result, expected)
  248. assert s.get(-1) is None
  249. assert s.get(s.index.max() + 1) is None
  250. s = pd.Series(data[:6], index=list("abcdef"))
  251. assert s.get("c") == s.iloc[2]
  252. result = s.get(slice("b", "d"))
  253. expected = s.iloc[[1, 2, 3]]
  254. tm.assert_series_equal(result, expected)
  255. result = s.get("Z")
  256. assert result is None
  257. msg = "Series.__getitem__ treating keys as positions is deprecated"
  258. with tm.assert_produces_warning(FutureWarning, match=msg):
  259. assert s.get(4) == s.iloc[4]
  260. assert s.get(-1) == s.iloc[-1]
  261. assert s.get(len(s)) is None
  262. # GH 21257
  263. s = pd.Series(data)
  264. with tm.assert_produces_warning(None):
  265. # GH#45324 make sure we aren't giving a spurious FutureWarning
  266. s2 = s[::2]
  267. assert s2.get(1) is None
  268. def test_take_sequence(self, data):
  269. result = pd.Series(data)[[0, 1, 3]]
  270. assert result.iloc[0] == data[0]
  271. assert result.iloc[1] == data[1]
  272. assert result.iloc[2] == data[3]
  273. def test_take(self, data, na_value, na_cmp):
  274. result = data.take([0, -1])
  275. assert result.dtype == data.dtype
  276. assert result[0] == data[0]
  277. assert result[1] == data[-1]
  278. result = data.take([0, -1], allow_fill=True, fill_value=na_value)
  279. assert result[0] == data[0]
  280. assert na_cmp(result[1], na_value)
  281. with pytest.raises(IndexError, match="out of bounds"):
  282. data.take([len(data) + 1])
  283. def test_take_empty(self, data, na_value, na_cmp):
  284. empty = data[:0]
  285. result = empty.take([-1], allow_fill=True)
  286. assert na_cmp(result[0], na_value)
  287. msg = "cannot do a non-empty take from an empty axes|out of bounds"
  288. with pytest.raises(IndexError, match=msg):
  289. empty.take([-1])
  290. with pytest.raises(IndexError, match="cannot do a non-empty take"):
  291. empty.take([0, 1])
  292. def test_take_negative(self, data):
  293. # https://github.com/pandas-dev/pandas/issues/20640
  294. n = len(data)
  295. result = data.take([0, -n, n - 1, -1])
  296. expected = data.take([0, 0, n - 1, n - 1])
  297. tm.assert_extension_array_equal(result, expected)
  298. def test_take_non_na_fill_value(self, data_missing):
  299. fill_value = data_missing[1] # valid
  300. na = data_missing[0]
  301. arr = data_missing._from_sequence(
  302. [na, fill_value, na], dtype=data_missing.dtype
  303. )
  304. result = arr.take([-1, 1], fill_value=fill_value, allow_fill=True)
  305. expected = arr.take([1, 1])
  306. tm.assert_extension_array_equal(result, expected)
  307. def test_take_pandas_style_negative_raises(self, data, na_value):
  308. with pytest.raises(ValueError, match=""):
  309. data.take([0, -2], fill_value=na_value, allow_fill=True)
  310. @pytest.mark.parametrize("allow_fill", [True, False])
  311. def test_take_out_of_bounds_raises(self, data, allow_fill):
  312. arr = data[:3]
  313. with pytest.raises(IndexError, match="out of bounds|out-of-bounds"):
  314. arr.take(np.asarray([0, 3]), allow_fill=allow_fill)
  315. def test_take_series(self, data):
  316. s = pd.Series(data)
  317. result = s.take([0, -1])
  318. expected = pd.Series(
  319. data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype),
  320. index=[0, len(data) - 1],
  321. )
  322. tm.assert_series_equal(result, expected)
  323. def test_reindex(self, data, na_value):
  324. s = pd.Series(data)
  325. result = s.reindex([0, 1, 3])
  326. expected = pd.Series(data.take([0, 1, 3]), index=[0, 1, 3])
  327. tm.assert_series_equal(result, expected)
  328. n = len(data)
  329. result = s.reindex([-1, 0, n])
  330. expected = pd.Series(
  331. data._from_sequence([na_value, data[0], na_value], dtype=s.dtype),
  332. index=[-1, 0, n],
  333. )
  334. tm.assert_series_equal(result, expected)
  335. result = s.reindex([n, n + 1])
  336. expected = pd.Series(
  337. data._from_sequence([na_value, na_value], dtype=s.dtype), index=[n, n + 1]
  338. )
  339. tm.assert_series_equal(result, expected)
  340. def test_reindex_non_na_fill_value(self, data_missing):
  341. valid = data_missing[1]
  342. na = data_missing[0]
  343. arr = data_missing._from_sequence([na, valid], dtype=data_missing.dtype)
  344. ser = pd.Series(arr)
  345. result = ser.reindex([0, 1, 2], fill_value=valid)
  346. expected = pd.Series(
  347. data_missing._from_sequence([na, valid, valid], dtype=data_missing.dtype)
  348. )
  349. tm.assert_series_equal(result, expected)
  350. def test_loc_len1(self, data):
  351. # see GH-27785 take_nd with indexer of len 1 resulting in wrong ndim
  352. df = pd.DataFrame({"A": data})
  353. res = df.loc[[0], "A"]
  354. assert res.ndim == 1
  355. assert res._mgr.arrays[0].ndim == 1
  356. if hasattr(res._mgr, "blocks"):
  357. assert res._mgr._block.ndim == 1
  358. def test_item(self, data):
  359. # https://github.com/pandas-dev/pandas/pull/30175
  360. s = pd.Series(data)
  361. result = s[:1].item()
  362. assert result == data[0]
  363. msg = "can only convert an array of size 1 to a Python scalar"
  364. with pytest.raises(ValueError, match=msg):
  365. s[:0].item()
  366. with pytest.raises(ValueError, match=msg):
  367. s.item()