# test_indexing.py (39 KB)
  1. """ test fancy indexing & misc """
  2. import array
  3. from datetime import datetime
  4. import re
  5. import weakref
  6. import numpy as np
  7. import pytest
  8. from pandas.errors import IndexingError
  9. from pandas.core.dtypes.common import (
  10. is_float_dtype,
  11. is_integer_dtype,
  12. is_object_dtype,
  13. )
  14. import pandas as pd
  15. from pandas import (
  16. DataFrame,
  17. Index,
  18. NaT,
  19. Series,
  20. date_range,
  21. offsets,
  22. timedelta_range,
  23. )
  24. import pandas._testing as tm
  25. from pandas.tests.indexing.common import _mklbl
  26. from pandas.tests.indexing.test_floats import gen_obj
  27. # ------------------------------------------------------------------------
  28. # Indexing test cases
  29. class TestFancy:
  30. """pure get/set item & fancy indexing"""
  31. def test_setitem_ndarray_1d(self):
  32. # GH5508
  33. # len of indexer vs length of the 1d ndarray
  34. df = DataFrame(index=Index(np.arange(1, 11), dtype=np.int64))
  35. df["foo"] = np.zeros(10, dtype=np.float64)
  36. df["bar"] = np.zeros(10, dtype=complex)
  37. # invalid
  38. msg = "Must have equal len keys and value when setting with an iterable"
  39. with pytest.raises(ValueError, match=msg):
  40. df.loc[df.index[2:5], "bar"] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0])
  41. # valid
  42. df.loc[df.index[2:6], "bar"] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0])
  43. result = df.loc[df.index[2:6], "bar"]
  44. expected = Series(
  45. [2.33j, 1.23 + 0.1j, 2.2, 1.0], index=[3, 4, 5, 6], name="bar"
  46. )
  47. tm.assert_series_equal(result, expected)
  48. def test_setitem_ndarray_1d_2(self):
  49. # GH5508
  50. # dtype getting changed?
  51. df = DataFrame(index=Index(np.arange(1, 11)))
  52. df["foo"] = np.zeros(10, dtype=np.float64)
  53. df["bar"] = np.zeros(10, dtype=complex)
  54. msg = "Must have equal len keys and value when setting with an iterable"
  55. with pytest.raises(ValueError, match=msg):
  56. df[2:5] = np.arange(1, 4) * 1j
  57. @pytest.mark.filterwarnings(
  58. "ignore:Series.__getitem__ treating keys as positions is deprecated:"
  59. "FutureWarning"
  60. )
  61. def test_getitem_ndarray_3d(
  62. self, index, frame_or_series, indexer_sli, using_array_manager
  63. ):
  64. # GH 25567
  65. obj = gen_obj(frame_or_series, index)
  66. idxr = indexer_sli(obj)
  67. nd3 = np.random.default_rng(2).integers(5, size=(2, 2, 2))
  68. msgs = []
  69. if frame_or_series is Series and indexer_sli in [tm.setitem, tm.iloc]:
  70. msgs.append(r"Wrong number of dimensions. values.ndim > ndim \[3 > 1\]")
  71. if using_array_manager:
  72. msgs.append("Passed array should be 1-dimensional")
  73. if frame_or_series is Series or indexer_sli is tm.iloc:
  74. msgs.append(r"Buffer has wrong number of dimensions \(expected 1, got 3\)")
  75. if using_array_manager:
  76. msgs.append("indexer should be 1-dimensional")
  77. if indexer_sli is tm.loc or (
  78. frame_or_series is Series and indexer_sli is tm.setitem
  79. ):
  80. msgs.append("Cannot index with multidimensional key")
  81. if frame_or_series is DataFrame and indexer_sli is tm.setitem:
  82. msgs.append("Index data must be 1-dimensional")
  83. if isinstance(index, pd.IntervalIndex) and indexer_sli is tm.iloc:
  84. msgs.append("Index data must be 1-dimensional")
  85. if isinstance(index, (pd.TimedeltaIndex, pd.DatetimeIndex, pd.PeriodIndex)):
  86. msgs.append("Data must be 1-dimensional")
  87. if len(index) == 0 or isinstance(index, pd.MultiIndex):
  88. msgs.append("positional indexers are out-of-bounds")
  89. if type(index) is Index and not isinstance(index._values, np.ndarray):
  90. # e.g. Int64
  91. msgs.append("values must be a 1D array")
  92. # string[pyarrow]
  93. msgs.append("only handle 1-dimensional arrays")
  94. msg = "|".join(msgs)
  95. potential_errors = (IndexError, ValueError, NotImplementedError)
  96. with pytest.raises(potential_errors, match=msg):
  97. idxr[nd3]
  98. @pytest.mark.filterwarnings(
  99. "ignore:Series.__setitem__ treating keys as positions is deprecated:"
  100. "FutureWarning"
  101. )
  102. def test_setitem_ndarray_3d(self, index, frame_or_series, indexer_sli):
  103. # GH 25567
  104. obj = gen_obj(frame_or_series, index)
  105. idxr = indexer_sli(obj)
  106. nd3 = np.random.default_rng(2).integers(5, size=(2, 2, 2))
  107. if indexer_sli is tm.iloc:
  108. err = ValueError
  109. msg = f"Cannot set values with ndim > {obj.ndim}"
  110. else:
  111. err = ValueError
  112. msg = "|".join(
  113. [
  114. r"Buffer has wrong number of dimensions \(expected 1, got 3\)",
  115. "Cannot set values with ndim > 1",
  116. "Index data must be 1-dimensional",
  117. "Data must be 1-dimensional",
  118. "Array conditional must be same shape as self",
  119. ]
  120. )
  121. with pytest.raises(err, match=msg):
  122. idxr[nd3] = 0
  123. def test_getitem_ndarray_0d(self):
  124. # GH#24924
  125. key = np.array(0)
  126. # dataframe __getitem__
  127. df = DataFrame([[1, 2], [3, 4]])
  128. result = df[key]
  129. expected = Series([1, 3], name=0)
  130. tm.assert_series_equal(result, expected)
  131. # series __getitem__
  132. ser = Series([1, 2])
  133. result = ser[key]
  134. assert result == 1
  135. def test_inf_upcast(self):
  136. # GH 16957
  137. # We should be able to use np.inf as a key
  138. # np.inf should cause an index to convert to float
  139. # Test with np.inf in rows
  140. df = DataFrame(columns=[0])
  141. df.loc[1] = 1
  142. df.loc[2] = 2
  143. df.loc[np.inf] = 3
  144. # make sure we can look up the value
  145. assert df.loc[np.inf, 0] == 3
  146. result = df.index
  147. expected = Index([1, 2, np.inf], dtype=np.float64)
  148. tm.assert_index_equal(result, expected)
  149. def test_setitem_dtype_upcast(self):
  150. # GH3216
  151. df = DataFrame([{"a": 1}, {"a": 3, "b": 2}])
  152. df["c"] = np.nan
  153. assert df["c"].dtype == np.float64
  154. with tm.assert_produces_warning(
  155. FutureWarning, match="item of incompatible dtype"
  156. ):
  157. df.loc[0, "c"] = "foo"
  158. expected = DataFrame(
  159. {"a": [1, 3], "b": [np.nan, 2], "c": Series(["foo", np.nan], dtype=object)}
  160. )
  161. tm.assert_frame_equal(df, expected)
  162. @pytest.mark.parametrize("val", [3.14, "wxyz"])
  163. def test_setitem_dtype_upcast2(self, val):
  164. # GH10280
  165. df = DataFrame(
  166. np.arange(6, dtype="int64").reshape(2, 3),
  167. index=list("ab"),
  168. columns=["foo", "bar", "baz"],
  169. )
  170. left = df.copy()
  171. with tm.assert_produces_warning(
  172. FutureWarning, match="item of incompatible dtype"
  173. ):
  174. left.loc["a", "bar"] = val
  175. right = DataFrame(
  176. [[0, val, 2], [3, 4, 5]],
  177. index=list("ab"),
  178. columns=["foo", "bar", "baz"],
  179. )
  180. tm.assert_frame_equal(left, right)
  181. assert is_integer_dtype(left["foo"])
  182. assert is_integer_dtype(left["baz"])
  183. def test_setitem_dtype_upcast3(self):
  184. left = DataFrame(
  185. np.arange(6, dtype="int64").reshape(2, 3) / 10.0,
  186. index=list("ab"),
  187. columns=["foo", "bar", "baz"],
  188. )
  189. with tm.assert_produces_warning(
  190. FutureWarning, match="item of incompatible dtype"
  191. ):
  192. left.loc["a", "bar"] = "wxyz"
  193. right = DataFrame(
  194. [[0, "wxyz", 0.2], [0.3, 0.4, 0.5]],
  195. index=list("ab"),
  196. columns=["foo", "bar", "baz"],
  197. )
  198. tm.assert_frame_equal(left, right)
  199. assert is_float_dtype(left["foo"])
  200. assert is_float_dtype(left["baz"])
  201. def test_dups_fancy_indexing(self):
  202. # GH 3455
  203. df = DataFrame(np.eye(3), columns=["a", "a", "b"])
  204. result = df[["b", "a"]].columns
  205. expected = Index(["b", "a", "a"])
  206. tm.assert_index_equal(result, expected)
  207. def test_dups_fancy_indexing_across_dtypes(self):
  208. # across dtypes
  209. df = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("aaaaaaa"))
  210. result = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]])
  211. result.columns = list("aaaaaaa") # GH#3468
  212. # GH#3509 smoke tests for indexing with duplicate columns
  213. df.iloc[:, 4]
  214. result.iloc[:, 4]
  215. tm.assert_frame_equal(df, result)
  216. def test_dups_fancy_indexing_not_in_order(self):
  217. # GH 3561, dups not in selected order
  218. df = DataFrame(
  219. {"test": [5, 7, 9, 11], "test1": [4.0, 5, 6, 7], "other": list("abcd")},
  220. index=["A", "A", "B", "C"],
  221. )
  222. rows = ["C", "B"]
  223. expected = DataFrame(
  224. {"test": [11, 9], "test1": [7.0, 6], "other": ["d", "c"]}, index=rows
  225. )
  226. result = df.loc[rows]
  227. tm.assert_frame_equal(result, expected)
  228. result = df.loc[Index(rows)]
  229. tm.assert_frame_equal(result, expected)
  230. rows = ["C", "B", "E"]
  231. with pytest.raises(KeyError, match="not in index"):
  232. df.loc[rows]
  233. # see GH5553, make sure we use the right indexer
  234. rows = ["F", "G", "H", "C", "B", "E"]
  235. with pytest.raises(KeyError, match="not in index"):
  236. df.loc[rows]
  237. def test_dups_fancy_indexing_only_missing_label(self, using_infer_string):
  238. # List containing only missing label
  239. dfnu = DataFrame(
  240. np.random.default_rng(2).standard_normal((5, 3)), index=list("AABCD")
  241. )
  242. if using_infer_string:
  243. with pytest.raises(
  244. KeyError,
  245. match=re.escape(
  246. "\"None of [Index(['E'], dtype='str')] are in the [index]\""
  247. ),
  248. ):
  249. dfnu.loc[["E"]]
  250. else:
  251. with pytest.raises(
  252. KeyError,
  253. match=re.escape(
  254. "\"None of [Index(['E'], dtype='object')] are in the [index]\""
  255. ),
  256. ):
  257. dfnu.loc[["E"]]
  258. @pytest.mark.parametrize("vals", [[0, 1, 2], list("abc")])
  259. def test_dups_fancy_indexing_missing_label(self, vals):
  260. # GH 4619; duplicate indexer with missing label
  261. df = DataFrame({"A": vals})
  262. with pytest.raises(KeyError, match="not in index"):
  263. df.loc[[0, 8, 0]]
  264. def test_dups_fancy_indexing_non_unique(self):
  265. # non unique with non unique selector
  266. df = DataFrame({"test": [5, 7, 9, 11]}, index=["A", "A", "B", "C"])
  267. with pytest.raises(KeyError, match="not in index"):
  268. df.loc[["A", "A", "E"]]
  269. def test_dups_fancy_indexing2(self):
  270. # GH 5835
  271. # dups on index and missing values
  272. df = DataFrame(
  273. np.random.default_rng(2).standard_normal((5, 5)),
  274. columns=["A", "B", "B", "B", "A"],
  275. )
  276. with pytest.raises(KeyError, match="not in index"):
  277. df.loc[:, ["A", "B", "C"]]
  278. def test_dups_fancy_indexing3(self):
  279. # GH 6504, multi-axis indexing
  280. df = DataFrame(
  281. np.random.default_rng(2).standard_normal((9, 2)),
  282. index=[1, 1, 1, 2, 2, 2, 3, 3, 3],
  283. columns=["a", "b"],
  284. )
  285. expected = df.iloc[0:6]
  286. result = df.loc[[1, 2]]
  287. tm.assert_frame_equal(result, expected)
  288. expected = df
  289. result = df.loc[:, ["a", "b"]]
  290. tm.assert_frame_equal(result, expected)
  291. expected = df.iloc[0:6, :]
  292. result = df.loc[[1, 2], ["a", "b"]]
  293. tm.assert_frame_equal(result, expected)
  294. def test_duplicate_int_indexing(self, indexer_sl):
  295. # GH 17347
  296. ser = Series(range(3), index=[1, 1, 3])
  297. expected = Series(range(2), index=[1, 1])
  298. result = indexer_sl(ser)[[1]]
  299. tm.assert_series_equal(result, expected)
  300. def test_indexing_mixed_frame_bug(self):
  301. # GH3492
  302. df = DataFrame(
  303. {"a": {1: "aaa", 2: "bbb", 3: "ccc"}, "b": {1: 111, 2: 222, 3: 333}}
  304. )
  305. # this works, new column is created correctly
  306. df["test"] = df["a"].apply(lambda x: "_" if x == "aaa" else x)
  307. # this does not work, ie column test is not changed
  308. idx = df["test"] == "_"
  309. temp = df.loc[idx, "a"].apply(lambda x: "-----" if x == "aaa" else x)
  310. df.loc[idx, "test"] = temp
  311. assert df.iloc[0, 2] == "-----"
  312. def test_multitype_list_index_access(self):
  313. # GH 10610
  314. df = DataFrame(
  315. np.random.default_rng(2).random((10, 5)), columns=["a"] + [20, 21, 22, 23]
  316. )
  317. with pytest.raises(KeyError, match=re.escape("'[26, -8] not in index'")):
  318. df[[22, 26, -8]]
  319. assert df[21].shape[0] == df.shape[0]
  320. def test_set_index_nan(self):
  321. # GH 3586
  322. df = DataFrame(
  323. {
  324. "PRuid": {
  325. 17: "nonQC",
  326. 18: "nonQC",
  327. 19: "nonQC",
  328. 20: "10",
  329. 21: "11",
  330. 22: "12",
  331. 23: "13",
  332. 24: "24",
  333. 25: "35",
  334. 26: "46",
  335. 27: "47",
  336. 28: "48",
  337. 29: "59",
  338. 30: "10",
  339. },
  340. "QC": {
  341. 17: 0.0,
  342. 18: 0.0,
  343. 19: 0.0,
  344. 20: np.nan,
  345. 21: np.nan,
  346. 22: np.nan,
  347. 23: np.nan,
  348. 24: 1.0,
  349. 25: np.nan,
  350. 26: np.nan,
  351. 27: np.nan,
  352. 28: np.nan,
  353. 29: np.nan,
  354. 30: np.nan,
  355. },
  356. "data": {
  357. 17: 7.9544899999999998,
  358. 18: 8.0142609999999994,
  359. 19: 7.8591520000000008,
  360. 20: 0.86140349999999999,
  361. 21: 0.87853110000000001,
  362. 22: 0.8427041999999999,
  363. 23: 0.78587700000000005,
  364. 24: 0.73062459999999996,
  365. 25: 0.81668560000000001,
  366. 26: 0.81927080000000008,
  367. 27: 0.80705009999999999,
  368. 28: 0.81440240000000008,
  369. 29: 0.80140849999999997,
  370. 30: 0.81307740000000006,
  371. },
  372. "year": {
  373. 17: 2006,
  374. 18: 2007,
  375. 19: 2008,
  376. 20: 1985,
  377. 21: 1985,
  378. 22: 1985,
  379. 23: 1985,
  380. 24: 1985,
  381. 25: 1985,
  382. 26: 1985,
  383. 27: 1985,
  384. 28: 1985,
  385. 29: 1985,
  386. 30: 1986,
  387. },
  388. }
  389. ).reset_index()
  390. result = (
  391. df.set_index(["year", "PRuid", "QC"])
  392. .reset_index()
  393. .reindex(columns=df.columns)
  394. )
  395. tm.assert_frame_equal(result, df)
  396. def test_multi_assign(self):
  397. # GH 3626, an assignment of a sub-df to a df
  398. # set float64 to avoid upcast when setting nan
  399. df = DataFrame(
  400. {
  401. "FC": ["a", "b", "a", "b", "a", "b"],
  402. "PF": [0, 0, 0, 0, 1, 1],
  403. "col1": list(range(6)),
  404. "col2": list(range(6, 12)),
  405. }
  406. ).astype({"col2": "float64"})
  407. df.iloc[1, 0] = np.nan
  408. df2 = df.copy()
  409. mask = ~df2.FC.isna()
  410. cols = ["col1", "col2"]
  411. dft = df2 * 2
  412. dft.iloc[3, 3] = np.nan
  413. expected = DataFrame(
  414. {
  415. "FC": ["a", np.nan, "a", "b", "a", "b"],
  416. "PF": [0, 0, 0, 0, 1, 1],
  417. "col1": Series([0, 1, 4, 6, 8, 10]),
  418. "col2": [12, 7, 16, np.nan, 20, 22],
  419. }
  420. )
  421. # frame on rhs
  422. df2.loc[mask, cols] = dft.loc[mask, cols]
  423. tm.assert_frame_equal(df2, expected)
  424. # with an ndarray on rhs
  425. # coerces to float64 because values has float64 dtype
  426. # GH 14001
  427. expected = DataFrame(
  428. {
  429. "FC": ["a", np.nan, "a", "b", "a", "b"],
  430. "PF": [0, 0, 0, 0, 1, 1],
  431. "col1": [0, 1, 4, 6, 8, 10],
  432. "col2": [12, 7, 16, np.nan, 20, 22],
  433. }
  434. )
  435. df2 = df.copy()
  436. df2.loc[mask, cols] = dft.loc[mask, cols].values
  437. tm.assert_frame_equal(df2, expected)
  438. def test_multi_assign_broadcasting_rhs(self):
  439. # broadcasting on the rhs is required
  440. df = DataFrame(
  441. {
  442. "A": [1, 2, 0, 0, 0],
  443. "B": [0, 0, 0, 10, 11],
  444. "C": [0, 0, 0, 10, 11],
  445. "D": [3, 4, 5, 6, 7],
  446. }
  447. )
  448. expected = df.copy()
  449. mask = expected["A"] == 0
  450. for col in ["A", "B"]:
  451. expected.loc[mask, col] = df["D"]
  452. df.loc[df["A"] == 0, ["A", "B"]] = df["D"].copy()
  453. tm.assert_frame_equal(df, expected)
  454. def test_setitem_list(self):
  455. # GH 6043
  456. # iloc with a list
  457. df = DataFrame(index=[0, 1], columns=[0])
  458. df.iloc[1, 0] = [1, 2, 3]
  459. df.iloc[1, 0] = [1, 2]
  460. result = DataFrame(index=[0, 1], columns=[0])
  461. result.iloc[1, 0] = [1, 2]
  462. tm.assert_frame_equal(result, df)
  463. def test_string_slice(self):
  464. # GH 14424
  465. # string indexing against datetimelike with object
  466. # dtype should properly raises KeyError
  467. df = DataFrame([1], Index([pd.Timestamp("2011-01-01")], dtype=object))
  468. assert df.index._is_all_dates
  469. with pytest.raises(KeyError, match="'2011'"):
  470. df["2011"]
  471. with pytest.raises(KeyError, match="'2011'"):
  472. df.loc["2011", 0]
  473. def test_string_slice_empty(self):
  474. # GH 14424
  475. df = DataFrame()
  476. assert not df.index._is_all_dates
  477. with pytest.raises(KeyError, match="'2011'"):
  478. df["2011"]
  479. with pytest.raises(KeyError, match="^0$"):
  480. df.loc["2011", 0]
  481. def test_astype_assignment(self, using_infer_string):
  482. # GH4312 (iloc)
  483. df_orig = DataFrame(
  484. [["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
  485. )
  486. df_orig[list("ABCDG")] = df_orig[list("ABCDG")].astype(object)
  487. df = df_orig.copy()
  488. # with the enforcement of GH#45333 in 2.0, this setting is attempted inplace,
  489. # so object dtype is retained
  490. df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64)
  491. expected = DataFrame(
  492. [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
  493. )
  494. expected[list("CDG")] = expected[list("CDG")].astype(object)
  495. expected["A"] = expected["A"].astype(object)
  496. expected["B"] = expected["B"].astype(object)
  497. tm.assert_frame_equal(df, expected)
  498. # GH5702 (loc)
  499. df = df_orig.copy()
  500. df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64)
  501. expected = DataFrame(
  502. [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
  503. )
  504. expected[list("ABCDG")] = expected[list("ABCDG")].astype(object)
  505. tm.assert_frame_equal(df, expected)
  506. df = df_orig.copy()
  507. df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64)
  508. expected = DataFrame(
  509. [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
  510. )
  511. expected[list("ABCDG")] = expected[list("ABCDG")].astype(object)
  512. tm.assert_frame_equal(df, expected)
  513. def test_astype_assignment_full_replacements(self):
  514. # full replacements / no nans
  515. df = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]})
  516. # With the enforcement of GH#45333 in 2.0, this assignment occurs inplace,
  517. # so float64 is retained
  518. df.iloc[:, 0] = df["A"].astype(np.int64)
  519. expected = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]})
  520. tm.assert_frame_equal(df, expected)
  521. df = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]})
  522. df.loc[:, "A"] = df["A"].astype(np.int64)
  523. tm.assert_frame_equal(df, expected)
  524. @pytest.mark.parametrize("indexer", [tm.getitem, tm.loc])
  525. def test_index_type_coercion(self, indexer):
  526. # GH 11836
  527. # if we have an index type and set it with something that looks
  528. # to numpy like the same, but is actually, not
  529. # (e.g. setting with a float or string '0')
  530. # then we need to coerce to object
  531. # integer indexes
  532. for s in [Series(range(5)), Series(range(5), index=range(1, 6))]:
  533. assert is_integer_dtype(s.index)
  534. s2 = s.copy()
  535. indexer(s2)[0.1] = 0
  536. assert is_float_dtype(s2.index)
  537. assert indexer(s2)[0.1] == 0
  538. s2 = s.copy()
  539. indexer(s2)[0.0] = 0
  540. exp = s.index
  541. if 0 not in s:
  542. exp = Index(s.index.tolist() + [0])
  543. tm.assert_index_equal(s2.index, exp)
  544. s2 = s.copy()
  545. indexer(s2)["0"] = 0
  546. assert is_object_dtype(s2.index)
  547. for s in [Series(range(5), index=np.arange(5.0))]:
  548. assert is_float_dtype(s.index)
  549. s2 = s.copy()
  550. indexer(s2)[0.1] = 0
  551. assert is_float_dtype(s2.index)
  552. assert indexer(s2)[0.1] == 0
  553. s2 = s.copy()
  554. indexer(s2)[0.0] = 0
  555. tm.assert_index_equal(s2.index, s.index)
  556. s2 = s.copy()
  557. indexer(s2)["0"] = 0
  558. assert is_object_dtype(s2.index)
  559. class TestMisc:
  560. def test_float_index_to_mixed(self):
  561. df = DataFrame(
  562. {
  563. 0.0: np.random.default_rng(2).random(10),
  564. 1.0: np.random.default_rng(2).random(10),
  565. }
  566. )
  567. df["a"] = 10
  568. expected = DataFrame({0.0: df[0.0], 1.0: df[1.0], "a": [10] * 10})
  569. tm.assert_frame_equal(expected, df)
  570. def test_float_index_non_scalar_assignment(self):
  571. df = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}, index=[1.0, 2.0, 3.0])
  572. df.loc[df.index[:2]] = 1
  573. expected = DataFrame({"a": [1, 1, 3], "b": [1, 1, 5]}, index=df.index)
  574. tm.assert_frame_equal(expected, df)
  575. def test_loc_setitem_fullindex_views(self):
  576. df = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}, index=[1.0, 2.0, 3.0])
  577. df2 = df.copy()
  578. df.loc[df.index] = df.loc[df.index]
  579. tm.assert_frame_equal(df, df2)
  580. def test_rhs_alignment(self, using_infer_string):
  581. # GH8258, tests that both rows & columns are aligned to what is
  582. # assigned to. covers both uniform data-type & multi-type cases
  583. def run_tests(df, rhs, right_loc, right_iloc):
  584. # label, index, slice
  585. lbl_one, idx_one, slice_one = list("bcd"), [1, 2, 3], slice(1, 4)
  586. lbl_two, idx_two, slice_two = ["joe", "jolie"], [1, 2], slice(1, 3)
  587. left = df.copy()
  588. left.loc[lbl_one, lbl_two] = rhs
  589. tm.assert_frame_equal(left, right_loc)
  590. left = df.copy()
  591. left.iloc[idx_one, idx_two] = rhs
  592. tm.assert_frame_equal(left, right_iloc)
  593. left = df.copy()
  594. left.iloc[slice_one, slice_two] = rhs
  595. tm.assert_frame_equal(left, right_iloc)
  596. xs = np.arange(20).reshape(5, 4)
  597. cols = ["jim", "joe", "jolie", "joline"]
  598. df = DataFrame(xs, columns=cols, index=list("abcde"), dtype="int64")
  599. # right hand side; permute the indices and multiplpy by -2
  600. rhs = -2 * df.iloc[3:0:-1, 2:0:-1]
  601. # expected `right` result; just multiply by -2
  602. right_iloc = df.copy()
  603. right_iloc["joe"] = [1, 14, 10, 6, 17]
  604. right_iloc["jolie"] = [2, 13, 9, 5, 18]
  605. right_iloc.iloc[1:4, 1:3] *= -2
  606. right_loc = df.copy()
  607. right_loc.iloc[1:4, 1:3] *= -2
  608. # run tests with uniform dtypes
  609. run_tests(df, rhs, right_loc, right_iloc)
  610. # make frames multi-type & re-run tests
  611. for frame in [df, rhs, right_loc, right_iloc]:
  612. frame["joe"] = frame["joe"].astype("float64")
  613. frame["jolie"] = frame["jolie"].map(lambda x: f"@{x}")
  614. right_iloc["joe"] = [1.0, "@-28", "@-20", "@-12", 17.0]
  615. right_iloc["jolie"] = ["@2", -26.0, -18.0, -10.0, "@18"]
  616. if using_infer_string:
  617. with pytest.raises(TypeError, match="Invalid value"):
  618. with tm.assert_produces_warning(
  619. FutureWarning, match="incompatible dtype"
  620. ):
  621. run_tests(df, rhs, right_loc, right_iloc)
  622. else:
  623. with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
  624. run_tests(df, rhs, right_loc, right_iloc)
  625. @pytest.mark.parametrize(
  626. "idx", [_mklbl("A", 20), np.arange(20) + 100, np.linspace(100, 150, 20)]
  627. )
  628. def test_str_label_slicing_with_negative_step(self, idx):
  629. SLC = pd.IndexSlice
  630. idx = Index(idx)
  631. ser = Series(np.arange(20), index=idx)
  632. tm.assert_indexing_slices_equivalent(ser, SLC[idx[9] :: -1], SLC[9::-1])
  633. tm.assert_indexing_slices_equivalent(ser, SLC[: idx[9] : -1], SLC[:8:-1])
  634. tm.assert_indexing_slices_equivalent(
  635. ser, SLC[idx[13] : idx[9] : -1], SLC[13:8:-1]
  636. )
  637. tm.assert_indexing_slices_equivalent(ser, SLC[idx[9] : idx[13] : -1], SLC[:0])
  638. def test_slice_with_zero_step_raises(self, index, indexer_sl, frame_or_series):
  639. obj = frame_or_series(np.arange(len(index)), index=index)
  640. with pytest.raises(ValueError, match="slice step cannot be zero"):
  641. indexer_sl(obj)[::0]
  642. def test_loc_setitem_indexing_assignment_dict_already_exists(self):
  643. index = Index([-5, 0, 5], name="z")
  644. df = DataFrame({"x": [1, 2, 6], "y": [2, 2, 8]}, index=index)
  645. expected = df.copy()
  646. rhs = {"x": 9, "y": 99}
  647. df.loc[5] = rhs
  648. expected.loc[5] = [9, 99]
  649. tm.assert_frame_equal(df, expected)
  650. # GH#38335 same thing, mixed dtypes
  651. df = DataFrame({"x": [1, 2, 6], "y": [2.0, 2.0, 8.0]}, index=index)
  652. df.loc[5] = rhs
  653. expected = DataFrame({"x": [1, 2, 9], "y": [2.0, 2.0, 99.0]}, index=index)
  654. tm.assert_frame_equal(df, expected)
  655. def test_iloc_getitem_indexing_dtypes_on_empty(self):
  656. # Check that .iloc returns correct dtypes GH9983
  657. df = DataFrame({"a": [1, 2, 3], "b": ["b", "b2", "b3"]})
  658. df2 = df.iloc[[], :]
  659. assert df2.loc[:, "a"].dtype == np.int64
  660. tm.assert_series_equal(df2.loc[:, "a"], df2.iloc[:, 0])
  661. @pytest.mark.parametrize("size", [5, 999999, 1000000])
  662. def test_loc_range_in_series_indexing(self, size):
  663. # range can cause an indexing error
  664. # GH 11652
  665. s = Series(index=range(size), dtype=np.float64)
  666. s.loc[range(1)] = 42
  667. tm.assert_series_equal(s.loc[range(1)], Series(42.0, index=[0]))
  668. s.loc[range(2)] = 43
  669. tm.assert_series_equal(s.loc[range(2)], Series(43.0, index=[0, 1]))
  670. def test_partial_boolean_frame_indexing(self):
  671. # GH 17170
  672. df = DataFrame(
  673. np.arange(9.0).reshape(3, 3), index=list("abc"), columns=list("ABC")
  674. )
  675. index_df = DataFrame(1, index=list("ab"), columns=list("AB"))
  676. result = df[index_df.notnull()]
  677. expected = DataFrame(
  678. np.array([[0.0, 1.0, np.nan], [3.0, 4.0, np.nan], [np.nan] * 3]),
  679. index=list("abc"),
  680. columns=list("ABC"),
  681. )
  682. tm.assert_frame_equal(result, expected)
  683. def test_no_reference_cycle(self):
  684. df = DataFrame({"a": [0, 1], "b": [2, 3]})
  685. for name in ("loc", "iloc", "at", "iat"):
  686. getattr(df, name)
  687. wr = weakref.ref(df)
  688. del df
  689. assert wr() is None
  690. def test_label_indexing_on_nan(self, nulls_fixture):
  691. # GH 32431
  692. df = Series([1, "{1,2}", 1, nulls_fixture])
  693. vc = df.value_counts(dropna=False)
  694. result1 = vc.loc[nulls_fixture]
  695. result2 = vc[nulls_fixture]
  696. expected = 1
  697. assert result1 == expected
  698. assert result2 == expected
  699. class TestDataframeNoneCoercion:
  700. EXPECTED_SINGLE_ROW_RESULTS = [
  701. # For numeric series, we should coerce to NaN.
  702. ([1, 2, 3], [np.nan, 2, 3], FutureWarning),
  703. ([1.0, 2.0, 3.0], [np.nan, 2.0, 3.0], None),
  704. # For datetime series, we should coerce to NaT.
  705. (
  706. [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
  707. [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)],
  708. None,
  709. ),
  710. # For objects, we should preserve the None value.
  711. (["foo", "bar", "baz"], [None, "bar", "baz"], None),
  712. ]
  713. @pytest.mark.parametrize("expected", EXPECTED_SINGLE_ROW_RESULTS)
  714. def test_coercion_with_loc(self, expected):
  715. start_data, expected_result, warn = expected
  716. start_dataframe = DataFrame({"foo": start_data})
  717. start_dataframe.loc[0, ["foo"]] = None
  718. expected_dataframe = DataFrame({"foo": expected_result})
  719. tm.assert_frame_equal(start_dataframe, expected_dataframe)
  720. @pytest.mark.parametrize("expected", EXPECTED_SINGLE_ROW_RESULTS)
  721. def test_coercion_with_setitem_and_dataframe(self, expected):
  722. start_data, expected_result, warn = expected
  723. start_dataframe = DataFrame({"foo": start_data})
  724. start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None
  725. expected_dataframe = DataFrame({"foo": expected_result})
  726. tm.assert_frame_equal(start_dataframe, expected_dataframe)
  727. @pytest.mark.parametrize("expected", EXPECTED_SINGLE_ROW_RESULTS)
  728. def test_none_coercion_loc_and_dataframe(self, expected):
  729. start_data, expected_result, warn = expected
  730. start_dataframe = DataFrame({"foo": start_data})
  731. start_dataframe.loc[start_dataframe["foo"] == start_dataframe["foo"][0]] = None
  732. expected_dataframe = DataFrame({"foo": expected_result})
  733. tm.assert_frame_equal(start_dataframe, expected_dataframe)
  734. def test_none_coercion_mixed_dtypes(self):
  735. start_dataframe = DataFrame(
  736. {
  737. "a": [1, 2, 3],
  738. "b": [1.0, 2.0, 3.0],
  739. "c": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
  740. "d": ["a", "b", "c"],
  741. }
  742. )
  743. start_dataframe.iloc[0] = None
  744. exp = DataFrame(
  745. {
  746. "a": [np.nan, 2, 3],
  747. "b": [np.nan, 2.0, 3.0],
  748. "c": [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)],
  749. "d": [None, "b", "c"],
  750. }
  751. )
  752. tm.assert_frame_equal(start_dataframe, exp)
  class TestDatetimelikeCoercion:
      # Setting string / offset values into datetime64 and timedelta64 Series.
      # Every test first calls the backing array's _validate_setitem_value on
      # the new value(s), then assigns through the indexer and asserts that
      # the data the Series started with is still the data it holds.

      def test_setitem_dt64_string_scalar(self, tz_naive_fixture, indexer_sli):
          # dispatching _can_hold_element to underlying DatetimeArray
          tz = tz_naive_fixture

          index = date_range("2016-01-01", periods=3, tz=tz)
          ser = Series(index.copy(deep=True))
          backing = ser._values

          as_string = "2018-01-01"
          backing._validate_setitem_value(as_string)
          indexer_sli(ser)[0] = as_string

          if tz is not None:
              assert ser._values is backing
          else:
              # TODO(EA2D): we can make this no-copy in tz-naive case too
              assert ser.dtype == index.dtype
              assert ser._values._ndarray is backing._ndarray

      @pytest.mark.parametrize("box", [list, np.array, pd.array, pd.Categorical, Index])
      @pytest.mark.parametrize(
          "key", [[0, 1], slice(0, 2), np.array([True, True, False])]
      )
      def test_setitem_dt64_string_values(self, tz_naive_fixture, indexer_sli, key, box):
          # dispatching _can_hold_element to underlying DatetimeArray
          tz = tz_naive_fixture

          using_loc = indexer_sli is tm.loc
          if using_loc and isinstance(key, slice):
              # loc gets slice(0, 1) in place of slice(0, 2)
              # (presumably because loc slices include the end label - confirm)
              key = slice(0, 1)

          index = date_range("2016-01-01", periods=3, tz=tz)
          ser = Series(index.copy(deep=True))
          backing = ser._values

          as_strings = box(["2019-01-01", "2010-01-02"])
          backing._validate_setitem_value(as_strings)
          indexer_sli(ser)[key] = as_strings

          if tz is not None:
              assert ser._values is backing
          else:
              # TODO(EA2D): we can make this no-copy in tz-naive case too
              assert ser.dtype == index.dtype
              assert ser._values._ndarray is backing._ndarray

      @pytest.mark.parametrize("scalar", ["3 Days", offsets.Hour(4)])
      def test_setitem_td64_scalar(self, indexer_sli, scalar):
          # dispatching _can_hold_element to underlying TimedeltaArray
          index = timedelta_range("1 Day", periods=3)
          ser = Series(index.copy(deep=True))
          backing = ser._values

          backing._validate_setitem_value(scalar)
          indexer_sli(ser)[0] = scalar

          # the Series must still wrap the very same ndarray
          assert ser._values._ndarray is backing._ndarray

      @pytest.mark.parametrize("box", [list, np.array, pd.array, pd.Categorical, Index])
      @pytest.mark.parametrize(
          "key", [[0, 1], slice(0, 2), np.array([True, True, False])]
      )
      def test_setitem_td64_string_values(self, indexer_sli, key, box):
          # dispatching _can_hold_element to underlying TimedeltaArray
          using_loc = indexer_sli is tm.loc
          if using_loc and isinstance(key, slice):
              # loc gets slice(0, 1) in place of slice(0, 2)
              # (presumably because loc slices include the end label - confirm)
              key = slice(0, 1)

          index = timedelta_range("1 Day", periods=3)
          ser = Series(index.copy(deep=True))
          backing = ser._values

          as_strings = box(["10 Days", "44 hours"])
          backing._validate_setitem_value(as_strings)
          indexer_sli(ser)[key] = as_strings

          # the Series must still wrap the very same ndarray
          assert ser._values._ndarray is backing._ndarray
  def test_extension_array_cross_section():
      # A cross-section of a homogeneous EA should be an EA
      columns = {
          "A": pd.array([1, 2], dtype="Int64"),
          "B": pd.array([3, 4], dtype="Int64"),
      }
      df = DataFrame(columns, index=["a", "b"])

      first_row = Series(pd.array([1, 3], dtype="Int64"), index=["A", "B"], name="a")

      # label-based and positional row access must both give the Int64 row
      tm.assert_series_equal(df.loc["a"], first_row)
      tm.assert_series_equal(df.iloc[0], first_row)
  def test_extension_array_cross_section_converts():
      row_labels = ["a", "b"]

      # all numeric columns -> numeric series
      numeric = DataFrame(
          {
              "A": pd.array([1, 2], dtype="Int64"),
              "B": np.array([1, 2], dtype="int64"),
          },
          index=row_labels,
      )
      numeric_row = Series([1, 1], dtype="Int64", index=["A", "B"], name="a")
      tm.assert_series_equal(numeric.loc["a"], numeric_row)
      tm.assert_series_equal(numeric.iloc[0], numeric_row)

      # mixed columns -> object series
      mixed = DataFrame(
          {"A": pd.array([1, 2], dtype="Int64"), "B": np.array(["a", "b"])},
          index=row_labels,
      )
      mixed_row = Series([1, "a"], dtype=object, index=["A", "B"], name="a")
      tm.assert_series_equal(mixed.loc["a"], mixed_row)
      tm.assert_series_equal(mixed.iloc[0], mixed_row)
  @pytest.mark.parametrize(
      "ser, keys",
      [(Series([10]), (0, 0)), (Series([1, 2, 3], index=list("abc")), (0, 1))],
  )
  def test_ser_tup_indexer_exceeds_dimensions(ser, keys, indexer_li):
      # GH#13831
      # A 2-tuple key has more entries than a Series has dimensions, so both
      # reading and writing with it must raise.
      exp_err, exp_msg = IndexingError, "Too many indexers"
      with pytest.raises(exp_err, match=exp_msg):
          indexer_li(ser)[keys]

      # Compare the indexer helper by identity, as the other tests in this
      # file do (``indexer_sli is tm.loc``).
      if indexer_li is tm.iloc:
          # For iloc.__setitem__ we let numpy handle the error reporting.
          exp_err, exp_msg = IndexError, "too many indices for array"

      with pytest.raises(exp_err, match=exp_msg):
          indexer_li(ser)[keys] = 0
  866. def test_ser_list_indexer_exceeds_dimensions(indexer_li):
  867. # GH#13831
  868. # Make sure an exception is raised when a tuple exceeds the dimension of the series,
  869. # but not list when a list is used.
  870. ser = Series([10])
  871. res = indexer_li(ser)[[0, 0]]
  872. exp = Series([10, 10], index=Index([0, 0]))
  873. tm.assert_series_equal(res, exp)
  @pytest.mark.parametrize(
      "value", [(0, 1), [0, 1], np.array([0, 1]), array.array("b", [0, 1])]
  )
  def test_scalar_setitem_with_nested_value(value):
      # For numeric data, we try to unpack and thus raise for mismatching length
      length_mismatch = "Must have equal len keys and value"
      sequence_in_element = "setting an array element with a sequence"

      numeric = DataFrame({"A": [1, 2, 3]})
      # either message is accepted for the numeric frame
      with pytest.raises(
          ValueError, match="|".join((length_mismatch, sequence_in_element))
      ):
          numeric.loc[0, "B"] = value

      # TODO For object dtype this happens as well, but should we rather preserve
      # the nested data and set as such? If so, the check here would become
      # ``df.loc[0, "B"] == value`` (reduced with ``.all()`` for an ndarray
      # value) instead of expecting a ValueError.
      with_object = DataFrame(
          {"A": [1, 2, 3], "B": np.array([1, "a", "b"], dtype=object)}
      )
      with pytest.raises(ValueError, match="Must have equal len keys and value"):
          with_object.loc[0, "B"] = value
  @pytest.mark.parametrize(
      "value", [(0, 1), [0, 1], np.array([0, 1]), array.array("b", [0, 1])]
  )
  def test_scalar_setitem_series_with_nested_value(value, indexer_sli):
      # For numeric data, we try to unpack and thus raise for mismatching length
      numeric = Series([1, 2, 3])
      with pytest.raises(ValueError, match="setting an array element with a sequence"):
          indexer_sli(numeric)[0] = value

      # but for object dtype we preserve the nested data and set as such
      boxed = Series([1, "a", "b"], dtype=object)
      indexer_sli(boxed)[0] = value

      # an ndarray compares elementwise, so reduce it; other containers give a bool
      same = boxed.loc[0] == value
      assert (same.all() if isinstance(value, np.ndarray) else same)
  @pytest.mark.parametrize(
      "value", [(0.0,), [0.0], np.array([0.0]), array.array("d", [0.0])]
  )
  def test_scalar_setitem_with_nested_value_length1(value):
      # https://github.com/pandas-dev/pandas/issues/46268
      # For numeric data, assigning length-1 array to scalar position gets unpacked
      numeric = DataFrame({"A": [1, 2, 3]})
      numeric.loc[0, "B"] = value
      tm.assert_frame_equal(
          numeric, DataFrame({"A": [1, 2, 3], "B": [0.0, np.nan, np.nan]})
      )

      # but for object dtype we preserve the nested data
      with_object = DataFrame(
          {"A": [1, 2, 3], "B": np.array([1, "a", "b"], dtype=object)}
      )
      with_object.loc[0, "B"] = value

      # an ndarray compares elementwise, so reduce it; other containers give a bool
      same = with_object.loc[0, "B"] == value
      assert (same.all() if isinstance(value, np.ndarray) else same)
  @pytest.mark.parametrize(
      "value", [(0.0,), [0.0], np.array([0.0]), array.array("d", [0.0])]
  )
  def test_scalar_setitem_series_with_nested_value_length1(value, indexer_sli):
      # For numeric data, assigning length-1 array to scalar position gets unpacked
      # TODO this only happens in case of ndarray, should we make this consistent
      # for all list-likes? (as happens for DataFrame.(i)loc, see test above)
      is_ndarray = isinstance(value, np.ndarray)

      numeric = Series([1.0, 2.0, 3.0])
      if not is_ndarray:
          with pytest.raises(
              ValueError, match="setting an array element with a sequence"
          ):
              indexer_sli(numeric)[0] = value
      else:
          indexer_sli(numeric)[0] = value
          tm.assert_series_equal(numeric, Series([0.0, 2.0, 3.0]))

      # but for object dtype we preserve the nested data
      boxed = Series([1, "a", "b"], dtype=object)
      indexer_sli(boxed)[0] = value

      # an ndarray compares elementwise, so reduce it; other containers give a bool
      same = boxed.loc[0] == value
      assert (same.all() if is_ndarray else same)
  953. def test_object_dtype_series_set_series_element():
  954. # GH 48933
  955. s1 = Series(dtype="O", index=["a", "b"])
  956. s1["a"] = Series()
  957. s1.loc["b"] = Series()
  958. tm.assert_series_equal(s1.loc["a"], Series())
  959. tm.assert_series_equal(s1.loc["b"], Series())
  960. s2 = Series(dtype="O", index=["a", "b"])
  961. s2.iloc[1] = Series()
  962. tm.assert_series_equal(s2.iloc[1], Series())