test_categorical.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573
  1. import re
  2. import numpy as np
  3. import pytest
  4. import pandas.util._test_decorators as td
  5. import pandas as pd
  6. from pandas import (
  7. Categorical,
  8. CategoricalDtype,
  9. CategoricalIndex,
  10. DataFrame,
  11. Index,
  12. Interval,
  13. Series,
  14. Timedelta,
  15. Timestamp,
  16. option_context,
  17. )
  18. import pandas._testing as tm
  19. @pytest.fixture
  20. def df():
  21. return DataFrame(
  22. {
  23. "A": np.arange(6, dtype="int64"),
  24. },
  25. index=CategoricalIndex(
  26. list("aabbca"), dtype=CategoricalDtype(list("cab")), name="B"
  27. ),
  28. )
  29. @pytest.fixture
  30. def df2():
  31. return DataFrame(
  32. {
  33. "A": np.arange(6, dtype="int64"),
  34. },
  35. index=CategoricalIndex(
  36. list("aabbca"), dtype=CategoricalDtype(list("cabe")), name="B"
  37. ),
  38. )
  39. class TestCategoricalIndex:
  40. def test_loc_scalar(self, df):
  41. dtype = CategoricalDtype(list("cab"))
  42. result = df.loc["a"]
  43. bidx = Series(list("aaa"), name="B").astype(dtype)
  44. assert bidx.dtype == dtype
  45. expected = DataFrame({"A": [0, 1, 5]}, index=Index(bidx))
  46. tm.assert_frame_equal(result, expected)
  47. df = df.copy()
  48. df.loc["a"] = 20
  49. bidx2 = Series(list("aabbca"), name="B").astype(dtype)
  50. assert bidx2.dtype == dtype
  51. expected = DataFrame(
  52. {
  53. "A": [20, 20, 2, 3, 4, 20],
  54. },
  55. index=Index(bidx2),
  56. )
  57. tm.assert_frame_equal(df, expected)
  58. # value not in the categories
  59. with pytest.raises(KeyError, match=r"^'d'$"):
  60. df.loc["d"]
  61. df2 = df.copy()
  62. expected = df2.copy()
  63. expected.index = expected.index.astype(object)
  64. expected.loc["d"] = 10
  65. df2.loc["d"] = 10
  66. tm.assert_frame_equal(df2, expected)
  67. def test_loc_setitem_with_expansion_non_category(self, df):
  68. # Setting-with-expansion with a new key "d" that is not among caegories
  69. df.loc["a"] = 20
  70. # Setting a new row on an existing column
  71. df3 = df.copy()
  72. df3.loc["d", "A"] = 10
  73. bidx3 = Index(list("aabbcad"), name="B")
  74. expected3 = DataFrame(
  75. {
  76. "A": [20, 20, 2, 3, 4, 20, 10.0],
  77. },
  78. index=Index(bidx3),
  79. )
  80. tm.assert_frame_equal(df3, expected3)
  81. # Setting a new row _and_ new column
  82. df4 = df.copy()
  83. df4.loc["d", "C"] = 10
  84. expected3 = DataFrame(
  85. {
  86. "A": [20, 20, 2, 3, 4, 20, np.nan],
  87. "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 10],
  88. },
  89. index=Index(bidx3),
  90. )
  91. tm.assert_frame_equal(df4, expected3)
  92. def test_loc_getitem_scalar_non_category(self, df):
  93. with pytest.raises(KeyError, match="^1$"):
  94. df.loc[1]
  95. def test_slicing(self):
  96. cat = Series(Categorical([1, 2, 3, 4]))
  97. reverse = cat[::-1]
  98. exp = np.array([4, 3, 2, 1], dtype=np.int64)
  99. tm.assert_numpy_array_equal(reverse.__array__(), exp)
  100. df = DataFrame({"value": (np.arange(100) + 1).astype("int64")})
  101. df["D"] = pd.cut(df.value, bins=[0, 25, 50, 75, 100])
  102. expected = Series([11, Interval(0, 25)], index=["value", "D"], name=10)
  103. result = df.iloc[10]
  104. tm.assert_series_equal(result, expected)
  105. expected = DataFrame(
  106. {"value": np.arange(11, 21).astype("int64")},
  107. index=np.arange(10, 20).astype("int64"),
  108. )
  109. expected["D"] = pd.cut(expected.value, bins=[0, 25, 50, 75, 100])
  110. result = df.iloc[10:20]
  111. tm.assert_frame_equal(result, expected)
  112. expected = Series([9, Interval(0, 25)], index=["value", "D"], name=8)
  113. result = df.loc[8]
  114. tm.assert_series_equal(result, expected)
  115. def test_slicing_and_getting_ops(self):
  116. # systematically test the slicing operations:
  117. # for all slicing ops:
  118. # - returning a dataframe
  119. # - returning a column
  120. # - returning a row
  121. # - returning a single value
  122. cats = Categorical(
  123. ["a", "c", "b", "c", "c", "c", "c"], categories=["a", "b", "c"]
  124. )
  125. idx = Index(["h", "i", "j", "k", "l", "m", "n"])
  126. values = [1, 2, 3, 4, 5, 6, 7]
  127. df = DataFrame({"cats": cats, "values": values}, index=idx)
  128. # the expected values
  129. cats2 = Categorical(["b", "c"], categories=["a", "b", "c"])
  130. idx2 = Index(["j", "k"])
  131. values2 = [3, 4]
  132. # 2:4,: | "j":"k",:
  133. exp_df = DataFrame({"cats": cats2, "values": values2}, index=idx2)
  134. # :,"cats" | :,0
  135. exp_col = Series(cats, index=idx, name="cats")
  136. # "j",: | 2,:
  137. exp_row = Series(["b", 3], index=["cats", "values"], dtype="object", name="j")
  138. # "j","cats | 2,0
  139. exp_val = "b"
  140. # iloc
  141. # frame
  142. res_df = df.iloc[2:4, :]
  143. tm.assert_frame_equal(res_df, exp_df)
  144. assert isinstance(res_df["cats"].dtype, CategoricalDtype)
  145. # row
  146. res_row = df.iloc[2, :]
  147. tm.assert_series_equal(res_row, exp_row)
  148. assert isinstance(res_row["cats"], str)
  149. # col
  150. res_col = df.iloc[:, 0]
  151. tm.assert_series_equal(res_col, exp_col)
  152. assert isinstance(res_col.dtype, CategoricalDtype)
  153. # single value
  154. res_val = df.iloc[2, 0]
  155. assert res_val == exp_val
  156. # loc
  157. # frame
  158. res_df = df.loc["j":"k", :]
  159. tm.assert_frame_equal(res_df, exp_df)
  160. assert isinstance(res_df["cats"].dtype, CategoricalDtype)
  161. # row
  162. res_row = df.loc["j", :]
  163. tm.assert_series_equal(res_row, exp_row)
  164. assert isinstance(res_row["cats"], str)
  165. # col
  166. res_col = df.loc[:, "cats"]
  167. tm.assert_series_equal(res_col, exp_col)
  168. assert isinstance(res_col.dtype, CategoricalDtype)
  169. # single value
  170. res_val = df.loc["j", "cats"]
  171. assert res_val == exp_val
  172. # single value
  173. res_val = df.loc["j", df.columns[0]]
  174. assert res_val == exp_val
  175. # iat
  176. res_val = df.iat[2, 0]
  177. assert res_val == exp_val
  178. # at
  179. res_val = df.at["j", "cats"]
  180. assert res_val == exp_val
  181. # fancy indexing
  182. exp_fancy = df.iloc[[2]]
  183. res_fancy = df[df["cats"] == "b"]
  184. tm.assert_frame_equal(res_fancy, exp_fancy)
  185. res_fancy = df[df["values"] == 3]
  186. tm.assert_frame_equal(res_fancy, exp_fancy)
  187. # get_value
  188. res_val = df.at["j", "cats"]
  189. assert res_val == exp_val
  190. # i : int, slice, or sequence of integers
  191. res_row = df.iloc[2]
  192. tm.assert_series_equal(res_row, exp_row)
  193. assert isinstance(res_row["cats"], str)
  194. res_df = df.iloc[slice(2, 4)]
  195. tm.assert_frame_equal(res_df, exp_df)
  196. assert isinstance(res_df["cats"].dtype, CategoricalDtype)
  197. res_df = df.iloc[[2, 3]]
  198. tm.assert_frame_equal(res_df, exp_df)
  199. assert isinstance(res_df["cats"].dtype, CategoricalDtype)
  200. res_col = df.iloc[:, 0]
  201. tm.assert_series_equal(res_col, exp_col)
  202. assert isinstance(res_col.dtype, CategoricalDtype)
  203. res_df = df.iloc[:, slice(0, 2)]
  204. tm.assert_frame_equal(res_df, df)
  205. assert isinstance(res_df["cats"].dtype, CategoricalDtype)
  206. res_df = df.iloc[:, [0, 1]]
  207. tm.assert_frame_equal(res_df, df)
  208. assert isinstance(res_df["cats"].dtype, CategoricalDtype)
  209. def test_slicing_doc_examples(self):
  210. # GH 7918
  211. cats = Categorical(
  212. ["a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c"]
  213. )
  214. idx = Index(["h", "i", "j", "k", "l", "m", "n"])
  215. values = [1, 2, 2, 2, 3, 4, 5]
  216. df = DataFrame({"cats": cats, "values": values}, index=idx)
  217. result = df.iloc[2:4, :]
  218. expected = DataFrame(
  219. {
  220. "cats": Categorical(["b", "b"], categories=["a", "b", "c"]),
  221. "values": [2, 2],
  222. },
  223. index=["j", "k"],
  224. )
  225. tm.assert_frame_equal(result, expected)
  226. result = df.iloc[2:4, :].dtypes
  227. expected = Series(["category", "int64"], ["cats", "values"], dtype=object)
  228. tm.assert_series_equal(result, expected)
  229. result = df.loc["h":"j", "cats"]
  230. expected = Series(
  231. Categorical(["a", "b", "b"], categories=["a", "b", "c"]),
  232. index=["h", "i", "j"],
  233. name="cats",
  234. )
  235. tm.assert_series_equal(result, expected)
  236. result = df.loc["h":"j", df.columns[0:1]]
  237. expected = DataFrame(
  238. {"cats": Categorical(["a", "b", "b"], categories=["a", "b", "c"])},
  239. index=["h", "i", "j"],
  240. )
  241. tm.assert_frame_equal(result, expected)
  242. def test_loc_getitem_listlike_labels(self, df):
  243. # list of labels
  244. result = df.loc[["c", "a"]]
  245. expected = df.iloc[[4, 0, 1, 5]]
  246. tm.assert_frame_equal(result, expected, check_index_type=True)
  247. def test_loc_getitem_listlike_unused_category(self, df2):
  248. # GH#37901 a label that is in index.categories but not in index
  249. # listlike containing an element in the categories but not in the values
  250. with pytest.raises(KeyError, match=re.escape("['e'] not in index")):
  251. df2.loc[["a", "b", "e"]]
  252. def test_loc_getitem_label_unused_category(self, df2):
  253. # element in the categories but not in the values
  254. with pytest.raises(KeyError, match=r"^'e'$"):
  255. df2.loc["e"]
  256. def test_loc_getitem_non_category(self, df2):
  257. # not all labels in the categories
  258. with pytest.raises(KeyError, match=re.escape("['d'] not in index")):
  259. df2.loc[["a", "d"]]
  260. def test_loc_setitem_expansion_label_unused_category(self, df2):
  261. # assigning with a label that is in the categories but not in the index
  262. df = df2.copy()
  263. df.loc["e"] = 20
  264. result = df.loc[["a", "b", "e"]]
  265. exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B")
  266. expected = DataFrame({"A": [0, 1, 5, 2, 3, 20]}, index=exp_index)
  267. tm.assert_frame_equal(result, expected)
  268. def test_loc_listlike_dtypes(self):
  269. # GH 11586
  270. # unique categories and codes
  271. index = CategoricalIndex(["a", "b", "c"])
  272. df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index)
  273. # unique slice
  274. res = df.loc[["a", "b"]]
  275. exp_index = CategoricalIndex(["a", "b"], categories=index.categories)
  276. exp = DataFrame({"A": [1, 2], "B": [4, 5]}, index=exp_index)
  277. tm.assert_frame_equal(res, exp, check_index_type=True)
  278. # duplicated slice
  279. res = df.loc[["a", "a", "b"]]
  280. exp_index = CategoricalIndex(["a", "a", "b"], categories=index.categories)
  281. exp = DataFrame({"A": [1, 1, 2], "B": [4, 4, 5]}, index=exp_index)
  282. tm.assert_frame_equal(res, exp, check_index_type=True)
  283. with pytest.raises(KeyError, match=re.escape("['x'] not in index")):
  284. df.loc[["a", "x"]]
  285. def test_loc_listlike_dtypes_duplicated_categories_and_codes(self):
  286. # duplicated categories and codes
  287. index = CategoricalIndex(["a", "b", "a"])
  288. df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index)
  289. # unique slice
  290. res = df.loc[["a", "b"]]
  291. exp = DataFrame(
  292. {"A": [1, 3, 2], "B": [4, 6, 5]}, index=CategoricalIndex(["a", "a", "b"])
  293. )
  294. tm.assert_frame_equal(res, exp, check_index_type=True)
  295. # duplicated slice
  296. res = df.loc[["a", "a", "b"]]
  297. exp = DataFrame(
  298. {"A": [1, 3, 1, 3, 2], "B": [4, 6, 4, 6, 5]},
  299. index=CategoricalIndex(["a", "a", "a", "a", "b"]),
  300. )
  301. tm.assert_frame_equal(res, exp, check_index_type=True)
  302. with pytest.raises(KeyError, match=re.escape("['x'] not in index")):
  303. df.loc[["a", "x"]]
  304. def test_loc_listlike_dtypes_unused_category(self):
  305. # contains unused category
  306. index = CategoricalIndex(["a", "b", "a", "c"], categories=list("abcde"))
  307. df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index)
  308. res = df.loc[["a", "b"]]
  309. exp = DataFrame(
  310. {"A": [1, 3, 2], "B": [5, 7, 6]},
  311. index=CategoricalIndex(["a", "a", "b"], categories=list("abcde")),
  312. )
  313. tm.assert_frame_equal(res, exp, check_index_type=True)
  314. # duplicated slice
  315. res = df.loc[["a", "a", "b"]]
  316. exp = DataFrame(
  317. {"A": [1, 3, 1, 3, 2], "B": [5, 7, 5, 7, 6]},
  318. index=CategoricalIndex(["a", "a", "a", "a", "b"], categories=list("abcde")),
  319. )
  320. tm.assert_frame_equal(res, exp, check_index_type=True)
  321. with pytest.raises(KeyError, match=re.escape("['x'] not in index")):
  322. df.loc[["a", "x"]]
  323. def test_loc_getitem_listlike_unused_category_raises_keyerror(self):
  324. # key that is an *unused* category raises
  325. index = CategoricalIndex(["a", "b", "a", "c"], categories=list("abcde"))
  326. df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index)
  327. with pytest.raises(KeyError, match="e"):
  328. # For comparison, check the scalar behavior
  329. df.loc["e"]
  330. with pytest.raises(KeyError, match=re.escape("['e'] not in index")):
  331. df.loc[["a", "e"]]
  332. def test_ix_categorical_index(self):
  333. # GH 12531
  334. df = DataFrame(
  335. np.random.default_rng(2).standard_normal((3, 3)),
  336. index=list("ABC"),
  337. columns=list("XYZ"),
  338. )
  339. cdf = df.copy()
  340. cdf.index = CategoricalIndex(df.index)
  341. cdf.columns = CategoricalIndex(df.columns)
  342. expect = Series(df.loc["A", :], index=cdf.columns, name="A")
  343. tm.assert_series_equal(cdf.loc["A", :], expect)
  344. expect = Series(df.loc[:, "X"], index=cdf.index, name="X")
  345. tm.assert_series_equal(cdf.loc[:, "X"], expect)
  346. exp_index = CategoricalIndex(list("AB"), categories=["A", "B", "C"])
  347. expect = DataFrame(df.loc[["A", "B"], :], columns=cdf.columns, index=exp_index)
  348. tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect)
  349. exp_columns = CategoricalIndex(list("XY"), categories=["X", "Y", "Z"])
  350. expect = DataFrame(df.loc[:, ["X", "Y"]], index=cdf.index, columns=exp_columns)
  351. tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect)
  352. @pytest.mark.parametrize(
  353. "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
  354. )
  355. def test_ix_categorical_index_non_unique(self, infer_string):
  356. # non-unique
  357. with option_context("future.infer_string", infer_string):
  358. df = DataFrame(
  359. np.random.default_rng(2).standard_normal((3, 3)),
  360. index=list("ABA"),
  361. columns=list("XYX"),
  362. )
  363. cdf = df.copy()
  364. cdf.index = CategoricalIndex(df.index)
  365. cdf.columns = CategoricalIndex(df.columns)
  366. exp_index = CategoricalIndex(list("AA"), categories=["A", "B"])
  367. expect = DataFrame(df.loc["A", :], columns=cdf.columns, index=exp_index)
  368. tm.assert_frame_equal(cdf.loc["A", :], expect)
  369. exp_columns = CategoricalIndex(list("XX"), categories=["X", "Y"])
  370. expect = DataFrame(df.loc[:, "X"], index=cdf.index, columns=exp_columns)
  371. tm.assert_frame_equal(cdf.loc[:, "X"], expect)
  372. expect = DataFrame(
  373. df.loc[["A", "B"], :],
  374. columns=cdf.columns,
  375. index=CategoricalIndex(list("AAB")),
  376. )
  377. tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect)
  378. expect = DataFrame(
  379. df.loc[:, ["X", "Y"]],
  380. index=cdf.index,
  381. columns=CategoricalIndex(list("XXY")),
  382. )
  383. tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect)
  384. def test_loc_slice(self, df):
  385. # GH9748
  386. msg = (
  387. "cannot do slice indexing on CategoricalIndex with these "
  388. r"indexers \[1\] of type int"
  389. )
  390. with pytest.raises(TypeError, match=msg):
  391. df.loc[1:5]
  392. result = df.loc["b":"c"]
  393. expected = df.iloc[[2, 3, 4]]
  394. tm.assert_frame_equal(result, expected)
  395. def test_loc_and_at_with_categorical_index(self):
  396. # GH 20629
  397. df = DataFrame(
  398. [[1, 2], [3, 4], [5, 6]], index=CategoricalIndex(["A", "B", "C"])
  399. )
  400. s = df[0]
  401. assert s.loc["A"] == 1
  402. assert s.at["A"] == 1
  403. assert df.loc["B", 1] == 4
  404. assert df.at["B", 1] == 4
  405. @pytest.mark.parametrize(
  406. "idx_values",
  407. [
  408. # python types
  409. [1, 2, 3],
  410. [-1, -2, -3],
  411. [1.5, 2.5, 3.5],
  412. [-1.5, -2.5, -3.5],
  413. # numpy int/uint
  414. *(np.array([1, 2, 3], dtype=dtype) for dtype in tm.ALL_INT_NUMPY_DTYPES),
  415. # numpy floats
  416. *(np.array([1.5, 2.5, 3.5], dtype=dtyp) for dtyp in tm.FLOAT_NUMPY_DTYPES),
  417. # numpy object
  418. np.array([1, "b", 3.5], dtype=object),
  419. # pandas scalars
  420. [Interval(1, 4), Interval(4, 6), Interval(6, 9)],
  421. [Timestamp(2019, 1, 1), Timestamp(2019, 2, 1), Timestamp(2019, 3, 1)],
  422. [Timedelta(1, "d"), Timedelta(2, "d"), Timedelta(3, "D")],
  423. # pandas Integer arrays
  424. *(pd.array([1, 2, 3], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES),
  425. # other pandas arrays
  426. pd.IntervalIndex.from_breaks([1, 4, 6, 9]).array,
  427. pd.date_range("2019-01-01", periods=3).array,
  428. pd.timedelta_range(start="1d", periods=3).array,
  429. ],
  430. )
  431. def test_loc_getitem_with_non_string_categories(self, idx_values, ordered):
  432. # GH-17569
  433. cat_idx = CategoricalIndex(idx_values, ordered=ordered)
  434. df = DataFrame({"A": ["foo", "bar", "baz"]}, index=cat_idx)
  435. sl = slice(idx_values[0], idx_values[1])
  436. # scalar selection
  437. result = df.loc[idx_values[0]]
  438. expected = Series(["foo"], index=["A"], name=idx_values[0])
  439. tm.assert_series_equal(result, expected)
  440. # list selection
  441. result = df.loc[idx_values[:2]]
  442. expected = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"])
  443. tm.assert_frame_equal(result, expected)
  444. # slice selection
  445. result = df.loc[sl]
  446. expected = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"])
  447. tm.assert_frame_equal(result, expected)
  448. # scalar assignment
  449. result = df.copy()
  450. result.loc[idx_values[0]] = "qux"
  451. expected = DataFrame({"A": ["qux", "bar", "baz"]}, index=cat_idx)
  452. tm.assert_frame_equal(result, expected)
  453. # list assignment
  454. result = df.copy()
  455. result.loc[idx_values[:2], "A"] = ["qux", "qux2"]
  456. expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx)
  457. tm.assert_frame_equal(result, expected)
  458. # slice assignment
  459. result = df.copy()
  460. result.loc[sl, "A"] = ["qux", "qux2"]
  461. expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx)
  462. tm.assert_frame_equal(result, expected)
  463. def test_getitem_categorical_with_nan(self):
  464. # GH#41933
  465. ci = CategoricalIndex(["A", "B", np.nan])
  466. ser = Series(range(3), index=ci)
  467. assert ser[np.nan] == 2
  468. assert ser.loc[np.nan] == 2
  469. df = DataFrame(ser)
  470. assert df.loc[np.nan, 0] == 2
  471. assert df.loc[np.nan][0] == 2