# test_get_dummies.py
  1. import re
  2. import unicodedata
  3. import numpy as np
  4. import pytest
  5. import pandas.util._test_decorators as td
  6. from pandas.core.dtypes.common import is_integer_dtype
  7. import pandas as pd
  8. from pandas import (
  9. ArrowDtype,
  10. Categorical,
  11. CategoricalDtype,
  12. CategoricalIndex,
  13. DataFrame,
  14. Index,
  15. RangeIndex,
  16. Series,
  17. SparseDtype,
  18. get_dummies,
  19. )
  20. import pandas._testing as tm
  21. from pandas.core.arrays.sparse import SparseArray
  22. try:
  23. import pyarrow as pa
  24. except ImportError:
  25. pa = None
  26. class TestGetDummies:
  27. @pytest.fixture
  28. def df(self):
  29. return DataFrame({"A": ["a", "b", "a"], "B": ["b", "b", "c"], "C": [1, 2, 3]})
  30. @pytest.fixture(params=["uint8", "i8", np.float64, bool, None])
  31. def dtype(self, request):
  32. return np.dtype(request.param)
  33. @pytest.fixture(params=["dense", "sparse"])
  34. def sparse(self, request):
  35. # params are strings to simplify reading test results,
  36. # e.g. TestGetDummies::test_basic[uint8-sparse] instead of [uint8-True]
  37. return request.param == "sparse"
  38. def effective_dtype(self, dtype):
  39. if dtype is None:
  40. return np.uint8
  41. return dtype
  42. def test_get_dummies_raises_on_dtype_object(self, df):
  43. msg = "dtype=object is not a valid dtype for get_dummies"
  44. with pytest.raises(ValueError, match=msg):
  45. get_dummies(df, dtype="object")
  46. def test_get_dummies_basic(self, sparse, dtype):
  47. s_list = list("abc")
  48. s_series = Series(s_list)
  49. s_series_index = Series(s_list, list("ABC"))
  50. expected = DataFrame(
  51. {"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]},
  52. dtype=self.effective_dtype(dtype),
  53. )
  54. if sparse:
  55. if dtype.kind == "b":
  56. expected = expected.apply(SparseArray, fill_value=False)
  57. else:
  58. expected = expected.apply(SparseArray, fill_value=0.0)
  59. result = get_dummies(s_list, sparse=sparse, dtype=dtype)
  60. tm.assert_frame_equal(result, expected)
  61. result = get_dummies(s_series, sparse=sparse, dtype=dtype)
  62. tm.assert_frame_equal(result, expected)
  63. expected.index = list("ABC")
  64. result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
  65. tm.assert_frame_equal(result, expected)
  66. def test_get_dummies_basic_types(self, sparse, dtype, using_infer_string):
  67. # GH 10531
  68. s_list = list("abc")
  69. s_series = Series(s_list)
  70. s_df = DataFrame(
  71. {"a": [0, 1, 0, 1, 2], "b": ["A", "A", "B", "C", "C"], "c": [2, 3, 3, 3, 2]}
  72. )
  73. expected = DataFrame(
  74. {"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]},
  75. dtype=self.effective_dtype(dtype),
  76. columns=list("abc"),
  77. )
  78. if sparse:
  79. if is_integer_dtype(dtype):
  80. fill_value = 0
  81. elif dtype == bool:
  82. fill_value = False
  83. else:
  84. fill_value = 0.0
  85. expected = expected.apply(SparseArray, fill_value=fill_value)
  86. result = get_dummies(s_list, sparse=sparse, dtype=dtype)
  87. tm.assert_frame_equal(result, expected)
  88. result = get_dummies(s_series, sparse=sparse, dtype=dtype)
  89. tm.assert_frame_equal(result, expected)
  90. result = get_dummies(s_df, columns=s_df.columns, sparse=sparse, dtype=dtype)
  91. if sparse:
  92. dtype_name = f"Sparse[{self.effective_dtype(dtype).name}, {fill_value}]"
  93. else:
  94. dtype_name = self.effective_dtype(dtype).name
  95. expected = Series({dtype_name: 8}, name="count")
  96. result = result.dtypes.value_counts()
  97. result.index = [str(i) for i in result.index]
  98. tm.assert_series_equal(result, expected)
  99. result = get_dummies(s_df, columns=["a"], sparse=sparse, dtype=dtype)
  100. key = "str" if using_infer_string else "object"
  101. expected_counts = {"int64": 1, key: 1}
  102. expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0)
  103. expected = Series(expected_counts, name="count").sort_index()
  104. result = result.dtypes.value_counts()
  105. result.index = [str(i) for i in result.index]
  106. result = result.sort_index()
  107. tm.assert_series_equal(result, expected)
  108. def test_get_dummies_just_na(self, sparse):
  109. just_na_list = [np.nan]
  110. just_na_series = Series(just_na_list)
  111. just_na_series_index = Series(just_na_list, index=["A"])
  112. res_list = get_dummies(just_na_list, sparse=sparse)
  113. res_series = get_dummies(just_na_series, sparse=sparse)
  114. res_series_index = get_dummies(just_na_series_index, sparse=sparse)
  115. assert res_list.empty
  116. assert res_series.empty
  117. assert res_series_index.empty
  118. assert res_list.index.tolist() == [0]
  119. assert res_series.index.tolist() == [0]
  120. assert res_series_index.index.tolist() == ["A"]
  121. def test_get_dummies_include_na(self, sparse, dtype):
  122. s = ["a", "b", np.nan]
  123. res = get_dummies(s, sparse=sparse, dtype=dtype)
  124. exp = DataFrame(
  125. {"a": [1, 0, 0], "b": [0, 1, 0]}, dtype=self.effective_dtype(dtype)
  126. )
  127. if sparse:
  128. if dtype.kind == "b":
  129. exp = exp.apply(SparseArray, fill_value=False)
  130. else:
  131. exp = exp.apply(SparseArray, fill_value=0.0)
  132. tm.assert_frame_equal(res, exp)
  133. # Sparse dataframes do not allow nan labelled columns, see #GH8822
  134. res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
  135. exp_na = DataFrame(
  136. {np.nan: [0, 0, 1], "a": [1, 0, 0], "b": [0, 1, 0]},
  137. dtype=self.effective_dtype(dtype),
  138. )
  139. exp_na = exp_na.reindex(["a", "b", np.nan], axis=1)
  140. # hack (NaN handling in assert_index_equal)
  141. exp_na.columns = res_na.columns
  142. if sparse:
  143. if dtype.kind == "b":
  144. exp_na = exp_na.apply(SparseArray, fill_value=False)
  145. else:
  146. exp_na = exp_na.apply(SparseArray, fill_value=0.0)
  147. tm.assert_frame_equal(res_na, exp_na)
  148. res_just_na = get_dummies([np.nan], dummy_na=True, sparse=sparse, dtype=dtype)
  149. exp_just_na = DataFrame(
  150. Series(1, index=[0]), columns=[np.nan], dtype=self.effective_dtype(dtype)
  151. )
  152. tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
  153. def test_get_dummies_unicode(self, sparse):
  154. # See GH 6885 - get_dummies chokes on unicode values
  155. e = "e"
  156. eacute = unicodedata.lookup("LATIN SMALL LETTER E WITH ACUTE")
  157. s = [e, eacute, eacute]
  158. res = get_dummies(s, prefix="letter", sparse=sparse)
  159. exp = DataFrame(
  160. {"letter_e": [True, False, False], f"letter_{eacute}": [False, True, True]}
  161. )
  162. if sparse:
  163. exp = exp.apply(SparseArray, fill_value=False)
  164. tm.assert_frame_equal(res, exp)
  165. def test_dataframe_dummies_all_obj(self, df, sparse):
  166. df = df[["A", "B"]]
  167. result = get_dummies(df, sparse=sparse)
  168. expected = DataFrame(
  169. {"A_a": [1, 0, 1], "A_b": [0, 1, 0], "B_b": [1, 1, 0], "B_c": [0, 0, 1]},
  170. dtype=bool,
  171. )
  172. if sparse:
  173. expected = DataFrame(
  174. {
  175. "A_a": SparseArray([1, 0, 1], dtype="bool"),
  176. "A_b": SparseArray([0, 1, 0], dtype="bool"),
  177. "B_b": SparseArray([1, 1, 0], dtype="bool"),
  178. "B_c": SparseArray([0, 0, 1], dtype="bool"),
  179. }
  180. )
  181. tm.assert_frame_equal(result, expected)
  182. def test_dataframe_dummies_string_dtype(self, df, any_string_dtype):
  183. # GH44965
  184. df = df[["A", "B"]]
  185. df = df.astype({"A": "str", "B": any_string_dtype})
  186. result = get_dummies(df)
  187. expected = DataFrame(
  188. {
  189. "A_a": [1, 0, 1],
  190. "A_b": [0, 1, 0],
  191. "B_b": [1, 1, 0],
  192. "B_c": [0, 0, 1],
  193. },
  194. dtype=bool,
  195. )
  196. if any_string_dtype == "string" and any_string_dtype.na_value is pd.NA:
  197. expected[["B_b", "B_c"]] = expected[["B_b", "B_c"]].astype("boolean")
  198. tm.assert_frame_equal(result, expected)
  199. def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
  200. result = get_dummies(df, sparse=sparse, dtype=dtype)
  201. if sparse:
  202. arr = SparseArray
  203. if dtype.kind == "b":
  204. typ = SparseDtype(dtype, False)
  205. else:
  206. typ = SparseDtype(dtype, 0)
  207. else:
  208. arr = np.array
  209. typ = dtype
  210. expected = DataFrame(
  211. {
  212. "C": [1, 2, 3],
  213. "A_a": arr([1, 0, 1], dtype=typ),
  214. "A_b": arr([0, 1, 0], dtype=typ),
  215. "B_b": arr([1, 1, 0], dtype=typ),
  216. "B_c": arr([0, 0, 1], dtype=typ),
  217. }
  218. )
  219. expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]]
  220. tm.assert_frame_equal(result, expected)
  221. def test_dataframe_dummies_prefix_list(self, df, sparse):
  222. prefixes = ["from_A", "from_B"]
  223. result = get_dummies(df, prefix=prefixes, sparse=sparse)
  224. expected = DataFrame(
  225. {
  226. "C": [1, 2, 3],
  227. "from_A_a": [True, False, True],
  228. "from_A_b": [False, True, False],
  229. "from_B_b": [True, True, False],
  230. "from_B_c": [False, False, True],
  231. },
  232. )
  233. expected[["C"]] = df[["C"]]
  234. cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
  235. expected = expected[["C"] + cols]
  236. typ = SparseArray if sparse else Series
  237. expected[cols] = expected[cols].apply(lambda x: typ(x))
  238. tm.assert_frame_equal(result, expected)
  239. def test_dataframe_dummies_prefix_str(self, df, sparse):
  240. # not that you should do this...
  241. result = get_dummies(df, prefix="bad", sparse=sparse)
  242. bad_columns = ["bad_a", "bad_b", "bad_b", "bad_c"]
  243. expected = DataFrame(
  244. [
  245. [1, True, False, True, False],
  246. [2, False, True, True, False],
  247. [3, True, False, False, True],
  248. ],
  249. columns=["C"] + bad_columns,
  250. )
  251. expected = expected.astype({"C": np.int64})
  252. if sparse:
  253. # work around astyping & assigning with duplicate columns
  254. # https://github.com/pandas-dev/pandas/issues/14427
  255. expected = pd.concat(
  256. [
  257. Series([1, 2, 3], name="C"),
  258. Series([True, False, True], name="bad_a", dtype="Sparse[bool]"),
  259. Series([False, True, False], name="bad_b", dtype="Sparse[bool]"),
  260. Series([True, True, False], name="bad_b", dtype="Sparse[bool]"),
  261. Series([False, False, True], name="bad_c", dtype="Sparse[bool]"),
  262. ],
  263. axis=1,
  264. )
  265. tm.assert_frame_equal(result, expected)
  266. def test_dataframe_dummies_subset(self, df, sparse):
  267. result = get_dummies(df, prefix=["from_A"], columns=["A"], sparse=sparse)
  268. expected = DataFrame(
  269. {
  270. "B": ["b", "b", "c"],
  271. "C": [1, 2, 3],
  272. "from_A_a": [1, 0, 1],
  273. "from_A_b": [0, 1, 0],
  274. },
  275. )
  276. cols = expected.columns
  277. expected[cols[1:]] = expected[cols[1:]].astype(bool)
  278. expected[["C"]] = df[["C"]]
  279. if sparse:
  280. cols = ["from_A_a", "from_A_b"]
  281. expected[cols] = expected[cols].astype(SparseDtype("bool", False))
  282. tm.assert_frame_equal(result, expected)
  283. def test_dataframe_dummies_prefix_sep(self, df, sparse):
  284. result = get_dummies(df, prefix_sep="..", sparse=sparse)
  285. expected = DataFrame(
  286. {
  287. "C": [1, 2, 3],
  288. "A..a": [True, False, True],
  289. "A..b": [False, True, False],
  290. "B..b": [True, True, False],
  291. "B..c": [False, False, True],
  292. },
  293. )
  294. expected[["C"]] = df[["C"]]
  295. expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]]
  296. if sparse:
  297. cols = ["A..a", "A..b", "B..b", "B..c"]
  298. expected[cols] = expected[cols].astype(SparseDtype("bool", False))
  299. tm.assert_frame_equal(result, expected)
  300. result = get_dummies(df, prefix_sep=["..", "__"], sparse=sparse)
  301. expected = expected.rename(columns={"B..b": "B__b", "B..c": "B__c"})
  302. tm.assert_frame_equal(result, expected)
  303. result = get_dummies(df, prefix_sep={"A": "..", "B": "__"}, sparse=sparse)
  304. tm.assert_frame_equal(result, expected)
  305. def test_dataframe_dummies_prefix_bad_length(self, df, sparse):
  306. msg = re.escape(
  307. "Length of 'prefix' (1) did not match the length of the columns being "
  308. "encoded (2)"
  309. )
  310. with pytest.raises(ValueError, match=msg):
  311. get_dummies(df, prefix=["too few"], sparse=sparse)
  312. def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse):
  313. msg = re.escape(
  314. "Length of 'prefix_sep' (1) did not match the length of the columns being "
  315. "encoded (2)"
  316. )
  317. with pytest.raises(ValueError, match=msg):
  318. get_dummies(df, prefix_sep=["bad"], sparse=sparse)
  319. def test_dataframe_dummies_prefix_dict(self, sparse):
  320. prefixes = {"A": "from_A", "B": "from_B"}
  321. df = DataFrame({"C": [1, 2, 3], "A": ["a", "b", "a"], "B": ["b", "b", "c"]})
  322. result = get_dummies(df, prefix=prefixes, sparse=sparse)
  323. expected = DataFrame(
  324. {
  325. "C": [1, 2, 3],
  326. "from_A_a": [1, 0, 1],
  327. "from_A_b": [0, 1, 0],
  328. "from_B_b": [1, 1, 0],
  329. "from_B_c": [0, 0, 1],
  330. }
  331. )
  332. columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
  333. expected[columns] = expected[columns].astype(bool)
  334. if sparse:
  335. expected[columns] = expected[columns].astype(SparseDtype("bool", False))
  336. tm.assert_frame_equal(result, expected)
  337. def test_dataframe_dummies_with_na(self, df, sparse, dtype):
  338. df.loc[3, :] = [np.nan, np.nan, np.nan]
  339. result = get_dummies(df, dummy_na=True, sparse=sparse, dtype=dtype).sort_index(
  340. axis=1
  341. )
  342. if sparse:
  343. arr = SparseArray
  344. if dtype.kind == "b":
  345. typ = SparseDtype(dtype, False)
  346. else:
  347. typ = SparseDtype(dtype, 0)
  348. else:
  349. arr = np.array
  350. typ = dtype
  351. expected = DataFrame(
  352. {
  353. "C": [1, 2, 3, np.nan],
  354. "A_a": arr([1, 0, 1, 0], dtype=typ),
  355. "A_b": arr([0, 1, 0, 0], dtype=typ),
  356. "A_nan": arr([0, 0, 0, 1], dtype=typ),
  357. "B_b": arr([1, 1, 0, 0], dtype=typ),
  358. "B_c": arr([0, 0, 1, 0], dtype=typ),
  359. "B_nan": arr([0, 0, 0, 1], dtype=typ),
  360. }
  361. ).sort_index(axis=1)
  362. tm.assert_frame_equal(result, expected)
  363. result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype)
  364. expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]]
  365. tm.assert_frame_equal(result, expected)
  366. def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
  367. df["cat"] = Categorical(["x", "y", "y"])
  368. result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1)
  369. if sparse:
  370. arr = SparseArray
  371. if dtype.kind == "b":
  372. typ = SparseDtype(dtype, False)
  373. else:
  374. typ = SparseDtype(dtype, 0)
  375. else:
  376. arr = np.array
  377. typ = dtype
  378. expected = DataFrame(
  379. {
  380. "C": [1, 2, 3],
  381. "A_a": arr([1, 0, 1], dtype=typ),
  382. "A_b": arr([0, 1, 0], dtype=typ),
  383. "B_b": arr([1, 1, 0], dtype=typ),
  384. "B_c": arr([0, 0, 1], dtype=typ),
  385. "cat_x": arr([1, 0, 0], dtype=typ),
  386. "cat_y": arr([0, 1, 1], dtype=typ),
  387. }
  388. ).sort_index(axis=1)
  389. tm.assert_frame_equal(result, expected)
  390. @pytest.mark.parametrize(
  391. "get_dummies_kwargs,expected",
  392. [
  393. (
  394. {"data": DataFrame({"ä": ["a"]})},
  395. DataFrame({"ä_a": [True]}),
  396. ),
  397. (
  398. {"data": DataFrame({"x": ["ä"]})},
  399. DataFrame({"x_ä": [True]}),
  400. ),
  401. (
  402. {"data": DataFrame({"x": ["a"]}), "prefix": "ä"},
  403. DataFrame({"ä_a": [True]}),
  404. ),
  405. (
  406. {"data": DataFrame({"x": ["a"]}), "prefix_sep": "ä"},
  407. DataFrame({"xäa": [True]}),
  408. ),
  409. ],
  410. )
  411. def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected):
  412. # GH22084 get_dummies incorrectly encodes unicode characters
  413. # in dataframe column names
  414. result = get_dummies(**get_dummies_kwargs)
  415. tm.assert_frame_equal(result, expected)
  416. def test_get_dummies_basic_drop_first(self, sparse):
  417. # GH12402 Add a new parameter `drop_first` to avoid collinearity
  418. # Basic case
  419. s_list = list("abc")
  420. s_series = Series(s_list)
  421. s_series_index = Series(s_list, list("ABC"))
  422. expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=bool)
  423. result = get_dummies(s_list, drop_first=True, sparse=sparse)
  424. if sparse:
  425. expected = expected.apply(SparseArray, fill_value=False)
  426. tm.assert_frame_equal(result, expected)
  427. result = get_dummies(s_series, drop_first=True, sparse=sparse)
  428. tm.assert_frame_equal(result, expected)
  429. expected.index = list("ABC")
  430. result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
  431. tm.assert_frame_equal(result, expected)
  432. def test_get_dummies_basic_drop_first_one_level(self, sparse):
  433. # Test the case that categorical variable only has one level.
  434. s_list = list("aaa")
  435. s_series = Series(s_list)
  436. s_series_index = Series(s_list, list("ABC"))
  437. expected = DataFrame(index=RangeIndex(3))
  438. result = get_dummies(s_list, drop_first=True, sparse=sparse)
  439. tm.assert_frame_equal(result, expected)
  440. result = get_dummies(s_series, drop_first=True, sparse=sparse)
  441. tm.assert_frame_equal(result, expected)
  442. expected = DataFrame(index=list("ABC"))
  443. result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
  444. tm.assert_frame_equal(result, expected)
  445. def test_get_dummies_basic_drop_first_NA(self, sparse):
  446. # Test NA handling together with drop_first
  447. s_NA = ["a", "b", np.nan]
  448. res = get_dummies(s_NA, drop_first=True, sparse=sparse)
  449. exp = DataFrame({"b": [0, 1, 0]}, dtype=bool)
  450. if sparse:
  451. exp = exp.apply(SparseArray, fill_value=False)
  452. tm.assert_frame_equal(res, exp)
  453. res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse)
  454. exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=bool).reindex(
  455. ["b", np.nan], axis=1
  456. )
  457. if sparse:
  458. exp_na = exp_na.apply(SparseArray, fill_value=False)
  459. tm.assert_frame_equal(res_na, exp_na)
  460. res_just_na = get_dummies(
  461. [np.nan], dummy_na=True, drop_first=True, sparse=sparse
  462. )
  463. exp_just_na = DataFrame(index=RangeIndex(1))
  464. tm.assert_frame_equal(res_just_na, exp_just_na)
  465. def test_dataframe_dummies_drop_first(self, df, sparse):
  466. df = df[["A", "B"]]
  467. result = get_dummies(df, drop_first=True, sparse=sparse)
  468. expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=bool)
  469. if sparse:
  470. expected = expected.apply(SparseArray, fill_value=False)
  471. tm.assert_frame_equal(result, expected)
  472. def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
  473. df["cat"] = Categorical(["x", "y", "y"])
  474. result = get_dummies(df, drop_first=True, sparse=sparse)
  475. expected = DataFrame(
  476. {"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]}
  477. )
  478. cols = ["A_b", "B_c", "cat_y"]
  479. expected[cols] = expected[cols].astype(bool)
  480. expected = expected[["C", "A_b", "B_c", "cat_y"]]
  481. if sparse:
  482. for col in cols:
  483. expected[col] = SparseArray(expected[col])
  484. tm.assert_frame_equal(result, expected)
  485. def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
  486. df.loc[3, :] = [np.nan, np.nan, np.nan]
  487. result = get_dummies(
  488. df, dummy_na=True, drop_first=True, sparse=sparse
  489. ).sort_index(axis=1)
  490. expected = DataFrame(
  491. {
  492. "C": [1, 2, 3, np.nan],
  493. "A_b": [0, 1, 0, 0],
  494. "A_nan": [0, 0, 0, 1],
  495. "B_c": [0, 0, 1, 0],
  496. "B_nan": [0, 0, 0, 1],
  497. }
  498. )
  499. cols = ["A_b", "A_nan", "B_c", "B_nan"]
  500. expected[cols] = expected[cols].astype(bool)
  501. expected = expected.sort_index(axis=1)
  502. if sparse:
  503. for col in cols:
  504. expected[col] = SparseArray(expected[col])
  505. tm.assert_frame_equal(result, expected)
  506. result = get_dummies(df, dummy_na=False, drop_first=True, sparse=sparse)
  507. expected = expected[["C", "A_b", "B_c"]]
  508. tm.assert_frame_equal(result, expected)
  509. def test_get_dummies_int_int(self):
  510. data = Series([1, 2, 1])
  511. result = get_dummies(data)
  512. expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=bool)
  513. tm.assert_frame_equal(result, expected)
  514. data = Series(Categorical(["a", "b", "a"]))
  515. result = get_dummies(data)
  516. expected = DataFrame(
  517. [[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=bool
  518. )
  519. tm.assert_frame_equal(result, expected)
  520. def test_get_dummies_int_df(self, dtype):
  521. data = DataFrame(
  522. {
  523. "A": [1, 2, 1],
  524. "B": Categorical(["a", "b", "a"]),
  525. "C": [1, 2, 1],
  526. "D": [1.0, 2.0, 1.0],
  527. }
  528. )
  529. columns = ["C", "D", "A_1", "A_2", "B_a", "B_b"]
  530. expected = DataFrame(
  531. [[1, 1.0, 1, 0, 1, 0], [2, 2.0, 0, 1, 0, 1], [1, 1.0, 1, 0, 1, 0]],
  532. columns=columns,
  533. )
  534. expected[columns[2:]] = expected[columns[2:]].astype(dtype)
  535. result = get_dummies(data, columns=["A", "B"], dtype=dtype)
  536. tm.assert_frame_equal(result, expected)
  537. @pytest.mark.parametrize("ordered", [True, False])
  538. def test_dataframe_dummies_preserve_categorical_dtype(self, dtype, ordered):
  539. # GH13854
  540. cat = Categorical(list("xy"), categories=list("xyz"), ordered=ordered)
  541. result = get_dummies(cat, dtype=dtype)
  542. data = np.array([[1, 0, 0], [0, 1, 0]], dtype=self.effective_dtype(dtype))
  543. cols = CategoricalIndex(
  544. cat.categories, categories=cat.categories, ordered=ordered
  545. )
  546. expected = DataFrame(data, columns=cols, dtype=self.effective_dtype(dtype))
  547. tm.assert_frame_equal(result, expected)
  548. @pytest.mark.parametrize("sparse", [True, False])
  549. def test_get_dummies_dont_sparsify_all_columns(self, sparse):
  550. # GH18914
  551. df = DataFrame.from_dict({"GDP": [1, 2], "Nation": ["AB", "CD"]})
  552. df = get_dummies(df, columns=["Nation"], sparse=sparse)
  553. df2 = df.reindex(columns=["GDP"])
  554. tm.assert_frame_equal(df[["GDP"]], df2)
  555. def test_get_dummies_duplicate_columns(self, df):
  556. # GH20839
  557. df.columns = ["A", "A", "A"]
  558. result = get_dummies(df).sort_index(axis=1)
  559. expected = DataFrame(
  560. [
  561. [1, True, False, True, False],
  562. [2, False, True, True, False],
  563. [3, True, False, False, True],
  564. ],
  565. columns=["A", "A_a", "A_b", "A_b", "A_c"],
  566. ).sort_index(axis=1)
  567. expected = expected.astype({"A": np.int64})
  568. tm.assert_frame_equal(result, expected)
  569. def test_get_dummies_all_sparse(self):
  570. df = DataFrame({"A": [1, 2]})
  571. result = get_dummies(df, columns=["A"], sparse=True)
  572. dtype = SparseDtype("bool", False)
  573. expected = DataFrame(
  574. {
  575. "A_1": SparseArray([1, 0], dtype=dtype),
  576. "A_2": SparseArray([0, 1], dtype=dtype),
  577. }
  578. )
  579. tm.assert_frame_equal(result, expected)
  580. @pytest.mark.parametrize("values", ["baz"])
  581. def test_get_dummies_with_string_values(self, values):
  582. # issue #28383
  583. df = DataFrame(
  584. {
  585. "bar": [1, 2, 3, 4, 5, 6],
  586. "foo": ["one", "one", "one", "two", "two", "two"],
  587. "baz": ["A", "B", "C", "A", "B", "C"],
  588. "zoo": ["x", "y", "z", "q", "w", "t"],
  589. }
  590. )
  591. msg = "Input must be a list-like for parameter `columns`"
  592. with pytest.raises(TypeError, match=msg):
  593. get_dummies(df, columns=values)
  594. def test_get_dummies_ea_dtype_series(self, any_numeric_ea_and_arrow_dtype):
  595. # GH#32430
  596. ser = Series(list("abca"))
  597. result = get_dummies(ser, dtype=any_numeric_ea_and_arrow_dtype)
  598. expected = DataFrame(
  599. {"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]},
  600. dtype=any_numeric_ea_and_arrow_dtype,
  601. )
  602. tm.assert_frame_equal(result, expected)
  603. def test_get_dummies_ea_dtype_dataframe(self, any_numeric_ea_and_arrow_dtype):
  604. # GH#32430
  605. df = DataFrame({"x": list("abca")})
  606. result = get_dummies(df, dtype=any_numeric_ea_and_arrow_dtype)
  607. expected = DataFrame(
  608. {"x_a": [1, 0, 0, 1], "x_b": [0, 1, 0, 0], "x_c": [0, 0, 1, 0]},
  609. dtype=any_numeric_ea_and_arrow_dtype,
  610. )
  611. tm.assert_frame_equal(result, expected)
  612. @pytest.mark.parametrize("dtype_type", ["string", "category"])
  613. def test_get_dummies_ea_dtype(self, dtype_type, string_dtype_no_object):
  614. # GH#56273
  615. dtype = string_dtype_no_object
  616. exp_dtype = "boolean" if dtype.na_value is pd.NA else "bool"
  617. if dtype_type == "category":
  618. dtype = CategoricalDtype(Index(["a"], dtype))
  619. df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1})
  620. result = get_dummies(df)
  621. expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)})
  622. tm.assert_frame_equal(result, expected)
  623. @td.skip_if_no("pyarrow")
  624. def test_get_dummies_arrow_dtype(self):
  625. # GH#56273
  626. df = DataFrame({"name": Series(["a"], dtype=ArrowDtype(pa.string())), "x": 1})
  627. result = get_dummies(df)
  628. expected = DataFrame({"x": 1, "name_a": Series([True], dtype="bool[pyarrow]")})
  629. tm.assert_frame_equal(result, expected)
  630. df = DataFrame(
  631. {
  632. "name": Series(
  633. ["a"],
  634. dtype=CategoricalDtype(Index(["a"], dtype=ArrowDtype(pa.string()))),
  635. ),
  636. "x": 1,
  637. }
  638. )
  639. result = get_dummies(df)
  640. tm.assert_frame_equal(result, expected)