test_from_dummies.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449
  1. import numpy as np
  2. import pytest
  3. from pandas import (
  4. DataFrame,
  5. Series,
  6. from_dummies,
  7. get_dummies,
  8. )
  9. import pandas._testing as tm
  10. @pytest.fixture
  11. def dummies_basic():
  12. return DataFrame(
  13. {
  14. "col1_a": [1, 0, 1],
  15. "col1_b": [0, 1, 0],
  16. "col2_a": [0, 1, 0],
  17. "col2_b": [1, 0, 0],
  18. "col2_c": [0, 0, 1],
  19. },
  20. )
  21. @pytest.fixture
  22. def dummies_with_unassigned():
  23. return DataFrame(
  24. {
  25. "col1_a": [1, 0, 0],
  26. "col1_b": [0, 1, 0],
  27. "col2_a": [0, 1, 0],
  28. "col2_b": [0, 0, 0],
  29. "col2_c": [0, 0, 1],
  30. },
  31. )
  32. def test_error_wrong_data_type():
  33. dummies = [0, 1, 0]
  34. with pytest.raises(
  35. TypeError,
  36. match=r"Expected 'data' to be a 'DataFrame'; Received 'data' of type: list",
  37. ):
  38. from_dummies(dummies)
  39. def test_error_no_prefix_contains_unassigned():
  40. dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]})
  41. with pytest.raises(
  42. ValueError,
  43. match=(
  44. r"Dummy DataFrame contains unassigned value\(s\); "
  45. r"First instance in row: 2"
  46. ),
  47. ):
  48. from_dummies(dummies)
  49. def test_error_no_prefix_wrong_default_category_type():
  50. dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]})
  51. with pytest.raises(
  52. TypeError,
  53. match=(
  54. r"Expected 'default_category' to be of type 'None', 'Hashable', or 'dict'; "
  55. r"Received 'default_category' of type: list"
  56. ),
  57. ):
  58. from_dummies(dummies, default_category=["c", "d"])
  59. def test_error_no_prefix_multi_assignment():
  60. dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]})
  61. with pytest.raises(
  62. ValueError,
  63. match=(
  64. r"Dummy DataFrame contains multi-assignment\(s\); "
  65. r"First instance in row: 2"
  66. ),
  67. ):
  68. from_dummies(dummies)
  69. def test_error_no_prefix_contains_nan():
  70. dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, np.nan]})
  71. with pytest.raises(
  72. ValueError, match=r"Dummy DataFrame contains NA value in column: 'b'"
  73. ):
  74. from_dummies(dummies)
  75. def test_error_contains_non_dummies():
  76. dummies = DataFrame(
  77. {"a": [1, 6, 3, 1], "b": [0, 1, 0, 2], "c": ["c1", "c2", "c3", "c4"]}
  78. )
  79. with pytest.raises(
  80. TypeError,
  81. match=r"Passed DataFrame contains non-dummy data",
  82. ):
  83. from_dummies(dummies)
  84. def test_error_with_prefix_multiple_seperators():
  85. dummies = DataFrame(
  86. {
  87. "col1_a": [1, 0, 1],
  88. "col1_b": [0, 1, 0],
  89. "col2-a": [0, 1, 0],
  90. "col2-b": [1, 0, 1],
  91. },
  92. )
  93. with pytest.raises(
  94. ValueError,
  95. match=(r"Separator not specified for column: col2-a"),
  96. ):
  97. from_dummies(dummies, sep="_")
  98. def test_error_with_prefix_sep_wrong_type(dummies_basic):
  99. with pytest.raises(
  100. TypeError,
  101. match=(
  102. r"Expected 'sep' to be of type 'str' or 'None'; "
  103. r"Received 'sep' of type: list"
  104. ),
  105. ):
  106. from_dummies(dummies_basic, sep=["_"])
  107. def test_error_with_prefix_contains_unassigned(dummies_with_unassigned):
  108. with pytest.raises(
  109. ValueError,
  110. match=(
  111. r"Dummy DataFrame contains unassigned value\(s\); "
  112. r"First instance in row: 2"
  113. ),
  114. ):
  115. from_dummies(dummies_with_unassigned, sep="_")
  116. def test_error_with_prefix_default_category_wrong_type(dummies_with_unassigned):
  117. with pytest.raises(
  118. TypeError,
  119. match=(
  120. r"Expected 'default_category' to be of type 'None', 'Hashable', or 'dict'; "
  121. r"Received 'default_category' of type: list"
  122. ),
  123. ):
  124. from_dummies(dummies_with_unassigned, sep="_", default_category=["x", "y"])
  125. def test_error_with_prefix_default_category_dict_not_complete(
  126. dummies_with_unassigned,
  127. ):
  128. with pytest.raises(
  129. ValueError,
  130. match=(
  131. r"Length of 'default_category' \(1\) did not match "
  132. r"the length of the columns being encoded \(2\)"
  133. ),
  134. ):
  135. from_dummies(dummies_with_unassigned, sep="_", default_category={"col1": "x"})
  136. def test_error_with_prefix_contains_nan(dummies_basic):
  137. # Set float64 dtype to avoid upcast when setting np.nan
  138. dummies_basic["col2_c"] = dummies_basic["col2_c"].astype("float64")
  139. dummies_basic.loc[2, "col2_c"] = np.nan
  140. with pytest.raises(
  141. ValueError, match=r"Dummy DataFrame contains NA value in column: 'col2_c'"
  142. ):
  143. from_dummies(dummies_basic, sep="_")
  144. def test_error_with_prefix_contains_non_dummies(dummies_basic):
  145. # Set object dtype to avoid upcast when setting "str"
  146. dummies_basic["col2_c"] = dummies_basic["col2_c"].astype(object)
  147. dummies_basic.loc[2, "col2_c"] = "str"
  148. with pytest.raises(TypeError, match=r"Passed DataFrame contains non-dummy data"):
  149. from_dummies(dummies_basic, sep="_")
  150. def test_error_with_prefix_double_assignment():
  151. dummies = DataFrame(
  152. {
  153. "col1_a": [1, 0, 1],
  154. "col1_b": [1, 1, 0],
  155. "col2_a": [0, 1, 0],
  156. "col2_b": [1, 0, 0],
  157. "col2_c": [0, 0, 1],
  158. },
  159. )
  160. with pytest.raises(
  161. ValueError,
  162. match=(
  163. r"Dummy DataFrame contains multi-assignment\(s\); "
  164. r"First instance in row: 0"
  165. ),
  166. ):
  167. from_dummies(dummies, sep="_")
  168. def test_roundtrip_series_to_dataframe():
  169. categories = Series(["a", "b", "c", "a"])
  170. dummies = get_dummies(categories)
  171. result = from_dummies(dummies)
  172. expected = DataFrame({"": ["a", "b", "c", "a"]})
  173. tm.assert_frame_equal(result, expected)
  174. def test_roundtrip_single_column_dataframe():
  175. categories = DataFrame({"": ["a", "b", "c", "a"]})
  176. dummies = get_dummies(categories)
  177. result = from_dummies(dummies, sep="_")
  178. expected = categories
  179. tm.assert_frame_equal(result, expected)
  180. def test_roundtrip_with_prefixes():
  181. categories = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]})
  182. dummies = get_dummies(categories)
  183. result = from_dummies(dummies, sep="_")
  184. expected = categories
  185. tm.assert_frame_equal(result, expected)
  186. def test_no_prefix_string_cats_basic():
  187. dummies = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]})
  188. expected = DataFrame({"": ["a", "b", "c", "a"]})
  189. result = from_dummies(dummies)
  190. tm.assert_frame_equal(result, expected)
  191. def test_no_prefix_string_cats_basic_bool_values():
  192. dummies = DataFrame(
  193. {
  194. "a": [True, False, False, True],
  195. "b": [False, True, False, False],
  196. "c": [False, False, True, False],
  197. }
  198. )
  199. expected = DataFrame({"": ["a", "b", "c", "a"]})
  200. result = from_dummies(dummies)
  201. tm.assert_frame_equal(result, expected)
  202. def test_no_prefix_string_cats_basic_mixed_bool_values():
  203. dummies = DataFrame(
  204. {"a": [1, 0, 0, 1], "b": [False, True, False, False], "c": [0, 0, 1, 0]}
  205. )
  206. expected = DataFrame({"": ["a", "b", "c", "a"]})
  207. result = from_dummies(dummies)
  208. tm.assert_frame_equal(result, expected)
  209. def test_no_prefix_int_cats_basic():
  210. dummies = DataFrame(
  211. {1: [1, 0, 0, 0], 25: [0, 1, 0, 0], 2: [0, 0, 1, 0], 5: [0, 0, 0, 1]}
  212. )
  213. expected = DataFrame({"": [1, 25, 2, 5]})
  214. result = from_dummies(dummies)
  215. tm.assert_frame_equal(result, expected)
  216. def test_no_prefix_float_cats_basic():
  217. dummies = DataFrame(
  218. {1.0: [1, 0, 0, 0], 25.0: [0, 1, 0, 0], 2.5: [0, 0, 1, 0], 5.84: [0, 0, 0, 1]}
  219. )
  220. expected = DataFrame({"": [1.0, 25.0, 2.5, 5.84]})
  221. result = from_dummies(dummies)
  222. tm.assert_frame_equal(result, expected)
  223. def test_no_prefix_mixed_cats_basic():
  224. dummies = DataFrame(
  225. {
  226. 1.23: [1, 0, 0, 0, 0],
  227. "c": [0, 1, 0, 0, 0],
  228. 2: [0, 0, 1, 0, 0],
  229. False: [0, 0, 0, 1, 0],
  230. None: [0, 0, 0, 0, 1],
  231. }
  232. )
  233. expected = DataFrame({"": [1.23, "c", 2, False, None]}, dtype="object")
  234. result = from_dummies(dummies)
  235. tm.assert_frame_equal(result, expected)
  236. def test_no_prefix_string_cats_contains_get_dummies_NaN_column():
  237. dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "NaN": [0, 0, 1]})
  238. expected = DataFrame({"": ["a", "b", "NaN"]})
  239. result = from_dummies(dummies)
  240. tm.assert_frame_equal(result, expected)
  241. @pytest.mark.parametrize(
  242. "default_category, expected",
  243. [
  244. pytest.param(
  245. "c",
  246. DataFrame({"": ["a", "b", "c"]}),
  247. id="default_category is a str",
  248. ),
  249. pytest.param(
  250. 1,
  251. DataFrame({"": ["a", "b", 1]}),
  252. id="default_category is a int",
  253. ),
  254. pytest.param(
  255. 1.25,
  256. DataFrame({"": ["a", "b", 1.25]}),
  257. id="default_category is a float",
  258. ),
  259. pytest.param(
  260. 0,
  261. DataFrame({"": ["a", "b", 0]}),
  262. id="default_category is a 0",
  263. ),
  264. pytest.param(
  265. False,
  266. DataFrame({"": ["a", "b", False]}),
  267. id="default_category is a bool",
  268. ),
  269. pytest.param(
  270. (1, 2),
  271. DataFrame({"": ["a", "b", (1, 2)]}),
  272. id="default_category is a tuple",
  273. ),
  274. ],
  275. )
  276. def test_no_prefix_string_cats_default_category(
  277. default_category, expected, using_infer_string
  278. ):
  279. dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]})
  280. result = from_dummies(dummies, default_category=default_category)
  281. if using_infer_string:
  282. expected[""] = expected[""].astype("str")
  283. tm.assert_frame_equal(result, expected)
  284. def test_with_prefix_basic(dummies_basic):
  285. expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]})
  286. result = from_dummies(dummies_basic, sep="_")
  287. tm.assert_frame_equal(result, expected)
  288. def test_with_prefix_contains_get_dummies_NaN_column():
  289. dummies = DataFrame(
  290. {
  291. "col1_a": [1, 0, 0],
  292. "col1_b": [0, 1, 0],
  293. "col1_NaN": [0, 0, 1],
  294. "col2_a": [0, 1, 0],
  295. "col2_b": [0, 0, 0],
  296. "col2_c": [0, 0, 1],
  297. "col2_NaN": [1, 0, 0],
  298. },
  299. )
  300. expected = DataFrame({"col1": ["a", "b", "NaN"], "col2": ["NaN", "a", "c"]})
  301. result = from_dummies(dummies, sep="_")
  302. tm.assert_frame_equal(result, expected)
  303. @pytest.mark.parametrize(
  304. "default_category, expected",
  305. [
  306. pytest.param(
  307. "x",
  308. DataFrame({"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]}),
  309. id="default_category is a str",
  310. ),
  311. pytest.param(
  312. 0,
  313. DataFrame({"col1": ["a", "b", 0], "col2": [0, "a", "c"]}),
  314. id="default_category is a 0",
  315. ),
  316. pytest.param(
  317. False,
  318. DataFrame({"col1": ["a", "b", False], "col2": [False, "a", "c"]}),
  319. id="default_category is a False",
  320. ),
  321. pytest.param(
  322. {"col2": 1, "col1": 2.5},
  323. DataFrame({"col1": ["a", "b", 2.5], "col2": [1, "a", "c"]}),
  324. id="default_category is a dict with int and float values",
  325. ),
  326. pytest.param(
  327. {"col2": None, "col1": False},
  328. DataFrame({"col1": ["a", "b", False], "col2": [None, "a", "c"]}),
  329. id="default_category is a dict with bool and None values",
  330. ),
  331. pytest.param(
  332. {"col2": (1, 2), "col1": [1.25, False]},
  333. DataFrame({"col1": ["a", "b", [1.25, False]], "col2": [(1, 2), "a", "c"]}),
  334. id="default_category is a dict with list and tuple values",
  335. ),
  336. ],
  337. )
  338. def test_with_prefix_default_category(
  339. dummies_with_unassigned, default_category, expected, using_infer_string
  340. ):
  341. result = from_dummies(
  342. dummies_with_unassigned, sep="_", default_category=default_category
  343. )
  344. if using_infer_string:
  345. expected = expected.astype("str")
  346. tm.assert_frame_equal(result, expected)
  347. def test_ea_categories():
  348. # GH 54300
  349. df = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]})
  350. df.columns = df.columns.astype("string[python]")
  351. result = from_dummies(df)
  352. expected = DataFrame({"": Series(list("abca"), dtype="string[python]")})
  353. tm.assert_frame_equal(result, expected)
  354. def test_ea_categories_with_sep():
  355. # GH 54300
  356. df = DataFrame(
  357. {
  358. "col1_a": [1, 0, 1],
  359. "col1_b": [0, 1, 0],
  360. "col2_a": [0, 1, 0],
  361. "col2_b": [1, 0, 0],
  362. "col2_c": [0, 0, 1],
  363. }
  364. )
  365. df.columns = df.columns.astype("string[python]")
  366. result = from_dummies(df, sep="_")
  367. expected = DataFrame(
  368. {
  369. "col1": Series(list("aba"), dtype="string[python]"),
  370. "col2": Series(list("bac"), dtype="string[python]"),
  371. }
  372. )
  373. expected.columns = expected.columns.astype("string[python]")
  374. tm.assert_frame_equal(result, expected)
  375. def test_maintain_original_index():
  376. # GH 54300
  377. df = DataFrame(
  378. {"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}, index=list("abcd")
  379. )
  380. result = from_dummies(df)
  381. expected = DataFrame({"": list("abca")}, index=list("abcd"))
  382. tm.assert_frame_equal(result, expected)