test_crosstab.py 32 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. from pandas import (
  5. CategoricalDtype,
  6. CategoricalIndex,
  7. DataFrame,
  8. Index,
  9. MultiIndex,
  10. Series,
  11. crosstab,
  12. )
  13. import pandas._testing as tm
  14. @pytest.fixture
  15. def df():
  16. df = DataFrame(
  17. {
  18. "A": [
  19. "foo",
  20. "foo",
  21. "foo",
  22. "foo",
  23. "bar",
  24. "bar",
  25. "bar",
  26. "bar",
  27. "foo",
  28. "foo",
  29. "foo",
  30. ],
  31. "B": [
  32. "one",
  33. "one",
  34. "one",
  35. "two",
  36. "one",
  37. "one",
  38. "one",
  39. "two",
  40. "two",
  41. "two",
  42. "one",
  43. ],
  44. "C": [
  45. "dull",
  46. "dull",
  47. "shiny",
  48. "dull",
  49. "dull",
  50. "shiny",
  51. "shiny",
  52. "dull",
  53. "shiny",
  54. "shiny",
  55. "shiny",
  56. ],
  57. "D": np.random.default_rng(2).standard_normal(11),
  58. "E": np.random.default_rng(2).standard_normal(11),
  59. "F": np.random.default_rng(2).standard_normal(11),
  60. }
  61. )
  62. return pd.concat([df, df], ignore_index=True)
  63. class TestCrosstab:
  64. def test_crosstab_single(self, df):
  65. result = crosstab(df["A"], df["C"])
  66. expected = df.groupby(["A", "C"]).size().unstack()
  67. tm.assert_frame_equal(result, expected.fillna(0).astype(np.int64))
  68. def test_crosstab_multiple(self, df):
  69. result = crosstab(df["A"], [df["B"], df["C"]])
  70. expected = df.groupby(["A", "B", "C"]).size()
  71. expected = expected.unstack("B").unstack("C").fillna(0).astype(np.int64)
  72. tm.assert_frame_equal(result, expected)
  73. result = crosstab([df["B"], df["C"]], df["A"])
  74. expected = df.groupby(["B", "C", "A"]).size()
  75. expected = expected.unstack("A").fillna(0).astype(np.int64)
  76. tm.assert_frame_equal(result, expected)
  77. @pytest.mark.parametrize("box", [np.array, list, tuple])
  78. def test_crosstab_ndarray(self, box):
  79. # GH 44076
  80. a = box(np.random.default_rng(2).integers(0, 5, size=100))
  81. b = box(np.random.default_rng(2).integers(0, 3, size=100))
  82. c = box(np.random.default_rng(2).integers(0, 10, size=100))
  83. df = DataFrame({"a": a, "b": b, "c": c})
  84. result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"))
  85. expected = crosstab(df["a"], [df["b"], df["c"]])
  86. tm.assert_frame_equal(result, expected)
  87. result = crosstab([b, c], a, colnames=["a"], rownames=("b", "c"))
  88. expected = crosstab([df["b"], df["c"]], df["a"])
  89. tm.assert_frame_equal(result, expected)
  90. # assign arbitrary names
  91. result = crosstab(a, c)
  92. expected = crosstab(df["a"], df["c"])
  93. expected.index.names = ["row_0"]
  94. expected.columns.names = ["col_0"]
  95. tm.assert_frame_equal(result, expected)
  96. def test_crosstab_non_aligned(self):
  97. # GH 17005
  98. a = Series([0, 1, 1], index=["a", "b", "c"])
  99. b = Series([3, 4, 3, 4, 3], index=["a", "b", "c", "d", "f"])
  100. c = np.array([3, 4, 3], dtype=np.int64)
  101. expected = DataFrame(
  102. [[1, 0], [1, 1]],
  103. index=Index([0, 1], name="row_0"),
  104. columns=Index([3, 4], name="col_0"),
  105. )
  106. result = crosstab(a, b)
  107. tm.assert_frame_equal(result, expected)
  108. result = crosstab(a, c)
  109. tm.assert_frame_equal(result, expected)
  110. def test_crosstab_margins(self):
  111. a = np.random.default_rng(2).integers(0, 7, size=100)
  112. b = np.random.default_rng(2).integers(0, 3, size=100)
  113. c = np.random.default_rng(2).integers(0, 5, size=100)
  114. df = DataFrame({"a": a, "b": b, "c": c})
  115. result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"), margins=True)
  116. assert result.index.names == ("a",)
  117. assert result.columns.names == ["b", "c"]
  118. all_cols = result["All", ""]
  119. exp_cols = df.groupby(["a"]).size().astype("i8")
  120. # to keep index.name
  121. exp_margin = Series([len(df)], index=Index(["All"], name="a"))
  122. exp_cols = pd.concat([exp_cols, exp_margin])
  123. exp_cols.name = ("All", "")
  124. tm.assert_series_equal(all_cols, exp_cols)
  125. all_rows = result.loc["All"]
  126. exp_rows = df.groupby(["b", "c"]).size().astype("i8")
  127. exp_rows = pd.concat([exp_rows, Series([len(df)], index=[("All", "")])])
  128. exp_rows.name = "All"
  129. exp_rows = exp_rows.reindex(all_rows.index)
  130. exp_rows = exp_rows.fillna(0).astype(np.int64)
  131. tm.assert_series_equal(all_rows, exp_rows)
  132. def test_crosstab_margins_set_margin_name(self):
  133. # GH 15972
  134. a = np.random.default_rng(2).integers(0, 7, size=100)
  135. b = np.random.default_rng(2).integers(0, 3, size=100)
  136. c = np.random.default_rng(2).integers(0, 5, size=100)
  137. df = DataFrame({"a": a, "b": b, "c": c})
  138. result = crosstab(
  139. a,
  140. [b, c],
  141. rownames=["a"],
  142. colnames=("b", "c"),
  143. margins=True,
  144. margins_name="TOTAL",
  145. )
  146. assert result.index.names == ("a",)
  147. assert result.columns.names == ["b", "c"]
  148. all_cols = result["TOTAL", ""]
  149. exp_cols = df.groupby(["a"]).size().astype("i8")
  150. # to keep index.name
  151. exp_margin = Series([len(df)], index=Index(["TOTAL"], name="a"))
  152. exp_cols = pd.concat([exp_cols, exp_margin])
  153. exp_cols.name = ("TOTAL", "")
  154. tm.assert_series_equal(all_cols, exp_cols)
  155. all_rows = result.loc["TOTAL"]
  156. exp_rows = df.groupby(["b", "c"]).size().astype("i8")
  157. exp_rows = pd.concat([exp_rows, Series([len(df)], index=[("TOTAL", "")])])
  158. exp_rows.name = "TOTAL"
  159. exp_rows = exp_rows.reindex(all_rows.index)
  160. exp_rows = exp_rows.fillna(0).astype(np.int64)
  161. tm.assert_series_equal(all_rows, exp_rows)
  162. msg = "margins_name argument must be a string"
  163. for margins_name in [666, None, ["a", "b"]]:
  164. with pytest.raises(ValueError, match=msg):
  165. crosstab(
  166. a,
  167. [b, c],
  168. rownames=["a"],
  169. colnames=("b", "c"),
  170. margins=True,
  171. margins_name=margins_name,
  172. )
  173. def test_crosstab_pass_values(self):
  174. a = np.random.default_rng(2).integers(0, 7, size=100)
  175. b = np.random.default_rng(2).integers(0, 3, size=100)
  176. c = np.random.default_rng(2).integers(0, 5, size=100)
  177. values = np.random.default_rng(2).standard_normal(100)
  178. table = crosstab(
  179. [a, b], c, values, aggfunc="sum", rownames=["foo", "bar"], colnames=["baz"]
  180. )
  181. df = DataFrame({"foo": a, "bar": b, "baz": c, "values": values})
  182. expected = df.pivot_table(
  183. "values", index=["foo", "bar"], columns="baz", aggfunc="sum"
  184. )
  185. tm.assert_frame_equal(table, expected)
  186. def test_crosstab_dropna(self):
  187. # GH 3820
  188. a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object)
  189. b = np.array(["one", "one", "two", "one", "two", "two", "two"], dtype=object)
  190. c = np.array(
  191. ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object
  192. )
  193. res = crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=False)
  194. m = MultiIndex.from_tuples(
  195. [("one", "dull"), ("one", "shiny"), ("two", "dull"), ("two", "shiny")],
  196. names=["b", "c"],
  197. )
  198. tm.assert_index_equal(res.columns, m)
  199. def test_crosstab_no_overlap(self):
  200. # GS 10291
  201. s1 = Series([1, 2, 3], index=[1, 2, 3])
  202. s2 = Series([4, 5, 6], index=[4, 5, 6])
  203. actual = crosstab(s1, s2)
  204. expected = DataFrame(
  205. index=Index([], dtype="int64", name="row_0"),
  206. columns=Index([], dtype="int64", name="col_0"),
  207. )
  208. tm.assert_frame_equal(actual, expected)
  209. def test_margin_dropna(self):
  210. # GH 12577
  211. # pivot_table counts null into margin ('All')
  212. # when margins=true and dropna=true
  213. df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]})
  214. actual = crosstab(df.a, df.b, margins=True, dropna=True)
  215. expected = DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]])
  216. expected.index = Index([1.0, 2.0, "All"], name="a")
  217. expected.columns = Index([3, 4, "All"], name="b")
  218. tm.assert_frame_equal(actual, expected)
  219. def test_margin_dropna2(self):
  220. df = DataFrame(
  221. {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]}
  222. )
  223. actual = crosstab(df.a, df.b, margins=True, dropna=True)
  224. expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
  225. expected.index = Index([1.0, 2.0, "All"], name="a")
  226. expected.columns = Index([3.0, 4.0, "All"], name="b")
  227. tm.assert_frame_equal(actual, expected)
  228. def test_margin_dropna3(self):
  229. df = DataFrame(
  230. {"a": [1, np.nan, np.nan, np.nan, np.nan, 2], "b": [3, 3, 4, 4, 4, 4]}
  231. )
  232. actual = crosstab(df.a, df.b, margins=True, dropna=True)
  233. expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
  234. expected.index = Index([1.0, 2.0, "All"], name="a")
  235. expected.columns = Index([3, 4, "All"], name="b")
  236. tm.assert_frame_equal(actual, expected)
  237. def test_margin_dropna4(self):
  238. # GH 12642
  239. # _add_margins raises KeyError: Level None not found
  240. # when margins=True and dropna=False
  241. # GH: 10772: Keep np.nan in result with dropna=False
  242. df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]})
  243. actual = crosstab(df.a, df.b, margins=True, dropna=False)
  244. expected = DataFrame([[1, 0, 1.0], [1, 3, 4.0], [0, 1, np.nan], [2, 4, 6.0]])
  245. expected.index = Index([1.0, 2.0, np.nan, "All"], name="a")
  246. expected.columns = Index([3, 4, "All"], name="b")
  247. tm.assert_frame_equal(actual, expected)
  248. def test_margin_dropna5(self):
  249. # GH: 10772: Keep np.nan in result with dropna=False
  250. df = DataFrame(
  251. {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]}
  252. )
  253. actual = crosstab(df.a, df.b, margins=True, dropna=False)
  254. expected = DataFrame(
  255. [[1, 0, 0, 1.0], [0, 1, 0, 1.0], [0, 3, 1, np.nan], [1, 4, 0, 6.0]]
  256. )
  257. expected.index = Index([1.0, 2.0, np.nan, "All"], name="a")
  258. expected.columns = Index([3.0, 4.0, np.nan, "All"], name="b")
  259. tm.assert_frame_equal(actual, expected)
  260. def test_margin_dropna6(self):
  261. # GH: 10772: Keep np.nan in result with dropna=False
  262. a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object)
  263. b = np.array(["one", "one", "two", "one", "two", np.nan, "two"], dtype=object)
  264. c = np.array(
  265. ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object
  266. )
  267. actual = crosstab(
  268. a, [b, c], rownames=["a"], colnames=["b", "c"], margins=True, dropna=False
  269. )
  270. m = MultiIndex.from_arrays(
  271. [
  272. ["one", "one", "two", "two", np.nan, np.nan, "All"],
  273. ["dull", "shiny", "dull", "shiny", "dull", "shiny", ""],
  274. ],
  275. names=["b", "c"],
  276. )
  277. expected = DataFrame(
  278. [[1, 0, 1, 0, 0, 0, 2], [2, 0, 1, 1, 0, 1, 5], [3, 0, 2, 1, 0, 0, 7]],
  279. columns=m,
  280. )
  281. expected.index = Index(["bar", "foo", "All"], name="a")
  282. tm.assert_frame_equal(actual, expected)
  283. actual = crosstab(
  284. [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=False
  285. )
  286. m = MultiIndex.from_arrays(
  287. [
  288. ["bar", "bar", "bar", "foo", "foo", "foo", "All"],
  289. ["one", "two", np.nan, "one", "two", np.nan, ""],
  290. ],
  291. names=["a", "b"],
  292. )
  293. expected = DataFrame(
  294. [
  295. [1, 0, 1.0],
  296. [1, 0, 1.0],
  297. [0, 0, np.nan],
  298. [2, 0, 2.0],
  299. [1, 1, 2.0],
  300. [0, 1, np.nan],
  301. [5, 2, 7.0],
  302. ],
  303. index=m,
  304. )
  305. expected.columns = Index(["dull", "shiny", "All"], name="c")
  306. tm.assert_frame_equal(actual, expected)
  307. actual = crosstab(
  308. [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=True
  309. )
  310. m = MultiIndex.from_arrays(
  311. [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]],
  312. names=["a", "b"],
  313. )
  314. expected = DataFrame(
  315. [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 1, 6]], index=m
  316. )
  317. expected.columns = Index(["dull", "shiny", "All"], name="c")
  318. tm.assert_frame_equal(actual, expected)
  319. def test_crosstab_normalize(self):
  320. # Issue 12578
  321. df = DataFrame(
  322. {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
  323. )
  324. rindex = Index([1, 2], name="a")
  325. cindex = Index([3, 4], name="b")
  326. full_normal = DataFrame([[0.2, 0], [0.2, 0.6]], index=rindex, columns=cindex)
  327. row_normal = DataFrame([[1.0, 0], [0.25, 0.75]], index=rindex, columns=cindex)
  328. col_normal = DataFrame([[0.5, 0], [0.5, 1.0]], index=rindex, columns=cindex)
  329. # Check all normalize args
  330. tm.assert_frame_equal(crosstab(df.a, df.b, normalize="all"), full_normal)
  331. tm.assert_frame_equal(crosstab(df.a, df.b, normalize=True), full_normal)
  332. tm.assert_frame_equal(crosstab(df.a, df.b, normalize="index"), row_normal)
  333. tm.assert_frame_equal(crosstab(df.a, df.b, normalize="columns"), col_normal)
  334. tm.assert_frame_equal(
  335. crosstab(df.a, df.b, normalize=1),
  336. crosstab(df.a, df.b, normalize="columns"),
  337. )
  338. tm.assert_frame_equal(
  339. crosstab(df.a, df.b, normalize=0), crosstab(df.a, df.b, normalize="index")
  340. )
  341. row_normal_margins = DataFrame(
  342. [[1.0, 0], [0.25, 0.75], [0.4, 0.6]],
  343. index=Index([1, 2, "All"], name="a", dtype="object"),
  344. columns=Index([3, 4], name="b", dtype="object"),
  345. )
  346. col_normal_margins = DataFrame(
  347. [[0.5, 0, 0.2], [0.5, 1.0, 0.8]],
  348. index=Index([1, 2], name="a", dtype="object"),
  349. columns=Index([3, 4, "All"], name="b", dtype="object"),
  350. )
  351. all_normal_margins = DataFrame(
  352. [[0.2, 0, 0.2], [0.2, 0.6, 0.8], [0.4, 0.6, 1]],
  353. index=Index([1, 2, "All"], name="a", dtype="object"),
  354. columns=Index([3, 4, "All"], name="b", dtype="object"),
  355. )
  356. tm.assert_frame_equal(
  357. crosstab(df.a, df.b, normalize="index", margins=True), row_normal_margins
  358. )
  359. tm.assert_frame_equal(
  360. crosstab(df.a, df.b, normalize="columns", margins=True), col_normal_margins
  361. )
  362. tm.assert_frame_equal(
  363. crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins
  364. )
  365. def test_crosstab_normalize_arrays(self):
  366. # GH#12578
  367. df = DataFrame(
  368. {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
  369. )
  370. # Test arrays
  371. crosstab(
  372. [np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])], np.array([1, 2, 1, 2])
  373. )
  374. # Test with aggfunc
  375. norm_counts = DataFrame(
  376. [[0.25, 0, 0.25], [0.25, 0.5, 0.75], [0.5, 0.5, 1]],
  377. index=Index([1, 2, "All"], name="a", dtype="object"),
  378. columns=Index([3, 4, "All"], name="b"),
  379. )
  380. test_case = crosstab(
  381. df.a, df.b, df.c, aggfunc="count", normalize="all", margins=True
  382. )
  383. tm.assert_frame_equal(test_case, norm_counts)
  384. df = DataFrame(
  385. {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [0, 4, np.nan, 3, 3]}
  386. )
  387. norm_sum = DataFrame(
  388. [[0, 0, 0.0], [0.4, 0.6, 1], [0.4, 0.6, 1]],
  389. index=Index([1, 2, "All"], name="a", dtype="object"),
  390. columns=Index([3, 4, "All"], name="b", dtype="object"),
  391. )
  392. msg = "using DataFrameGroupBy.sum"
  393. with tm.assert_produces_warning(FutureWarning, match=msg):
  394. test_case = crosstab(
  395. df.a, df.b, df.c, aggfunc=np.sum, normalize="all", margins=True
  396. )
  397. tm.assert_frame_equal(test_case, norm_sum)
  398. def test_crosstab_with_empties(self, using_array_manager):
  399. # Check handling of empties
  400. df = DataFrame(
  401. {
  402. "a": [1, 2, 2, 2, 2],
  403. "b": [3, 3, 4, 4, 4],
  404. "c": [np.nan, np.nan, np.nan, np.nan, np.nan],
  405. }
  406. )
  407. empty = DataFrame(
  408. [[0.0, 0.0], [0.0, 0.0]],
  409. index=Index([1, 2], name="a", dtype="int64"),
  410. columns=Index([3, 4], name="b"),
  411. )
  412. for i in [True, "index", "columns"]:
  413. calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=i)
  414. tm.assert_frame_equal(empty, calculated)
  415. nans = DataFrame(
  416. [[0.0, np.nan], [0.0, 0.0]],
  417. index=Index([1, 2], name="a", dtype="int64"),
  418. columns=Index([3, 4], name="b"),
  419. )
  420. if using_array_manager:
  421. # INFO(ArrayManager) column without NaNs can preserve int dtype
  422. nans[3] = nans[3].astype("int64")
  423. calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=False)
  424. tm.assert_frame_equal(nans, calculated)
  425. def test_crosstab_errors(self):
  426. # Issue 12578
  427. df = DataFrame(
  428. {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
  429. )
  430. error = "values cannot be used without an aggfunc."
  431. with pytest.raises(ValueError, match=error):
  432. crosstab(df.a, df.b, values=df.c)
  433. error = "aggfunc cannot be used without values"
  434. with pytest.raises(ValueError, match=error):
  435. crosstab(df.a, df.b, aggfunc=np.mean)
  436. error = "Not a valid normalize argument"
  437. with pytest.raises(ValueError, match=error):
  438. crosstab(df.a, df.b, normalize="42")
  439. with pytest.raises(ValueError, match=error):
  440. crosstab(df.a, df.b, normalize=42)
  441. error = "Not a valid margins argument"
  442. with pytest.raises(ValueError, match=error):
  443. crosstab(df.a, df.b, normalize="all", margins=42)
  444. def test_crosstab_with_categorial_columns(self):
  445. # GH 8860
  446. df = DataFrame(
  447. {
  448. "MAKE": ["Honda", "Acura", "Tesla", "Honda", "Honda", "Acura"],
  449. "MODEL": ["Sedan", "Sedan", "Electric", "Pickup", "Sedan", "Sedan"],
  450. }
  451. )
  452. categories = ["Sedan", "Electric", "Pickup"]
  453. df["MODEL"] = df["MODEL"].astype("category").cat.set_categories(categories)
  454. result = crosstab(df["MAKE"], df["MODEL"])
  455. expected_index = Index(["Acura", "Honda", "Tesla"], name="MAKE")
  456. expected_columns = CategoricalIndex(
  457. categories, categories=categories, ordered=False, name="MODEL"
  458. )
  459. expected_data = [[2, 0, 0], [2, 0, 1], [0, 1, 0]]
  460. expected = DataFrame(
  461. expected_data, index=expected_index, columns=expected_columns
  462. )
  463. tm.assert_frame_equal(result, expected)
  464. def test_crosstab_with_numpy_size(self):
  465. # GH 4003
  466. df = DataFrame(
  467. {
  468. "A": ["one", "one", "two", "three"] * 6,
  469. "B": ["A", "B", "C"] * 8,
  470. "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4,
  471. "D": np.random.default_rng(2).standard_normal(24),
  472. "E": np.random.default_rng(2).standard_normal(24),
  473. }
  474. )
  475. result = crosstab(
  476. index=[df["A"], df["B"]],
  477. columns=[df["C"]],
  478. margins=True,
  479. aggfunc=np.size,
  480. values=df["D"],
  481. )
  482. expected_index = MultiIndex(
  483. levels=[["All", "one", "three", "two"], ["", "A", "B", "C"]],
  484. codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]],
  485. names=["A", "B"],
  486. )
  487. expected_column = Index(["bar", "foo", "All"], name="C")
  488. expected_data = np.array(
  489. [
  490. [2.0, 2.0, 4.0],
  491. [2.0, 2.0, 4.0],
  492. [2.0, 2.0, 4.0],
  493. [2.0, np.nan, 2.0],
  494. [np.nan, 2.0, 2.0],
  495. [2.0, np.nan, 2.0],
  496. [np.nan, 2.0, 2.0],
  497. [2.0, np.nan, 2.0],
  498. [np.nan, 2.0, 2.0],
  499. [12.0, 12.0, 24.0],
  500. ]
  501. )
  502. expected = DataFrame(
  503. expected_data, index=expected_index, columns=expected_column
  504. )
  505. # aggfunc is np.size, resulting in integers
  506. expected["All"] = expected["All"].astype("int64")
  507. tm.assert_frame_equal(result, expected)
  508. def test_crosstab_duplicate_names(self):
  509. # GH 13279 / 22529
  510. s1 = Series(range(3), name="foo")
  511. s2_foo = Series(range(1, 4), name="foo")
  512. s2_bar = Series(range(1, 4), name="bar")
  513. s3 = Series(range(3), name="waldo")
  514. # check result computed with duplicate labels against
  515. # result computed with unique labels, then relabelled
  516. mapper = {"bar": "foo"}
  517. # duplicate row, column labels
  518. result = crosstab(s1, s2_foo)
  519. expected = crosstab(s1, s2_bar).rename_axis(columns=mapper, axis=1)
  520. tm.assert_frame_equal(result, expected)
  521. # duplicate row, unique column labels
  522. result = crosstab([s1, s2_foo], s3)
  523. expected = crosstab([s1, s2_bar], s3).rename_axis(index=mapper, axis=0)
  524. tm.assert_frame_equal(result, expected)
  525. # unique row, duplicate column labels
  526. result = crosstab(s3, [s1, s2_foo])
  527. expected = crosstab(s3, [s1, s2_bar]).rename_axis(columns=mapper, axis=1)
  528. tm.assert_frame_equal(result, expected)
  529. @pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]])
  530. def test_crosstab_tuple_name(self, names):
  531. s1 = Series(range(3), name=names[0])
  532. s2 = Series(range(1, 4), name=names[1])
  533. mi = MultiIndex.from_arrays([range(3), range(1, 4)], names=names)
  534. expected = Series(1, index=mi).unstack(1, fill_value=0)
  535. result = crosstab(s1, s2)
  536. tm.assert_frame_equal(result, expected)
  537. def test_crosstab_both_tuple_names(self):
  538. # GH 18321
  539. s1 = Series(range(3), name=("a", "b"))
  540. s2 = Series(range(3), name=("c", "d"))
  541. expected = DataFrame(
  542. np.eye(3, dtype="int64"),
  543. index=Index(range(3), name=("a", "b")),
  544. columns=Index(range(3), name=("c", "d")),
  545. )
  546. result = crosstab(s1, s2)
  547. tm.assert_frame_equal(result, expected)
  548. def test_crosstab_unsorted_order(self):
  549. df = DataFrame({"b": [3, 1, 2], "a": [5, 4, 6]}, index=["C", "A", "B"])
  550. result = crosstab(df.index, [df.b, df.a])
  551. e_idx = Index(["A", "B", "C"], name="row_0")
  552. e_columns = MultiIndex.from_tuples([(1, 4), (2, 6), (3, 5)], names=["b", "a"])
  553. expected = DataFrame(
  554. [[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns
  555. )
  556. tm.assert_frame_equal(result, expected)
  557. def test_crosstab_normalize_multiple_columns(self):
  558. # GH 15150
  559. df = DataFrame(
  560. {
  561. "A": ["one", "one", "two", "three"] * 6,
  562. "B": ["A", "B", "C"] * 8,
  563. "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4,
  564. "D": [0] * 24,
  565. "E": [0] * 24,
  566. }
  567. )
  568. msg = "using DataFrameGroupBy.sum"
  569. with tm.assert_produces_warning(FutureWarning, match=msg):
  570. result = crosstab(
  571. [df.A, df.B],
  572. df.C,
  573. values=df.D,
  574. aggfunc=np.sum,
  575. normalize=True,
  576. margins=True,
  577. )
  578. expected = DataFrame(
  579. np.array([0] * 29 + [1], dtype=float).reshape(10, 3),
  580. columns=Index(["bar", "foo", "All"], name="C"),
  581. index=MultiIndex.from_tuples(
  582. [
  583. ("one", "A"),
  584. ("one", "B"),
  585. ("one", "C"),
  586. ("three", "A"),
  587. ("three", "B"),
  588. ("three", "C"),
  589. ("two", "A"),
  590. ("two", "B"),
  591. ("two", "C"),
  592. ("All", ""),
  593. ],
  594. names=["A", "B"],
  595. ),
  596. )
  597. tm.assert_frame_equal(result, expected)
  598. def test_margin_normalize(self):
  599. # GH 27500
  600. df = DataFrame(
  601. {
  602. "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
  603. "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
  604. "C": [
  605. "small",
  606. "large",
  607. "large",
  608. "small",
  609. "small",
  610. "large",
  611. "small",
  612. "small",
  613. "large",
  614. ],
  615. "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
  616. "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
  617. }
  618. )
  619. # normalize on index
  620. result = crosstab(
  621. [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0
  622. )
  623. expected = DataFrame(
  624. [[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]]
  625. )
  626. expected.index = MultiIndex(
  627. levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
  628. codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
  629. names=["A", "B"],
  630. )
  631. expected.columns = Index(["large", "small"], name="C")
  632. tm.assert_frame_equal(result, expected)
  633. # normalize on columns
  634. result = crosstab(
  635. [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1
  636. )
  637. expected = DataFrame(
  638. [
  639. [0.25, 0.2, 0.222222],
  640. [0.25, 0.2, 0.222222],
  641. [0.5, 0.2, 0.333333],
  642. [0, 0.4, 0.222222],
  643. ]
  644. )
  645. expected.columns = Index(["large", "small", "Sub-Total"], name="C")
  646. expected.index = MultiIndex(
  647. levels=[["bar", "foo"], ["one", "two"]],
  648. codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
  649. names=["A", "B"],
  650. )
  651. tm.assert_frame_equal(result, expected)
  652. # normalize on both index and column
  653. result = crosstab(
  654. [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True
  655. )
  656. expected = DataFrame(
  657. [
  658. [0.111111, 0.111111, 0.222222],
  659. [0.111111, 0.111111, 0.222222],
  660. [0.222222, 0.111111, 0.333333],
  661. [0.000000, 0.222222, 0.222222],
  662. [0.444444, 0.555555, 1],
  663. ]
  664. )
  665. expected.columns = Index(["large", "small", "Sub-Total"], name="C")
  666. expected.index = MultiIndex(
  667. levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
  668. codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
  669. names=["A", "B"],
  670. )
  671. tm.assert_frame_equal(result, expected)
  672. def test_margin_normalize_multiple_columns(self):
  673. # GH 35144
  674. # use multiple columns with margins and normalization
  675. df = DataFrame(
  676. {
  677. "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
  678. "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
  679. "C": [
  680. "small",
  681. "large",
  682. "large",
  683. "small",
  684. "small",
  685. "large",
  686. "small",
  687. "small",
  688. "large",
  689. ],
  690. "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
  691. "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
  692. }
  693. )
  694. result = crosstab(
  695. index=df.C,
  696. columns=[df.A, df.B],
  697. margins=True,
  698. margins_name="margin",
  699. normalize=True,
  700. )
  701. expected = DataFrame(
  702. [
  703. [0.111111, 0.111111, 0.222222, 0.000000, 0.444444],
  704. [0.111111, 0.111111, 0.111111, 0.222222, 0.555556],
  705. [0.222222, 0.222222, 0.333333, 0.222222, 1.0],
  706. ],
  707. index=["large", "small", "margin"],
  708. )
  709. expected.columns = MultiIndex(
  710. levels=[["bar", "foo", "margin"], ["", "one", "two"]],
  711. codes=[[0, 0, 1, 1, 2], [1, 2, 1, 2, 0]],
  712. names=["A", "B"],
  713. )
  714. expected.index.name = "C"
  715. tm.assert_frame_equal(result, expected)
  716. def test_margin_support_Float(self):
  717. # GH 50313
  718. # use Float64 formats and function aggfunc with margins
  719. df = DataFrame(
  720. {"A": [1, 2, 2, 1], "B": [3, 3, 4, 5], "C": [-1.0, 10.0, 1.0, 10.0]},
  721. dtype="Float64",
  722. )
  723. result = crosstab(
  724. df["A"],
  725. df["B"],
  726. values=df["C"],
  727. aggfunc="sum",
  728. margins=True,
  729. )
  730. expected = DataFrame(
  731. [
  732. [-1.0, pd.NA, 10.0, 9.0],
  733. [10.0, 1.0, pd.NA, 11.0],
  734. [9.0, 1.0, 10.0, 20.0],
  735. ],
  736. index=Index([1.0, 2.0, "All"], dtype="object", name="A"),
  737. columns=Index([3.0, 4.0, 5.0, "All"], dtype="object", name="B"),
  738. dtype="Float64",
  739. )
  740. tm.assert_frame_equal(result, expected)
  741. def test_margin_with_ordered_categorical_column(self):
  742. # GH 25278
  743. df = DataFrame(
  744. {
  745. "First": ["B", "B", "C", "A", "B", "C"],
  746. "Second": ["C", "B", "B", "B", "C", "A"],
  747. }
  748. )
  749. df["First"] = df["First"].astype(CategoricalDtype(ordered=True))
  750. customized_categories_order = ["C", "A", "B"]
  751. df["First"] = df["First"].cat.reorder_categories(customized_categories_order)
  752. result = crosstab(df["First"], df["Second"], margins=True)
  753. expected_index = Index(["C", "A", "B", "All"], name="First")
  754. expected_columns = Index(["A", "B", "C", "All"], name="Second")
  755. expected_data = [[1, 1, 0, 2], [0, 1, 0, 1], [0, 1, 2, 3], [1, 3, 2, 6]]
  756. expected = DataFrame(
  757. expected_data, index=expected_index, columns=expected_columns
  758. )
  759. tm.assert_frame_equal(result, expected)
  760. @pytest.mark.parametrize("a_dtype", ["category", "int64"])
  761. @pytest.mark.parametrize("b_dtype", ["category", "int64"])
  762. def test_categoricals(a_dtype, b_dtype):
  763. # https://github.com/pandas-dev/pandas/issues/37465
  764. g = np.random.default_rng(2)
  765. a = Series(g.integers(0, 3, size=100)).astype(a_dtype)
  766. b = Series(g.integers(0, 2, size=100)).astype(b_dtype)
  767. result = crosstab(a, b, margins=True, dropna=False)
  768. columns = Index([0, 1, "All"], dtype="object", name="col_0")
  769. index = Index([0, 1, 2, "All"], dtype="object", name="row_0")
  770. values = [[10, 18, 28], [23, 16, 39], [17, 16, 33], [50, 50, 100]]
  771. expected = DataFrame(values, index, columns)
  772. tm.assert_frame_equal(result, expected)
  773. # Verify when categorical does not have all values present
  774. a.loc[a == 1] = 2
  775. a_is_cat = isinstance(a.dtype, CategoricalDtype)
  776. assert not a_is_cat or a.value_counts().loc[1] == 0
  777. result = crosstab(a, b, margins=True, dropna=False)
  778. values = [[10, 18, 28], [0, 0, 0], [40, 32, 72], [50, 50, 100]]
  779. expected = DataFrame(values, index, columns)
  780. if not a_is_cat:
  781. expected = expected.loc[[0, 2, "All"]]
  782. expected["All"] = expected["All"].astype("int64")
  783. tm.assert_frame_equal(result, expected)