test_stack_unstack.py 95 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684
  1. from datetime import datetime
  2. import itertools
  3. import re
  4. import numpy as np
  5. import pytest
  6. from pandas._libs import lib
  7. from pandas.errors import PerformanceWarning
  8. import pandas as pd
  9. from pandas import (
  10. DataFrame,
  11. Index,
  12. MultiIndex,
  13. Period,
  14. Series,
  15. Timedelta,
  16. date_range,
  17. )
  18. import pandas._testing as tm
  19. from pandas.core.reshape import reshape as reshape_lib
  @pytest.fixture(params=[True, False])
  def future_stack(request):
      """Provide each ``future_stack`` setting so tests run under both values."""
      flag = request.param
      return flag
  23. class TestDataFrameReshape:
  24. def test_stack_unstack(self, float_frame, future_stack):
  25. df = float_frame.copy()
  26. df[:] = np.arange(np.prod(df.shape)).reshape(df.shape)
  27. stacked = df.stack(future_stack=future_stack)
  28. stacked_df = DataFrame({"foo": stacked, "bar": stacked})
  29. unstacked = stacked.unstack()
  30. unstacked_df = stacked_df.unstack()
  31. tm.assert_frame_equal(unstacked, df)
  32. tm.assert_frame_equal(unstacked_df["bar"], df)
  33. unstacked_cols = stacked.unstack(0)
  34. unstacked_cols_df = stacked_df.unstack(0)
  35. tm.assert_frame_equal(unstacked_cols.T, df)
  36. tm.assert_frame_equal(unstacked_cols_df["bar"].T, df)
  37. @pytest.mark.filterwarnings(
  38. "ignore:The previous implementation of stack is deprecated"
  39. )
  40. def test_stack_mixed_level(self, future_stack):
  41. # GH 18310
  42. levels = [range(3), [3, "a", "b"], [1, 2]]
  43. # flat columns:
  44. df = DataFrame(1, index=levels[0], columns=levels[1])
  45. result = df.stack(future_stack=future_stack)
  46. expected = Series(1, index=MultiIndex.from_product(levels[:2]))
  47. tm.assert_series_equal(result, expected)
  48. # MultiIndex columns:
  49. df = DataFrame(1, index=levels[0], columns=MultiIndex.from_product(levels[1:]))
  50. result = df.stack(1, future_stack=future_stack)
  51. expected = DataFrame(
  52. 1, index=MultiIndex.from_product([levels[0], levels[2]]), columns=levels[1]
  53. )
  54. tm.assert_frame_equal(result, expected)
  55. # as above, but used labels in level are actually of homogeneous type
  56. result = df[["a", "b"]].stack(1, future_stack=future_stack)
  57. expected = expected[["a", "b"]]
  58. tm.assert_frame_equal(result, expected)
  59. def test_unstack_not_consolidated(self, using_array_manager):
  60. # Gh#34708
  61. df = DataFrame({"x": [1, 2, np.nan], "y": [3.0, 4, np.nan]})
  62. df2 = df[["x"]]
  63. df2["y"] = df["y"]
  64. if not using_array_manager:
  65. assert len(df2._mgr.blocks) == 2
  66. res = df2.unstack()
  67. expected = df.unstack()
  68. tm.assert_series_equal(res, expected)
  69. @pytest.mark.filterwarnings(
  70. "ignore:The previous implementation of stack is deprecated"
  71. )
  72. def test_unstack_fill(self, future_stack):
  73. # GH #9746: fill_value keyword argument for Series
  74. # and DataFrame unstack
  75. # From a series
  76. data = Series([1, 2, 4, 5], dtype=np.int16)
  77. data.index = MultiIndex.from_tuples(
  78. [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
  79. )
  80. result = data.unstack(fill_value=-1)
  81. expected = DataFrame(
  82. {"a": [1, -1, 5], "b": [2, 4, -1]}, index=["x", "y", "z"], dtype=np.int16
  83. )
  84. tm.assert_frame_equal(result, expected)
  85. # From a series with incorrect data type for fill_value
  86. result = data.unstack(fill_value=0.5)
  87. expected = DataFrame(
  88. {"a": [1, 0.5, 5], "b": [2, 4, 0.5]}, index=["x", "y", "z"], dtype=float
  89. )
  90. tm.assert_frame_equal(result, expected)
  91. # GH #13971: fill_value when unstacking multiple levels:
  92. df = DataFrame(
  93. {"x": ["a", "a", "b"], "y": ["j", "k", "j"], "z": [0, 1, 2], "w": [0, 1, 2]}
  94. ).set_index(["x", "y", "z"])
  95. unstacked = df.unstack(["x", "y"], fill_value=0)
  96. key = ("w", "b", "j")
  97. expected = unstacked[key]
  98. result = Series([0, 0, 2], index=unstacked.index, name=key)
  99. tm.assert_series_equal(result, expected)
  100. stacked = unstacked.stack(["x", "y"], future_stack=future_stack)
  101. stacked.index = stacked.index.reorder_levels(df.index.names)
  102. # Workaround for GH #17886 (unnecessarily casts to float):
  103. stacked = stacked.astype(np.int64)
  104. result = stacked.loc[df.index]
  105. tm.assert_frame_equal(result, df)
  106. # From a series
  107. s = df["w"]
  108. result = s.unstack(["x", "y"], fill_value=0)
  109. expected = unstacked["w"]
  110. tm.assert_frame_equal(result, expected)
  111. def test_unstack_fill_frame(self):
  112. # From a dataframe
  113. rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
  114. df = DataFrame(rows, columns=list("AB"), dtype=np.int32)
  115. df.index = MultiIndex.from_tuples(
  116. [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
  117. )
  118. result = df.unstack(fill_value=-1)
  119. rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
  120. expected = DataFrame(rows, index=list("xyz"), dtype=np.int32)
  121. expected.columns = MultiIndex.from_tuples(
  122. [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")]
  123. )
  124. tm.assert_frame_equal(result, expected)
  125. # From a mixed type dataframe
  126. df["A"] = df["A"].astype(np.int16)
  127. df["B"] = df["B"].astype(np.float64)
  128. result = df.unstack(fill_value=-1)
  129. expected["A"] = expected["A"].astype(np.int16)
  130. expected["B"] = expected["B"].astype(np.float64)
  131. tm.assert_frame_equal(result, expected)
  132. # From a dataframe with incorrect data type for fill_value
  133. result = df.unstack(fill_value=0.5)
  134. rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
  135. expected = DataFrame(rows, index=list("xyz"), dtype=float)
  136. expected.columns = MultiIndex.from_tuples(
  137. [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")]
  138. )
  139. tm.assert_frame_equal(result, expected)
  140. def test_unstack_fill_frame_datetime(self):
  141. # Test unstacking with date times
  142. dv = date_range("2012-01-01", periods=4).values
  143. data = Series(dv)
  144. data.index = MultiIndex.from_tuples(
  145. [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
  146. )
  147. result = data.unstack()
  148. expected = DataFrame(
  149. {"a": [dv[0], pd.NaT, dv[3]], "b": [dv[1], dv[2], pd.NaT]},
  150. index=["x", "y", "z"],
  151. )
  152. tm.assert_frame_equal(result, expected)
  153. result = data.unstack(fill_value=dv[0])
  154. expected = DataFrame(
  155. {"a": [dv[0], dv[0], dv[3]], "b": [dv[1], dv[2], dv[0]]},
  156. index=["x", "y", "z"],
  157. )
  158. tm.assert_frame_equal(result, expected)
  159. def test_unstack_fill_frame_timedelta(self):
  160. # Test unstacking with time deltas
  161. td = [Timedelta(days=i) for i in range(4)]
  162. data = Series(td)
  163. data.index = MultiIndex.from_tuples(
  164. [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
  165. )
  166. result = data.unstack()
  167. expected = DataFrame(
  168. {"a": [td[0], pd.NaT, td[3]], "b": [td[1], td[2], pd.NaT]},
  169. index=["x", "y", "z"],
  170. )
  171. tm.assert_frame_equal(result, expected)
  172. result = data.unstack(fill_value=td[1])
  173. expected = DataFrame(
  174. {"a": [td[0], td[1], td[3]], "b": [td[1], td[2], td[1]]},
  175. index=["x", "y", "z"],
  176. )
  177. tm.assert_frame_equal(result, expected)
  178. def test_unstack_fill_frame_period(self):
  179. # Test unstacking with period
  180. periods = [
  181. Period("2012-01"),
  182. Period("2012-02"),
  183. Period("2012-03"),
  184. Period("2012-04"),
  185. ]
  186. data = Series(periods)
  187. data.index = MultiIndex.from_tuples(
  188. [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
  189. )
  190. result = data.unstack()
  191. expected = DataFrame(
  192. {"a": [periods[0], None, periods[3]], "b": [periods[1], periods[2], None]},
  193. index=["x", "y", "z"],
  194. )
  195. tm.assert_frame_equal(result, expected)
  196. result = data.unstack(fill_value=periods[1])
  197. expected = DataFrame(
  198. {
  199. "a": [periods[0], periods[1], periods[3]],
  200. "b": [periods[1], periods[2], periods[1]],
  201. },
  202. index=["x", "y", "z"],
  203. )
  204. tm.assert_frame_equal(result, expected)
  205. def test_unstack_fill_frame_categorical(self):
  206. # Test unstacking with categorical
  207. data = Series(["a", "b", "c", "a"], dtype="category")
  208. data.index = MultiIndex.from_tuples(
  209. [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
  210. )
  211. # By default missing values will be NaN
  212. result = data.unstack()
  213. expected = DataFrame(
  214. {
  215. "a": pd.Categorical(list("axa"), categories=list("abc")),
  216. "b": pd.Categorical(list("bcx"), categories=list("abc")),
  217. },
  218. index=list("xyz"),
  219. )
  220. tm.assert_frame_equal(result, expected)
  221. # Fill with non-category results in a ValueError
  222. msg = r"Cannot setitem on a Categorical with a new category \(d\)"
  223. with pytest.raises(TypeError, match=msg):
  224. data.unstack(fill_value="d")
  225. # Fill with category value replaces missing values as expected
  226. result = data.unstack(fill_value="c")
  227. expected = DataFrame(
  228. {
  229. "a": pd.Categorical(list("aca"), categories=list("abc")),
  230. "b": pd.Categorical(list("bcc"), categories=list("abc")),
  231. },
  232. index=list("xyz"),
  233. )
  234. tm.assert_frame_equal(result, expected)
  235. def test_unstack_tuplename_in_multiindex(self):
  236. # GH 19966
  237. idx = MultiIndex.from_product(
  238. [["a", "b", "c"], [1, 2, 3]], names=[("A", "a"), ("B", "b")]
  239. )
  240. df = DataFrame({"d": [1] * 9, "e": [2] * 9}, index=idx)
  241. result = df.unstack(("A", "a"))
  242. expected = DataFrame(
  243. [[1, 1, 1, 2, 2, 2], [1, 1, 1, 2, 2, 2], [1, 1, 1, 2, 2, 2]],
  244. columns=MultiIndex.from_tuples(
  245. [
  246. ("d", "a"),
  247. ("d", "b"),
  248. ("d", "c"),
  249. ("e", "a"),
  250. ("e", "b"),
  251. ("e", "c"),
  252. ],
  253. names=[None, ("A", "a")],
  254. ),
  255. index=Index([1, 2, 3], name=("B", "b")),
  256. )
  257. tm.assert_frame_equal(result, expected)
  258. @pytest.mark.parametrize(
  259. "unstack_idx, expected_values, expected_index, expected_columns",
  260. [
  261. (
  262. ("A", "a"),
  263. [[1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2]],
  264. MultiIndex.from_tuples(
  265. [(1, 3), (1, 4), (2, 3), (2, 4)], names=["B", "C"]
  266. ),
  267. MultiIndex.from_tuples(
  268. [("d", "a"), ("d", "b"), ("e", "a"), ("e", "b")],
  269. names=[None, ("A", "a")],
  270. ),
  271. ),
  272. (
  273. (("A", "a"), "B"),
  274. [[1, 1, 1, 1, 2, 2, 2, 2], [1, 1, 1, 1, 2, 2, 2, 2]],
  275. Index([3, 4], name="C"),
  276. MultiIndex.from_tuples(
  277. [
  278. ("d", "a", 1),
  279. ("d", "a", 2),
  280. ("d", "b", 1),
  281. ("d", "b", 2),
  282. ("e", "a", 1),
  283. ("e", "a", 2),
  284. ("e", "b", 1),
  285. ("e", "b", 2),
  286. ],
  287. names=[None, ("A", "a"), "B"],
  288. ),
  289. ),
  290. ],
  291. )
  292. def test_unstack_mixed_type_name_in_multiindex(
  293. self, unstack_idx, expected_values, expected_index, expected_columns
  294. ):
  295. # GH 19966
  296. idx = MultiIndex.from_product(
  297. [["a", "b"], [1, 2], [3, 4]], names=[("A", "a"), "B", "C"]
  298. )
  299. df = DataFrame({"d": [1] * 8, "e": [2] * 8}, index=idx)
  300. result = df.unstack(unstack_idx)
  301. expected = DataFrame(
  302. expected_values, columns=expected_columns, index=expected_index
  303. )
  304. tm.assert_frame_equal(result, expected)
  305. def test_unstack_preserve_dtypes(self):
  306. # Checks fix for #11847
  307. df = DataFrame(
  308. {
  309. "state": ["IL", "MI", "NC"],
  310. "index": ["a", "b", "c"],
  311. "some_categories": Series(["a", "b", "c"]).astype("category"),
  312. "A": np.random.default_rng(2).random(3),
  313. "B": 1,
  314. "C": "foo",
  315. "D": pd.Timestamp("20010102"),
  316. "E": Series([1.0, 50.0, 100.0]).astype("float32"),
  317. "F": Series([3.0, 4.0, 5.0]).astype("float64"),
  318. "G": False,
  319. "H": Series([1, 200, 923442]).astype("int8"),
  320. }
  321. )
  322. def unstack_and_compare(df, column_name):
  323. unstacked1 = df.unstack([column_name])
  324. unstacked2 = df.unstack(column_name)
  325. tm.assert_frame_equal(unstacked1, unstacked2)
  326. df1 = df.set_index(["state", "index"])
  327. unstack_and_compare(df1, "index")
  328. df1 = df.set_index(["state", "some_categories"])
  329. unstack_and_compare(df1, "some_categories")
  330. df1 = df.set_index(["F", "C"])
  331. unstack_and_compare(df1, "F")
  332. df1 = df.set_index(["G", "B", "state"])
  333. unstack_and_compare(df1, "B")
  334. df1 = df.set_index(["E", "A"])
  335. unstack_and_compare(df1, "E")
  336. df1 = df.set_index(["state", "index"])
  337. s = df1["A"]
  338. unstack_and_compare(s, "index")
  339. @pytest.mark.filterwarnings(
  340. "ignore:The previous implementation of stack is deprecated"
  341. )
  342. def test_stack_ints(self, future_stack):
  343. columns = MultiIndex.from_tuples(list(itertools.product(range(3), repeat=3)))
  344. df = DataFrame(
  345. np.random.default_rng(2).standard_normal((30, 27)), columns=columns
  346. )
  347. tm.assert_frame_equal(
  348. df.stack(level=[1, 2], future_stack=future_stack),
  349. df.stack(level=1, future_stack=future_stack).stack(
  350. level=1, future_stack=future_stack
  351. ),
  352. )
  353. tm.assert_frame_equal(
  354. df.stack(level=[-2, -1], future_stack=future_stack),
  355. df.stack(level=1, future_stack=future_stack).stack(
  356. level=1, future_stack=future_stack
  357. ),
  358. )
  359. df_named = df.copy()
  360. return_value = df_named.columns.set_names(range(3), inplace=True)
  361. assert return_value is None
  362. tm.assert_frame_equal(
  363. df_named.stack(level=[1, 2], future_stack=future_stack),
  364. df_named.stack(level=1, future_stack=future_stack).stack(
  365. level=1, future_stack=future_stack
  366. ),
  367. )
  368. @pytest.mark.filterwarnings(
  369. "ignore:The previous implementation of stack is deprecated"
  370. )
  371. def test_stack_mixed_levels(self, future_stack):
  372. columns = MultiIndex.from_tuples(
  373. [
  374. ("A", "cat", "long"),
  375. ("B", "cat", "long"),
  376. ("A", "dog", "short"),
  377. ("B", "dog", "short"),
  378. ],
  379. names=["exp", "animal", "hair_length"],
  380. )
  381. df = DataFrame(
  382. np.random.default_rng(2).standard_normal((4, 4)), columns=columns
  383. )
  384. animal_hair_stacked = df.stack(
  385. level=["animal", "hair_length"], future_stack=future_stack
  386. )
  387. exp_hair_stacked = df.stack(
  388. level=["exp", "hair_length"], future_stack=future_stack
  389. )
  390. # GH #8584: Need to check that stacking works when a number
  391. # is passed that is both a level name and in the range of
  392. # the level numbers
  393. df2 = df.copy()
  394. df2.columns.names = ["exp", "animal", 1]
  395. tm.assert_frame_equal(
  396. df2.stack(level=["animal", 1], future_stack=future_stack),
  397. animal_hair_stacked,
  398. check_names=False,
  399. )
  400. tm.assert_frame_equal(
  401. df2.stack(level=["exp", 1], future_stack=future_stack),
  402. exp_hair_stacked,
  403. check_names=False,
  404. )
  405. # When mixed types are passed and the ints are not level
  406. # names, raise
  407. msg = (
  408. "level should contain all level names or all level numbers, not "
  409. "a mixture of the two"
  410. )
  411. with pytest.raises(ValueError, match=msg):
  412. df2.stack(level=["animal", 0], future_stack=future_stack)
  413. # GH #8584: Having 0 in the level names could raise a
  414. # strange error about lexsort depth
  415. df3 = df.copy()
  416. df3.columns.names = ["exp", "animal", 0]
  417. tm.assert_frame_equal(
  418. df3.stack(level=["animal", 0], future_stack=future_stack),
  419. animal_hair_stacked,
  420. check_names=False,
  421. )
  422. @pytest.mark.filterwarnings(
  423. "ignore:The previous implementation of stack is deprecated"
  424. )
  425. def test_stack_int_level_names(self, future_stack):
  426. columns = MultiIndex.from_tuples(
  427. [
  428. ("A", "cat", "long"),
  429. ("B", "cat", "long"),
  430. ("A", "dog", "short"),
  431. ("B", "dog", "short"),
  432. ],
  433. names=["exp", "animal", "hair_length"],
  434. )
  435. df = DataFrame(
  436. np.random.default_rng(2).standard_normal((4, 4)), columns=columns
  437. )
  438. exp_animal_stacked = df.stack(
  439. level=["exp", "animal"], future_stack=future_stack
  440. )
  441. animal_hair_stacked = df.stack(
  442. level=["animal", "hair_length"], future_stack=future_stack
  443. )
  444. exp_hair_stacked = df.stack(
  445. level=["exp", "hair_length"], future_stack=future_stack
  446. )
  447. df2 = df.copy()
  448. df2.columns.names = [0, 1, 2]
  449. tm.assert_frame_equal(
  450. df2.stack(level=[1, 2], future_stack=future_stack),
  451. animal_hair_stacked,
  452. check_names=False,
  453. )
  454. tm.assert_frame_equal(
  455. df2.stack(level=[0, 1], future_stack=future_stack),
  456. exp_animal_stacked,
  457. check_names=False,
  458. )
  459. tm.assert_frame_equal(
  460. df2.stack(level=[0, 2], future_stack=future_stack),
  461. exp_hair_stacked,
  462. check_names=False,
  463. )
  464. # Out-of-order int column names
  465. df3 = df.copy()
  466. df3.columns.names = [2, 0, 1]
  467. tm.assert_frame_equal(
  468. df3.stack(level=[0, 1], future_stack=future_stack),
  469. animal_hair_stacked,
  470. check_names=False,
  471. )
  472. tm.assert_frame_equal(
  473. df3.stack(level=[2, 0], future_stack=future_stack),
  474. exp_animal_stacked,
  475. check_names=False,
  476. )
  477. tm.assert_frame_equal(
  478. df3.stack(level=[2, 1], future_stack=future_stack),
  479. exp_hair_stacked,
  480. check_names=False,
  481. )
  482. def test_unstack_bool(self):
  483. df = DataFrame(
  484. [False, False],
  485. index=MultiIndex.from_arrays([["a", "b"], ["c", "l"]]),
  486. columns=["col"],
  487. )
  488. rs = df.unstack()
  489. xp = DataFrame(
  490. np.array([[False, np.nan], [np.nan, False]], dtype=object),
  491. index=["a", "b"],
  492. columns=MultiIndex.from_arrays([["col", "col"], ["c", "l"]]),
  493. )
  494. tm.assert_frame_equal(rs, xp)
  495. @pytest.mark.filterwarnings(
  496. "ignore:The previous implementation of stack is deprecated"
  497. )
  498. def test_unstack_level_binding(self, future_stack):
  499. # GH9856
  500. mi = MultiIndex(
  501. levels=[["foo", "bar"], ["one", "two"], ["a", "b"]],
  502. codes=[[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]],
  503. names=["first", "second", "third"],
  504. )
  505. s = Series(0, index=mi)
  506. result = s.unstack([1, 2]).stack(0, future_stack=future_stack)
  507. expected_mi = MultiIndex(
  508. levels=[["foo", "bar"], ["one", "two"]],
  509. codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
  510. names=["first", "second"],
  511. )
  512. expected = DataFrame(
  513. np.array(
  514. [[0, np.nan], [np.nan, 0], [0, np.nan], [np.nan, 0]], dtype=np.float64
  515. ),
  516. index=expected_mi,
  517. columns=Index(["b", "a"], name="third"),
  518. )
  519. tm.assert_frame_equal(result, expected)
  520. def test_unstack_to_series(self, float_frame):
  521. # check reversibility
  522. data = float_frame.unstack()
  523. assert isinstance(data, Series)
  524. undo = data.unstack().T
  525. tm.assert_frame_equal(undo, float_frame)
  526. # check NA handling
  527. data = DataFrame({"x": [1, 2, np.nan], "y": [3.0, 4, np.nan]})
  528. data.index = Index(["a", "b", "c"])
  529. result = data.unstack()
  530. midx = MultiIndex(
  531. levels=[["x", "y"], ["a", "b", "c"]],
  532. codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]],
  533. )
  534. expected = Series([1, 2, np.nan, 3, 4, np.nan], index=midx)
  535. tm.assert_series_equal(result, expected)
  536. # check composability of unstack
  537. old_data = data.copy()
  538. for _ in range(4):
  539. data = data.unstack()
  540. tm.assert_frame_equal(old_data, data)
  541. def test_unstack_dtypes(self, using_infer_string):
  542. # GH 2929
  543. rows = [[1, 1, 3, 4], [1, 2, 3, 4], [2, 1, 3, 4], [2, 2, 3, 4]]
  544. df = DataFrame(rows, columns=list("ABCD"))
  545. result = df.dtypes
  546. expected = Series([np.dtype("int64")] * 4, index=list("ABCD"))
  547. tm.assert_series_equal(result, expected)
  548. # single dtype
  549. df2 = df.set_index(["A", "B"])
  550. df3 = df2.unstack("B")
  551. result = df3.dtypes
  552. expected = Series(
  553. [np.dtype("int64")] * 4,
  554. index=MultiIndex.from_arrays(
  555. [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B")
  556. ),
  557. )
  558. tm.assert_series_equal(result, expected)
  559. # mixed
  560. df2 = df.set_index(["A", "B"])
  561. df2["C"] = 3.0
  562. df3 = df2.unstack("B")
  563. result = df3.dtypes
  564. expected = Series(
  565. [np.dtype("float64")] * 2 + [np.dtype("int64")] * 2,
  566. index=MultiIndex.from_arrays(
  567. [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B")
  568. ),
  569. )
  570. tm.assert_series_equal(result, expected)
  571. df2["D"] = "foo"
  572. df3 = df2.unstack("B")
  573. result = df3.dtypes
  574. dtype = (
  575. pd.StringDtype(na_value=np.nan)
  576. if using_infer_string
  577. else np.dtype("object")
  578. )
  579. expected = Series(
  580. [np.dtype("float64")] * 2 + [dtype] * 2,
  581. index=MultiIndex.from_arrays(
  582. [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B")
  583. ),
  584. )
  585. tm.assert_series_equal(result, expected)
  586. @pytest.mark.parametrize(
  587. "c, d",
  588. (
  589. (np.zeros(5), np.zeros(5)),
  590. (np.arange(5, dtype="f8"), np.arange(5, 10, dtype="f8")),
  591. ),
  592. )
  593. def test_unstack_dtypes_mixed_date(self, c, d):
  594. # GH7405
  595. df = DataFrame(
  596. {
  597. "A": ["a"] * 5,
  598. "C": c,
  599. "D": d,
  600. "B": date_range("2012-01-01", periods=5),
  601. }
  602. )
  603. right = df.iloc[:3].copy(deep=True)
  604. df = df.set_index(["A", "B"])
  605. df["D"] = df["D"].astype("int64")
  606. left = df.iloc[:3].unstack(0)
  607. right = right.set_index(["A", "B"]).unstack(0)
  608. right[("D", "a")] = right[("D", "a")].astype("int64")
  609. assert left.shape == (3, 2)
  610. tm.assert_frame_equal(left, right)
  611. @pytest.mark.filterwarnings(
  612. "ignore:The previous implementation of stack is deprecated"
  613. )
  614. def test_unstack_non_unique_index_names(self, future_stack):
  615. idx = MultiIndex.from_tuples([("a", "b"), ("c", "d")], names=["c1", "c1"])
  616. df = DataFrame([1, 2], index=idx)
  617. msg = "The name c1 occurs multiple times, use a level number"
  618. with pytest.raises(ValueError, match=msg):
  619. df.unstack("c1")
  620. with pytest.raises(ValueError, match=msg):
  621. df.T.stack("c1", future_stack=future_stack)
  622. def test_unstack_unused_levels(self):
  623. # GH 17845: unused codes in index make unstack() cast int to float
  624. idx = MultiIndex.from_product([["a"], ["A", "B", "C", "D"]])[:-1]
  625. df = DataFrame([[1, 0]] * 3, index=idx)
  626. result = df.unstack()
  627. exp_col = MultiIndex.from_product([[0, 1], ["A", "B", "C"]])
  628. expected = DataFrame([[1, 1, 1, 0, 0, 0]], index=["a"], columns=exp_col)
  629. tm.assert_frame_equal(result, expected)
  630. assert (result.columns.levels[1] == idx.levels[1]).all()
  631. # Unused items on both levels
  632. levels = [[0, 1, 7], [0, 1, 2, 3]]
  633. codes = [[0, 0, 1, 1], [0, 2, 0, 2]]
  634. idx = MultiIndex(levels, codes)
  635. block = np.arange(4).reshape(2, 2)
  636. df = DataFrame(np.concatenate([block, block + 4]), index=idx)
  637. result = df.unstack()
  638. expected = DataFrame(
  639. np.concatenate([block * 2, block * 2 + 1], axis=1), columns=idx
  640. )
  641. tm.assert_frame_equal(result, expected)
  642. assert (result.columns.levels[1] == idx.levels[1]).all()
  643. @pytest.mark.parametrize(
  644. "level, idces, col_level, idx_level",
  645. (
  646. (0, [13, 16, 6, 9, 2, 5, 8, 11], [np.nan, "a", 2], [np.nan, 5, 1]),
  647. (1, [8, 11, 1, 4, 12, 15, 13, 16], [np.nan, 5, 1], [np.nan, "a", 2]),
  648. ),
  649. )
  650. def test_unstack_unused_levels_mixed_with_nan(
  651. self, level, idces, col_level, idx_level
  652. ):
  653. # With mixed dtype and NaN
  654. levels = [["a", 2, "c"], [1, 3, 5, 7]]
  655. codes = [[0, -1, 1, 1], [0, 2, -1, 2]]
  656. idx = MultiIndex(levels, codes)
  657. data = np.arange(8)
  658. df = DataFrame(data.reshape(4, 2), index=idx)
  659. result = df.unstack(level=level)
  660. exp_data = np.zeros(18) * np.nan
  661. exp_data[idces] = data
  662. cols = MultiIndex.from_product([[0, 1], col_level])
  663. expected = DataFrame(exp_data.reshape(3, 6), index=idx_level, columns=cols)
  664. tm.assert_frame_equal(result, expected)
  665. @pytest.mark.parametrize("cols", [["A", "C"], slice(None)])
  666. def test_unstack_unused_level(self, cols):
  667. # GH 18562 : unused codes on the unstacked level
  668. df = DataFrame([[2010, "a", "I"], [2011, "b", "II"]], columns=["A", "B", "C"])
  669. ind = df.set_index(["A", "B", "C"], drop=False)
  670. selection = ind.loc[(slice(None), slice(None), "I"), cols]
  671. result = selection.unstack()
  672. expected = ind.iloc[[0]][cols]
  673. expected.columns = MultiIndex.from_product(
  674. [expected.columns, ["I"]], names=[None, "C"]
  675. )
  676. expected.index = expected.index.droplevel("C")
  677. tm.assert_frame_equal(result, expected)
  678. def test_unstack_long_index(self):
  679. # PH 32624: Error when using a lot of indices to unstack.
  680. # The error occurred only, if a lot of indices are used.
  681. df = DataFrame(
  682. [[1]],
  683. columns=MultiIndex.from_tuples([[0]], names=["c1"]),
  684. index=MultiIndex.from_tuples(
  685. [[0, 0, 1, 0, 0, 0, 1]],
  686. names=["i1", "i2", "i3", "i4", "i5", "i6", "i7"],
  687. ),
  688. )
  689. result = df.unstack(["i2", "i3", "i4", "i5", "i6", "i7"])
  690. expected = DataFrame(
  691. [[1]],
  692. columns=MultiIndex.from_tuples(
  693. [[0, 0, 1, 0, 0, 0, 1]],
  694. names=["c1", "i2", "i3", "i4", "i5", "i6", "i7"],
  695. ),
  696. index=Index([0], name="i1"),
  697. )
  698. tm.assert_frame_equal(result, expected)
  699. def test_unstack_multi_level_cols(self):
  700. # PH 24729: Unstack a df with multi level columns
  701. df = DataFrame(
  702. [[0.0, 0.0], [0.0, 0.0]],
  703. columns=MultiIndex.from_tuples(
  704. [["B", "C"], ["B", "D"]], names=["c1", "c2"]
  705. ),
  706. index=MultiIndex.from_tuples(
  707. [[10, 20, 30], [10, 20, 40]], names=["i1", "i2", "i3"]
  708. ),
  709. )
  710. assert df.unstack(["i2", "i1"]).columns.names[-2:] == ["i2", "i1"]
  711. def test_unstack_multi_level_rows_and_cols(self):
  712. # PH 28306: Unstack df with multi level cols and rows
  713. df = DataFrame(
  714. [[1, 2], [3, 4], [-1, -2], [-3, -4]],
  715. columns=MultiIndex.from_tuples([["a", "b", "c"], ["d", "e", "f"]]),
  716. index=MultiIndex.from_tuples(
  717. [
  718. ["m1", "P3", 222],
  719. ["m1", "A5", 111],
  720. ["m2", "P3", 222],
  721. ["m2", "A5", 111],
  722. ],
  723. names=["i1", "i2", "i3"],
  724. ),
  725. )
  726. result = df.unstack(["i3", "i2"])
  727. expected = df.unstack(["i3"]).unstack(["i2"])
  728. tm.assert_frame_equal(result, expected)
  729. @pytest.mark.parametrize("idx", [("jim", "joe"), ("joe", "jim")])
  730. @pytest.mark.parametrize("lev", list(range(2)))
  731. def test_unstack_nan_index1(self, idx, lev):
  732. # GH7466
  733. def cast(val):
  734. val_str = "" if val != val else val
  735. return f"{val_str:1}"
  736. df = DataFrame(
  737. {
  738. "jim": ["a", "b", np.nan, "d"],
  739. "joe": ["w", "x", "y", "z"],
  740. "jolie": ["a.w", "b.x", " .y", "d.z"],
  741. }
  742. )
  743. left = df.set_index(["jim", "joe"]).unstack()["jolie"]
  744. right = df.set_index(["joe", "jim"]).unstack()["jolie"].T
  745. tm.assert_frame_equal(left, right)
  746. mi = df.set_index(list(idx))
  747. udf = mi.unstack(level=lev)
  748. assert udf.notna().values.sum() == len(df)
  749. mk_list = lambda a: list(a) if isinstance(a, tuple) else [a]
  750. rows, cols = udf["jolie"].notna().values.nonzero()
  751. for i, j in zip(rows, cols):
  752. left = sorted(udf["jolie"].iloc[i, j].split("."))
  753. right = mk_list(udf["jolie"].index[i]) + mk_list(udf["jolie"].columns[j])
  754. right = sorted(map(cast, right))
  755. assert left == right
  756. @pytest.mark.parametrize("idx", itertools.permutations(["1st", "2nd", "3rd"]))
  757. @pytest.mark.parametrize("lev", list(range(3)))
  758. @pytest.mark.parametrize("col", ["4th", "5th"])
  759. def test_unstack_nan_index_repeats(self, idx, lev, col):
  760. def cast(val):
  761. val_str = "" if val != val else val
  762. return f"{val_str:1}"
  763. df = DataFrame(
  764. {
  765. "1st": ["d"] * 3
  766. + [np.nan] * 5
  767. + ["a"] * 2
  768. + ["c"] * 3
  769. + ["e"] * 2
  770. + ["b"] * 5,
  771. "2nd": ["y"] * 2
  772. + ["w"] * 3
  773. + [np.nan] * 3
  774. + ["z"] * 4
  775. + [np.nan] * 3
  776. + ["x"] * 3
  777. + [np.nan] * 2,
  778. "3rd": [
  779. 67,
  780. 39,
  781. 53,
  782. 72,
  783. 57,
  784. 80,
  785. 31,
  786. 18,
  787. 11,
  788. 30,
  789. 59,
  790. 50,
  791. 62,
  792. 59,
  793. 76,
  794. 52,
  795. 14,
  796. 53,
  797. 60,
  798. 51,
  799. ],
  800. }
  801. )
  802. df["4th"], df["5th"] = (
  803. df.apply(lambda r: ".".join(map(cast, r)), axis=1),
  804. df.apply(lambda r: ".".join(map(cast, r.iloc[::-1])), axis=1),
  805. )
  806. mi = df.set_index(list(idx))
  807. udf = mi.unstack(level=lev)
  808. assert udf.notna().values.sum() == 2 * len(df)
  809. mk_list = lambda a: list(a) if isinstance(a, tuple) else [a]
  810. rows, cols = udf[col].notna().values.nonzero()
  811. for i, j in zip(rows, cols):
  812. left = sorted(udf[col].iloc[i, j].split("."))
  813. right = mk_list(udf[col].index[i]) + mk_list(udf[col].columns[j])
  814. right = sorted(map(cast, right))
  815. assert left == right
  816. def test_unstack_nan_index2(self):
  817. # GH7403
  818. df = DataFrame({"A": list("aaaabbbb"), "B": range(8), "C": range(8)})
  819. # Explicit cast to avoid implicit cast when setting to np.nan
  820. df = df.astype({"B": "float"})
  821. df.iloc[3, 1] = np.nan
  822. left = df.set_index(["A", "B"]).unstack(0)
  823. vals = [
  824. [3, 0, 1, 2, np.nan, np.nan, np.nan, np.nan],
  825. [np.nan, np.nan, np.nan, np.nan, 4, 5, 6, 7],
  826. ]
  827. vals = list(map(list, zip(*vals)))
  828. idx = Index([np.nan, 0, 1, 2, 4, 5, 6, 7], name="B")
  829. cols = MultiIndex(
  830. levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]
  831. )
  832. right = DataFrame(vals, columns=cols, index=idx)
  833. tm.assert_frame_equal(left, right)
  834. df = DataFrame({"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)})
  835. # Explicit cast to avoid implicit cast when setting to np.nan
  836. df = df.astype({"B": "float"})
  837. df.iloc[2, 1] = np.nan
  838. left = df.set_index(["A", "B"]).unstack(0)
  839. vals = [[2, np.nan], [0, 4], [1, 5], [np.nan, 6], [3, 7]]
  840. cols = MultiIndex(
  841. levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]
  842. )
  843. idx = Index([np.nan, 0, 1, 2, 3], name="B")
  844. right = DataFrame(vals, columns=cols, index=idx)
  845. tm.assert_frame_equal(left, right)
  846. df = DataFrame({"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)})
  847. # Explicit cast to avoid implicit cast when setting to np.nan
  848. df = df.astype({"B": "float"})
  849. df.iloc[3, 1] = np.nan
  850. left = df.set_index(["A", "B"]).unstack(0)
  851. vals = [[3, np.nan], [0, 4], [1, 5], [2, 6], [np.nan, 7]]
  852. cols = MultiIndex(
  853. levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]
  854. )
  855. idx = Index([np.nan, 0, 1, 2, 3], name="B")
  856. right = DataFrame(vals, columns=cols, index=idx)
  857. tm.assert_frame_equal(left, right)
  858. def test_unstack_nan_index3(self, using_array_manager):
  859. # GH7401
  860. df = DataFrame(
  861. {
  862. "A": list("aaaaabbbbb"),
  863. "B": (date_range("2012-01-01", periods=5).tolist() * 2),
  864. "C": np.arange(10),
  865. }
  866. )
  867. df.iloc[3, 1] = np.nan
  868. left = df.set_index(["A", "B"]).unstack()
  869. vals = np.array([[3, 0, 1, 2, np.nan, 4], [np.nan, 5, 6, 7, 8, 9]])
  870. idx = Index(["a", "b"], name="A")
  871. cols = MultiIndex(
  872. levels=[["C"], date_range("2012-01-01", periods=5)],
  873. codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]],
  874. names=[None, "B"],
  875. )
  876. right = DataFrame(vals, columns=cols, index=idx)
  877. if using_array_manager:
  878. # INFO(ArrayManager) with ArrayManager preserve dtype where possible
  879. cols = right.columns[[1, 2, 3, 5]]
  880. right[cols] = right[cols].astype(df["C"].dtype)
  881. tm.assert_frame_equal(left, right)
  882. def test_unstack_nan_index4(self):
  883. # GH4862
  884. vals = [
  885. ["Hg", np.nan, np.nan, 680585148],
  886. ["U", 0.0, np.nan, 680585148],
  887. ["Pb", 7.07e-06, np.nan, 680585148],
  888. ["Sn", 2.3614e-05, 0.0133, 680607017],
  889. ["Ag", 0.0, 0.0133, 680607017],
  890. ["Hg", -0.00015, 0.0133, 680607017],
  891. ]
  892. df = DataFrame(
  893. vals,
  894. columns=["agent", "change", "dosage", "s_id"],
  895. index=[17263, 17264, 17265, 17266, 17267, 17268],
  896. )
  897. left = df.copy().set_index(["s_id", "dosage", "agent"]).unstack()
  898. vals = [
  899. [np.nan, np.nan, 7.07e-06, np.nan, 0.0],
  900. [0.0, -0.00015, np.nan, 2.3614e-05, np.nan],
  901. ]
  902. idx = MultiIndex(
  903. levels=[[680585148, 680607017], [0.0133]],
  904. codes=[[0, 1], [-1, 0]],
  905. names=["s_id", "dosage"],
  906. )
  907. cols = MultiIndex(
  908. levels=[["change"], ["Ag", "Hg", "Pb", "Sn", "U"]],
  909. codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]],
  910. names=[None, "agent"],
  911. )
  912. right = DataFrame(vals, columns=cols, index=idx)
  913. tm.assert_frame_equal(left, right)
  914. left = df.loc[17264:].copy().set_index(["s_id", "dosage", "agent"])
  915. tm.assert_frame_equal(left.unstack(), right)
  916. def test_unstack_nan_index5(self):
  917. # GH9497 - multiple unstack with nulls
  918. df = DataFrame(
  919. {
  920. "1st": [1, 2, 1, 2, 1, 2],
  921. "2nd": date_range("2014-02-01", periods=6, freq="D"),
  922. "jim": 100 + np.arange(6),
  923. "joe": (np.random.default_rng(2).standard_normal(6) * 10).round(2),
  924. }
  925. )
  926. df["3rd"] = df["2nd"] - pd.Timestamp("2014-02-02")
  927. df.loc[1, "2nd"] = df.loc[3, "2nd"] = np.nan
  928. df.loc[1, "3rd"] = df.loc[4, "3rd"] = np.nan
  929. left = df.set_index(["1st", "2nd", "3rd"]).unstack(["2nd", "3rd"])
  930. assert left.notna().values.sum() == 2 * len(df)
  931. for col in ["jim", "joe"]:
  932. for _, r in df.iterrows():
  933. key = r["1st"], (col, r["2nd"], r["3rd"])
  934. assert r[col] == left.loc[key]
  935. def test_stack_datetime_column_multiIndex(self, future_stack):
  936. # GH 8039
  937. t = datetime(2014, 1, 1)
  938. df = DataFrame([1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, "A", "B")]))
  939. warn = None if future_stack else FutureWarning
  940. msg = "The previous implementation of stack is deprecated"
  941. with tm.assert_produces_warning(warn, match=msg):
  942. result = df.stack(future_stack=future_stack)
  943. eidx = MultiIndex.from_product([(0, 1, 2, 3), ("B",)])
  944. ecols = MultiIndex.from_tuples([(t, "A")])
  945. expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols)
  946. tm.assert_frame_equal(result, expected)
  947. @pytest.mark.filterwarnings(
  948. "ignore:The previous implementation of stack is deprecated"
  949. )
  950. @pytest.mark.parametrize(
  951. "multiindex_columns",
  952. [
  953. [0, 1, 2, 3, 4],
  954. [0, 1, 2, 3],
  955. [0, 1, 2, 4],
  956. [0, 1, 2],
  957. [1, 2, 3],
  958. [2, 3, 4],
  959. [0, 1],
  960. [0, 2],
  961. [0, 3],
  962. [0],
  963. [2],
  964. [4],
  965. [4, 3, 2, 1, 0],
  966. [3, 2, 1, 0],
  967. [4, 2, 1, 0],
  968. [2, 1, 0],
  969. [3, 2, 1],
  970. [4, 3, 2],
  971. [1, 0],
  972. [2, 0],
  973. [3, 0],
  974. ],
  975. )
  976. @pytest.mark.parametrize("level", (-1, 0, 1, [0, 1], [1, 0]))
  977. def test_stack_partial_multiIndex(self, multiindex_columns, level, future_stack):
  978. # GH 8844
  979. dropna = False if not future_stack else lib.no_default
  980. full_multiindex = MultiIndex.from_tuples(
  981. [("B", "x"), ("B", "z"), ("A", "y"), ("C", "x"), ("C", "u")],
  982. names=["Upper", "Lower"],
  983. )
  984. multiindex = full_multiindex[multiindex_columns]
  985. df = DataFrame(
  986. np.arange(3 * len(multiindex)).reshape(3, len(multiindex)),
  987. columns=multiindex,
  988. )
  989. result = df.stack(level=level, dropna=dropna, future_stack=future_stack)
  990. if isinstance(level, int) and not future_stack:
  991. # Stacking a single level should not make any all-NaN rows,
  992. # so df.stack(level=level, dropna=False) should be the same
  993. # as df.stack(level=level, dropna=True).
  994. expected = df.stack(level=level, dropna=True, future_stack=future_stack)
  995. if isinstance(expected, Series):
  996. tm.assert_series_equal(result, expected)
  997. else:
  998. tm.assert_frame_equal(result, expected)
  999. df.columns = MultiIndex.from_tuples(
  1000. df.columns.to_numpy(), names=df.columns.names
  1001. )
  1002. expected = df.stack(level=level, dropna=dropna, future_stack=future_stack)
  1003. if isinstance(expected, Series):
  1004. tm.assert_series_equal(result, expected)
  1005. else:
  1006. tm.assert_frame_equal(result, expected)
  1007. @pytest.mark.filterwarnings(
  1008. "ignore:The previous implementation of stack is deprecated"
  1009. )
  1010. def test_stack_full_multiIndex(self, future_stack):
  1011. # GH 8844
  1012. full_multiindex = MultiIndex.from_tuples(
  1013. [("B", "x"), ("B", "z"), ("A", "y"), ("C", "x"), ("C", "u")],
  1014. names=["Upper", "Lower"],
  1015. )
  1016. df = DataFrame(np.arange(6).reshape(2, 3), columns=full_multiindex[[0, 1, 3]])
  1017. dropna = False if not future_stack else lib.no_default
  1018. result = df.stack(dropna=dropna, future_stack=future_stack)
  1019. expected = DataFrame(
  1020. [[0, 2], [1, np.nan], [3, 5], [4, np.nan]],
  1021. index=MultiIndex(
  1022. levels=[[0, 1], ["u", "x", "y", "z"]],
  1023. codes=[[0, 0, 1, 1], [1, 3, 1, 3]],
  1024. names=[None, "Lower"],
  1025. ),
  1026. columns=Index(["B", "C"], name="Upper"),
  1027. )
  1028. expected["B"] = expected["B"].astype(df.dtypes.iloc[0])
  1029. tm.assert_frame_equal(result, expected)
  1030. @pytest.mark.parametrize("ordered", [False, True])
  1031. def test_stack_preserve_categorical_dtype(self, ordered, future_stack):
  1032. # GH13854
  1033. cidx = pd.CategoricalIndex(list("yxz"), categories=list("xyz"), ordered=ordered)
  1034. df = DataFrame([[10, 11, 12]], columns=cidx)
  1035. result = df.stack(future_stack=future_stack)
  1036. # `MultiIndex.from_product` preserves categorical dtype -
  1037. # it's tested elsewhere.
  1038. midx = MultiIndex.from_product([df.index, cidx])
  1039. expected = Series([10, 11, 12], index=midx)
  1040. tm.assert_series_equal(result, expected)
  1041. @pytest.mark.filterwarnings(
  1042. "ignore:The previous implementation of stack is deprecated"
  1043. )
  1044. @pytest.mark.parametrize("ordered", [False, True])
  1045. @pytest.mark.parametrize(
  1046. "labels,data",
  1047. [
  1048. (list("xyz"), [10, 11, 12, 13, 14, 15]),
  1049. (list("zyx"), [14, 15, 12, 13, 10, 11]),
  1050. ],
  1051. )
  1052. def test_stack_multi_preserve_categorical_dtype(
  1053. self, ordered, labels, data, future_stack
  1054. ):
  1055. # GH-36991
  1056. cidx = pd.CategoricalIndex(labels, categories=sorted(labels), ordered=ordered)
  1057. cidx2 = pd.CategoricalIndex(["u", "v"], ordered=ordered)
  1058. midx = MultiIndex.from_product([cidx, cidx2])
  1059. df = DataFrame([sorted(data)], columns=midx)
  1060. result = df.stack([0, 1], future_stack=future_stack)
  1061. labels = labels if future_stack else sorted(labels)
  1062. s_cidx = pd.CategoricalIndex(labels, ordered=ordered)
  1063. expected_data = sorted(data) if future_stack else data
  1064. expected = Series(
  1065. expected_data, index=MultiIndex.from_product([[0], s_cidx, cidx2])
  1066. )
  1067. tm.assert_series_equal(result, expected)
  1068. def test_stack_preserve_categorical_dtype_values(self, future_stack):
  1069. # GH-23077
  1070. cat = pd.Categorical(["a", "a", "b", "c"])
  1071. df = DataFrame({"A": cat, "B": cat})
  1072. result = df.stack(future_stack=future_stack)
  1073. index = MultiIndex.from_product([[0, 1, 2, 3], ["A", "B"]])
  1074. expected = Series(
  1075. pd.Categorical(["a", "a", "a", "a", "b", "b", "c", "c"]), index=index
  1076. )
  1077. tm.assert_series_equal(result, expected)
  1078. @pytest.mark.filterwarnings(
  1079. "ignore:The previous implementation of stack is deprecated"
  1080. )
  1081. @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning")
  1082. @pytest.mark.parametrize(
  1083. "index, columns",
  1084. [
  1085. ([0, 0, 1, 1], MultiIndex.from_product([[1, 2], ["a", "b"]])),
  1086. ([0, 0, 2, 3], MultiIndex.from_product([[1, 2], ["a", "b"]])),
  1087. ([0, 1, 2, 3], MultiIndex.from_product([[1, 2], ["a", "b"]])),
  1088. ],
  1089. )
  1090. def test_stack_multi_columns_non_unique_index(self, index, columns, future_stack):
  1091. # GH-28301
  1092. df = DataFrame(index=index, columns=columns).fillna(1)
  1093. stacked = df.stack(future_stack=future_stack)
  1094. new_index = MultiIndex.from_tuples(stacked.index.to_numpy())
  1095. expected = DataFrame(
  1096. stacked.to_numpy(), index=new_index, columns=stacked.columns
  1097. )
  1098. tm.assert_frame_equal(stacked, expected)
  1099. stacked_codes = np.asarray(stacked.index.codes)
  1100. expected_codes = np.asarray(new_index.codes)
  1101. tm.assert_numpy_array_equal(stacked_codes, expected_codes)
  1102. @pytest.mark.filterwarnings(
  1103. "ignore:The previous implementation of stack is deprecated"
  1104. )
  1105. @pytest.mark.parametrize(
  1106. "vals1, vals2, dtype1, dtype2, expected_dtype",
  1107. [
  1108. ([1, 2], [3.0, 4.0], "Int64", "Float64", "Float64"),
  1109. ([1, 2], ["foo", "bar"], "Int64", "string", "object"),
  1110. ],
  1111. )
  1112. def test_stack_multi_columns_mixed_extension_types(
  1113. self, vals1, vals2, dtype1, dtype2, expected_dtype, future_stack
  1114. ):
  1115. # GH45740
  1116. df = DataFrame(
  1117. {
  1118. ("A", 1): Series(vals1, dtype=dtype1),
  1119. ("A", 2): Series(vals2, dtype=dtype2),
  1120. }
  1121. )
  1122. result = df.stack(future_stack=future_stack)
  1123. expected = (
  1124. df.astype(object).stack(future_stack=future_stack).astype(expected_dtype)
  1125. )
  1126. tm.assert_frame_equal(result, expected)
  1127. @pytest.mark.parametrize("level", [0, 1])
  1128. def test_unstack_mixed_extension_types(self, level):
  1129. index = MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 1)], names=["a", "b"])
  1130. df = DataFrame(
  1131. {
  1132. "A": pd.array([0, 1, None], dtype="Int64"),
  1133. "B": pd.Categorical(["a", "a", "b"]),
  1134. },
  1135. index=index,
  1136. )
  1137. result = df.unstack(level=level)
  1138. expected = df.astype(object).unstack(level=level)
  1139. if level == 0:
  1140. expected[("A", "B")] = expected[("A", "B")].fillna(pd.NA)
  1141. else:
  1142. expected[("A", 0)] = expected[("A", 0)].fillna(pd.NA)
  1143. expected_dtypes = Series(
  1144. [df.A.dtype] * 2 + [df.B.dtype] * 2, index=result.columns
  1145. )
  1146. tm.assert_series_equal(result.dtypes, expected_dtypes)
  1147. tm.assert_frame_equal(result.astype(object), expected)
  1148. @pytest.mark.parametrize("level", [0, "baz"])
  1149. def test_unstack_swaplevel_sortlevel(self, level):
  1150. # GH 20994
  1151. mi = MultiIndex.from_product([[0], ["d", "c"]], names=["bar", "baz"])
  1152. df = DataFrame([[0, 2], [1, 3]], index=mi, columns=["B", "A"])
  1153. df.columns.name = "foo"
  1154. expected = DataFrame(
  1155. [[3, 1, 2, 0]],
  1156. columns=MultiIndex.from_tuples(
  1157. [("c", "A"), ("c", "B"), ("d", "A"), ("d", "B")], names=["baz", "foo"]
  1158. ),
  1159. )
  1160. expected.index.name = "bar"
  1161. result = df.unstack().swaplevel(axis=1).sort_index(axis=1, level=level)
  1162. tm.assert_frame_equal(result, expected)
  1163. @pytest.mark.parametrize("dtype", ["float64", "Float64"])
  1164. def test_unstack_sort_false(frame_or_series, dtype):
  1165. # GH 15105
  1166. index = MultiIndex.from_tuples(
  1167. [("two", "z", "b"), ("two", "y", "a"), ("one", "z", "b"), ("one", "y", "a")]
  1168. )
  1169. obj = frame_or_series(np.arange(1.0, 5.0), index=index, dtype=dtype)
  1170. result = obj.unstack(level=-1, sort=False)
  1171. if frame_or_series is DataFrame:
  1172. expected_columns = MultiIndex.from_tuples([(0, "b"), (0, "a")])
  1173. else:
  1174. expected_columns = ["b", "a"]
  1175. expected = DataFrame(
  1176. [[1.0, np.nan], [np.nan, 2.0], [3.0, np.nan], [np.nan, 4.0]],
  1177. columns=expected_columns,
  1178. index=MultiIndex.from_tuples(
  1179. [("two", "z"), ("two", "y"), ("one", "z"), ("one", "y")]
  1180. ),
  1181. dtype=dtype,
  1182. )
  1183. tm.assert_frame_equal(result, expected)
  1184. result = obj.unstack(level=[1, 2], sort=False)
  1185. if frame_or_series is DataFrame:
  1186. expected_columns = MultiIndex.from_tuples([(0, "z", "b"), (0, "y", "a")])
  1187. else:
  1188. expected_columns = MultiIndex.from_tuples([("z", "b"), ("y", "a")])
  1189. expected = DataFrame(
  1190. [[1.0, 2.0], [3.0, 4.0]],
  1191. index=["two", "one"],
  1192. columns=expected_columns,
  1193. dtype=dtype,
  1194. )
  1195. tm.assert_frame_equal(result, expected)
  1196. def test_unstack_fill_frame_object():
  1197. # GH12815 Test unstacking with object.
  1198. data = Series(["a", "b", "c", "a"], dtype="object")
  1199. data.index = MultiIndex.from_tuples(
  1200. [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
  1201. )
  1202. # By default missing values will be NaN
  1203. result = data.unstack()
  1204. expected = DataFrame(
  1205. {"a": ["a", np.nan, "a"], "b": ["b", "c", np.nan]},
  1206. index=list("xyz"),
  1207. dtype=object,
  1208. )
  1209. tm.assert_frame_equal(result, expected)
  1210. # Fill with any value replaces missing values as expected
  1211. result = data.unstack(fill_value="d")
  1212. expected = DataFrame(
  1213. {"a": ["a", "d", "a"], "b": ["b", "c", "d"]}, index=list("xyz"), dtype=object
  1214. )
  1215. tm.assert_frame_equal(result, expected)
  1216. def test_unstack_timezone_aware_values():
  1217. # GH 18338
  1218. df = DataFrame(
  1219. {
  1220. "timestamp": [pd.Timestamp("2017-08-27 01:00:00.709949+0000", tz="UTC")],
  1221. "a": ["a"],
  1222. "b": ["b"],
  1223. "c": ["c"],
  1224. },
  1225. columns=["timestamp", "a", "b", "c"],
  1226. )
  1227. result = df.set_index(["a", "b"]).unstack()
  1228. expected = DataFrame(
  1229. [[pd.Timestamp("2017-08-27 01:00:00.709949+0000", tz="UTC"), "c"]],
  1230. index=Index(["a"], name="a"),
  1231. columns=MultiIndex(
  1232. levels=[["timestamp", "c"], ["b"]],
  1233. codes=[[0, 1], [0, 0]],
  1234. names=[None, "b"],
  1235. ),
  1236. )
  1237. tm.assert_frame_equal(result, expected)
  1238. def test_stack_timezone_aware_values(future_stack):
  1239. # GH 19420
  1240. ts = date_range(freq="D", start="20180101", end="20180103", tz="America/New_York")
  1241. df = DataFrame({"A": ts}, index=["a", "b", "c"])
  1242. result = df.stack(future_stack=future_stack)
  1243. expected = Series(
  1244. ts,
  1245. index=MultiIndex(levels=[["a", "b", "c"], ["A"]], codes=[[0, 1, 2], [0, 0, 0]]),
  1246. )
  1247. tm.assert_series_equal(result, expected)
  1248. @pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated")
  1249. @pytest.mark.parametrize("dropna", [True, False, lib.no_default])
  1250. def test_stack_empty_frame(dropna, future_stack):
  1251. # GH 36113
  1252. levels = [np.array([], dtype=np.int64), np.array([], dtype=np.int64)]
  1253. expected = Series(dtype=np.float64, index=MultiIndex(levels=levels, codes=[[], []]))
  1254. if future_stack and dropna is not lib.no_default:
  1255. with pytest.raises(ValueError, match="dropna must be unspecified"):
  1256. DataFrame(dtype=np.float64).stack(dropna=dropna, future_stack=future_stack)
  1257. else:
  1258. result = DataFrame(dtype=np.float64).stack(
  1259. dropna=dropna, future_stack=future_stack
  1260. )
  1261. tm.assert_series_equal(result, expected)
  1262. @pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated")
  1263. @pytest.mark.parametrize("dropna", [True, False, lib.no_default])
  1264. @pytest.mark.parametrize("fill_value", [None, 0])
  1265. def test_stack_unstack_empty_frame(dropna, fill_value, future_stack):
  1266. # GH 36113
  1267. if future_stack and dropna is not lib.no_default:
  1268. with pytest.raises(ValueError, match="dropna must be unspecified"):
  1269. DataFrame(dtype=np.int64).stack(
  1270. dropna=dropna, future_stack=future_stack
  1271. ).unstack(fill_value=fill_value)
  1272. else:
  1273. result = (
  1274. DataFrame(dtype=np.int64)
  1275. .stack(dropna=dropna, future_stack=future_stack)
  1276. .unstack(fill_value=fill_value)
  1277. )
  1278. expected = DataFrame(dtype=np.int64)
  1279. tm.assert_frame_equal(result, expected)
  1280. def test_unstack_single_index_series():
  1281. # GH 36113
  1282. msg = r"index must be a MultiIndex to unstack.*"
  1283. with pytest.raises(ValueError, match=msg):
  1284. Series(dtype=np.int64).unstack()
  1285. def test_unstacking_multi_index_df():
  1286. # see gh-30740
  1287. df = DataFrame(
  1288. {
  1289. "name": ["Alice", "Bob"],
  1290. "score": [9.5, 8],
  1291. "employed": [False, True],
  1292. "kids": [0, 0],
  1293. "gender": ["female", "male"],
  1294. }
  1295. )
  1296. df = df.set_index(["name", "employed", "kids", "gender"])
  1297. df = df.unstack(["gender"], fill_value=0)
  1298. expected = df.unstack("employed", fill_value=0).unstack("kids", fill_value=0)
  1299. result = df.unstack(["employed", "kids"], fill_value=0)
  1300. expected = DataFrame(
  1301. [[9.5, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 8.0]],
  1302. index=Index(["Alice", "Bob"], name="name"),
  1303. columns=MultiIndex.from_tuples(
  1304. [
  1305. ("score", "female", False, 0),
  1306. ("score", "female", True, 0),
  1307. ("score", "male", False, 0),
  1308. ("score", "male", True, 0),
  1309. ],
  1310. names=[None, "gender", "employed", "kids"],
  1311. ),
  1312. )
  1313. tm.assert_frame_equal(result, expected)
  1314. @pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated")
  1315. def test_stack_positional_level_duplicate_column_names(future_stack):
  1316. # https://github.com/pandas-dev/pandas/issues/36353
  1317. columns = MultiIndex.from_product([("x", "y"), ("y", "z")], names=["a", "a"])
  1318. df = DataFrame([[1, 1, 1, 1]], columns=columns)
  1319. result = df.stack(0, future_stack=future_stack)
  1320. new_columns = Index(["y", "z"], name="a")
  1321. new_index = MultiIndex.from_tuples([(0, "x"), (0, "y")], names=[None, "a"])
  1322. expected = DataFrame([[1, 1], [1, 1]], index=new_index, columns=new_columns)
  1323. tm.assert_frame_equal(result, expected)
  1324. def test_unstack_non_slice_like_blocks(using_array_manager):
  1325. # Case where the mgr_locs of a DataFrame's underlying blocks are not slice-like
  1326. mi = MultiIndex.from_product([range(5), ["A", "B", "C"]])
  1327. df = DataFrame(
  1328. {
  1329. 0: np.random.default_rng(2).standard_normal(15),
  1330. 1: np.random.default_rng(2).standard_normal(15).astype(np.int64),
  1331. 2: np.random.default_rng(2).standard_normal(15),
  1332. 3: np.random.default_rng(2).standard_normal(15),
  1333. },
  1334. index=mi,
  1335. )
  1336. if not using_array_manager:
  1337. assert any(not x.mgr_locs.is_slice_like for x in df._mgr.blocks)
  1338. res = df.unstack()
  1339. expected = pd.concat([df[n].unstack() for n in range(4)], keys=range(4), axis=1)
  1340. tm.assert_frame_equal(res, expected)
  1341. @pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated")
  1342. def test_stack_sort_false(future_stack):
  1343. # GH 15105
  1344. data = [[1, 2, 3.0, 4.0], [2, 3, 4.0, 5.0], [3, 4, np.nan, np.nan]]
  1345. df = DataFrame(
  1346. data,
  1347. columns=MultiIndex(
  1348. levels=[["B", "A"], ["x", "y"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]
  1349. ),
  1350. )
  1351. kwargs = {} if future_stack else {"sort": False}
  1352. result = df.stack(level=0, future_stack=future_stack, **kwargs)
  1353. if future_stack:
  1354. expected = DataFrame(
  1355. {
  1356. "x": [1.0, 3.0, 2.0, 4.0, 3.0, np.nan],
  1357. "y": [2.0, 4.0, 3.0, 5.0, 4.0, np.nan],
  1358. },
  1359. index=MultiIndex.from_arrays(
  1360. [[0, 0, 1, 1, 2, 2], ["B", "A", "B", "A", "B", "A"]]
  1361. ),
  1362. )
  1363. else:
  1364. expected = DataFrame(
  1365. {"x": [1.0, 3.0, 2.0, 4.0, 3.0], "y": [2.0, 4.0, 3.0, 5.0, 4.0]},
  1366. index=MultiIndex.from_arrays([[0, 0, 1, 1, 2], ["B", "A", "B", "A", "B"]]),
  1367. )
  1368. tm.assert_frame_equal(result, expected)
  1369. # Codes sorted in this call
  1370. df = DataFrame(
  1371. data,
  1372. columns=MultiIndex.from_arrays([["B", "B", "A", "A"], ["x", "y", "x", "y"]]),
  1373. )
  1374. kwargs = {} if future_stack else {"sort": False}
  1375. result = df.stack(level=0, future_stack=future_stack, **kwargs)
  1376. tm.assert_frame_equal(result, expected)
  1377. @pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated")
  1378. def test_stack_sort_false_multi_level(future_stack):
  1379. # GH 15105
  1380. idx = MultiIndex.from_tuples([("weight", "kg"), ("height", "m")])
  1381. df = DataFrame([[1.0, 2.0], [3.0, 4.0]], index=["cat", "dog"], columns=idx)
  1382. kwargs = {} if future_stack else {"sort": False}
  1383. result = df.stack([0, 1], future_stack=future_stack, **kwargs)
  1384. expected_index = MultiIndex.from_tuples(
  1385. [
  1386. ("cat", "weight", "kg"),
  1387. ("cat", "height", "m"),
  1388. ("dog", "weight", "kg"),
  1389. ("dog", "height", "m"),
  1390. ]
  1391. )
  1392. expected = Series([1.0, 2.0, 3.0, 4.0], index=expected_index)
  1393. tm.assert_series_equal(result, expected)
  1394. class TestStackUnstackMultiLevel:
  1395. def test_unstack(self, multiindex_year_month_day_dataframe_random_data):
  1396. # just check that it works for now
  1397. ymd = multiindex_year_month_day_dataframe_random_data
  1398. unstacked = ymd.unstack()
  1399. unstacked.unstack()
  1400. # test that ints work
  1401. ymd.astype(int).unstack()
  1402. # test that int32 work
  1403. ymd.astype(np.int32).unstack()
  @pytest.mark.parametrize(
      "result_rows,result_columns,index_product,expected_row",
      [
          (
              [[1, 1, None, None, 30.0, None], [2, 2, None, None, 30.0, None]],
              ["ix1", "ix2", "col1", "col2", "col3", "col4"],
              2,
              [None, None, 30.0, None],
          ),
          (
              [[1, 1, None, None, 30.0], [2, 2, None, None, 30.0]],
              ["ix1", "ix2", "col1", "col2", "col3"],
              2,
              [None, None, 30.0],
          ),
          (
              [[1, 1, None, None, 30.0], [2, None, None, None, 30.0]],
              ["ix1", "ix2", "col1", "col2", "col3"],
              None,
              [None, None, 30.0],
          ),
      ],
  )
  def test_unstack_partial(
      self, result_rows, result_columns, index_product, expected_row
  ):
      """Unstack a one-row slice of a frame indexed by ("ix1", "ix2").

      ``result_rows``/``result_columns`` build the input frame,
      ``index_product`` is the single "ix2" label expected in the result
      columns and ``expected_row`` holds the expected values.
      """
      # check for regressions on this issue:
      # https://github.com/pandas-dev/pandas/issues/19351
      # make sure DataFrame.unstack() works when its run on a subset of the DataFrame
      # and the Index levels contain values that are not present in the subset
      result = DataFrame(result_rows, columns=result_columns).set_index(
          ["ix1", "ix2"]
      )
      # Keep only the second row before unstacking.
      result = result.iloc[1:2].unstack("ix2")
      expected = DataFrame(
          [expected_row],
          columns=MultiIndex.from_product(
              [result_columns[2:], [index_product]], names=[None, "ix2"]
          ),
          index=Index([2], name="ix1"),
      )
      tm.assert_frame_equal(result, expected)
  1446. def test_unstack_multiple_no_empty_columns(self):
  1447. index = MultiIndex.from_tuples(
  1448. [(0, "foo", 0), (0, "bar", 0), (1, "baz", 1), (1, "qux", 1)]
  1449. )
  1450. s = Series(np.random.default_rng(2).standard_normal(4), index=index)
  1451. unstacked = s.unstack([1, 2])
  1452. expected = unstacked.dropna(axis=1, how="all")
  1453. tm.assert_frame_equal(unstacked, expected)
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  def test_stack(self, multiindex_year_month_day_dataframe_random_data, future_stack):
      """Round-trip the year/month/day frame through ``unstack``/``stack``.

      Covers the default level, unlexsorted inputs, swapped levels, frames
      with more than two column levels, partially present levels and a
      negative level number.
      """
      ymd = multiindex_year_month_day_dataframe_random_data

      # regular roundtrip
      unstacked = ymd.unstack()
      restacked = unstacked.stack(future_stack=future_stack)
      if future_stack:
          # NA values in unstacked persist to restacked in version 3
          restacked = restacked.dropna(how="all")
      tm.assert_frame_equal(restacked, ymd)

      unlexsorted = ymd.sort_index(level=2)

      unstacked = unlexsorted.unstack(2)
      restacked = unstacked.stack(future_stack=future_stack)
      if future_stack:
          # NA values in unstacked persist to restacked in version 3
          restacked = restacked.dropna(how="all")
      tm.assert_frame_equal(restacked.sort_index(level=0), ymd)

      unlexsorted = unlexsorted[::-1]
      unstacked = unlexsorted.unstack(1)
      restacked = unstacked.stack(future_stack=future_stack).swaplevel(1, 2)
      if future_stack:
          # NA values in unstacked persist to restacked in version 3
          restacked = restacked.dropna(how="all")
      tm.assert_frame_equal(restacked.sort_index(level=0), ymd)

      unlexsorted = unlexsorted.swaplevel(0, 1)
      unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
      restacked = unstacked.stack(0, future_stack=future_stack).swaplevel(1, 2)
      if future_stack:
          # NA values in unstacked persist to restacked in version 3
          restacked = restacked.dropna(how="all")
      tm.assert_frame_equal(restacked.sort_index(level=0), ymd)

      # columns unsorted
      unstacked = ymd.unstack()
      restacked = unstacked.stack(future_stack=future_stack)
      if future_stack:
          # NA values in unstacked persist to restacked in version 3
          restacked = restacked.dropna(how="all")
      tm.assert_frame_equal(restacked, ymd)

      # more than 2 levels in the columns
      unstacked = ymd.unstack(1).unstack(1)

      result = unstacked.stack(1, future_stack=future_stack)
      expected = ymd.unstack()
      tm.assert_frame_equal(result, expected)

      result = unstacked.stack(2, future_stack=future_stack)
      expected = ymd.unstack(1)
      tm.assert_frame_equal(result, expected)

      result = unstacked.stack(0, future_stack=future_stack)
      expected = ymd.stack(future_stack=future_stack).unstack(1).unstack(1)
      tm.assert_frame_equal(result, expected)

      # not all levels present in each echelon
      unstacked = ymd.unstack(2).loc[:, ::3]
      stacked = unstacked.stack(future_stack=future_stack).stack(
          future_stack=future_stack
      )
      ymd_stacked = ymd.stack(future_stack=future_stack)
      if future_stack:
          # NA values in unstacked persist to restacked in version 3
          stacked = stacked.dropna(how="all")
          ymd_stacked = ymd_stacked.dropna(how="all")
      tm.assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))

      # stack with negative number
      result = ymd.unstack(0).stack(-2, future_stack=future_stack)
      expected = ymd.unstack(0).stack(0, future_stack=future_stack)
      tm.assert_equal(result, expected)
  # The ``else`` branch below calls ``stack`` with a falsy ``future_stack``;
  # sibling tests that do so ignore the stack deprecation message, so this
  # test does too.
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  @pytest.mark.parametrize(
      "idx, columns, exp_idx",
      [
          [
              list("abab"),
              ["1st", "2nd", "1st"],
              MultiIndex(
                  levels=[["a", "b"], ["1st", "2nd"]],
                  codes=[np.tile(np.arange(2).repeat(3), 2), np.tile([0, 1, 0], 4)],
              ),
          ],
          [
              MultiIndex.from_tuples((("a", 2), ("b", 1), ("a", 1), ("b", 2))),
              ["1st", "2nd", "1st"],
              MultiIndex(
                  levels=[["a", "b"], [1, 2], ["1st", "2nd"]],
                  codes=[
                      np.tile(np.arange(2).repeat(3), 2),
                      np.repeat([1, 0, 1], [3, 6, 3]),
                      np.tile([0, 1, 0], 4),
                  ],
              ),
          ],
      ],
  )
  def test_stack_duplicate_index(self, idx, columns, exp_idx, future_stack):
      """Stack a 4x3 frame whose column labels contain a duplicate.

      With ``future_stack`` truthy a ``ValueError`` is expected; otherwise
      the stacked Series must carry the non-unique index ``exp_idx``.
      """
      # GH10417
      df = DataFrame(
          np.arange(12).reshape(4, 3),
          index=idx,
          columns=columns,
      )
      if future_stack:
          msg = "Columns with duplicate values are not supported in stack"
          with pytest.raises(ValueError, match=msg):
              df.stack(future_stack=future_stack)
      else:
          result = df.stack(future_stack=future_stack)
          expected = Series(np.arange(12), index=exp_idx)
          tm.assert_series_equal(result, expected)
          assert result.index.is_unique is False
          li, ri = result.index, expected.index
          tm.assert_index_equal(li, ri)
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  def test_unstack_odd_failure(self, future_stack):
      """Unstack level 2 of a day/time/smoker frame and stack it back."""
      mi = MultiIndex.from_arrays(
          [
              ["Fri"] * 4 + ["Sat"] * 2 + ["Sun"] * 2 + ["Thu"] * 3,
              ["Dinner"] * 2 + ["Lunch"] * 2 + ["Dinner"] * 5 + ["Lunch"] * 2,
              ["No", "Yes"] * 4 + ["No", "No", "Yes"],
          ],
          names=["day", "time", "smoker"],
      )
      df = DataFrame(
          {
              "sum": np.arange(11, dtype="float64"),
              "len": np.arange(11, dtype="float64"),
          },
          index=mi,
      )
      # it works, #2100
      result = df.unstack(2)

      recons = result.stack(future_stack=future_stack)
      if future_stack:
          # NA values in unstacked persist to restacked in version 3
          recons = recons.dropna(how="all")
      tm.assert_frame_equal(recons, df)
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  def test_stack_mixed_dtype(self, multiindex_dataframe_random_data, future_stack):
      """Stack a transposed frame after adding a string-valued column."""
      frame = multiindex_dataframe_random_data

      df = frame.T
      # Add a column holding strings, then sort the columns by level 1.
      df["foo", "four"] = "foo"
      df = df.sort_index(level=1, axis=1)

      stacked = df.stack(future_stack=future_stack)
      result = df["foo"].stack(future_stack=future_stack).sort_index()
      tm.assert_series_equal(stacked["foo"], result, check_names=False)
      assert result.name is None
      assert stacked["bar"].dtype == np.float64
  # ``stack`` is called with the parametrized ``future_stack``; sibling
  # tests that do so ignore the stack deprecation message, so this test
  # does too.
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  def test_unstack_bug(self, future_stack):
      """Unstack and restack the result of a four-key ``groupby(...).apply(len)``."""
      df = DataFrame(
          {
              "state": ["naive", "naive", "naive", "active", "active", "active"],
              "exp": ["a", "b", "b", "b", "a", "a"],
              "barcode": [1, 2, 3, 4, 1, 3],
              "v": ["hi", "hi", "bye", "bye", "bye", "peace"],
              "extra": np.arange(6.0),
          }
      )

      msg = "DataFrameGroupBy.apply operated on the grouping columns"
      with tm.assert_produces_warning(FutureWarning, match=msg):
          result = df.groupby(["state", "exp", "barcode", "v"]).apply(len)

      unstacked = result.unstack()
      restacked = unstacked.stack(future_stack=future_stack)
      # Compare against the original values aligned to the restacked index
      # and cast to float.
      tm.assert_series_equal(restacked, result.reindex(restacked.index).astype(float))
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  def test_stack_unstack_preserve_names(
      self, multiindex_dataframe_random_data, future_stack
  ):
      """Index and column level names must survive ``unstack`` then ``stack``."""
      frame = multiindex_dataframe_random_data

      unstacked = frame.unstack()
      assert unstacked.index.name == "first"
      assert unstacked.columns.names == ["exp", "second"]

      restacked = unstacked.stack(future_stack=future_stack)
      assert restacked.index.names == frame.index.names
  # ``stack`` may be called with a falsy ``future_stack`` here; sibling
  # tests that do so ignore the stack deprecation message, so this test
  # does too.
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  @pytest.mark.parametrize("method", ["stack", "unstack"])
  def test_stack_unstack_wrong_level_name(
      self, method, multiindex_dataframe_random_data, future_stack
  ):
      """An unknown level name must raise ``KeyError`` for ``stack``/``unstack``."""
      # GH 18303 - wrong level name should raise
      frame = multiindex_dataframe_random_data

      # A DataFrame with flat axes:
      df = frame.loc["foo"]

      # ``future_stack`` is only forwarded to ``stack``.
      kwargs = {"future_stack": future_stack} if method == "stack" else {}
      with pytest.raises(KeyError, match="does not match index name"):
          getattr(df, method)("mistake", **kwargs)

      if method == "unstack":
          # Same on a Series:
          s = df.iloc[:, 0]
          with pytest.raises(KeyError, match="does not match index name"):
              getattr(s, method)("mistake", **kwargs)
  1646. def test_unstack_level_name(self, multiindex_dataframe_random_data):
  1647. frame = multiindex_dataframe_random_data
  1648. result = frame.unstack("second")
  1649. expected = frame.unstack(level=1)
  1650. tm.assert_frame_equal(result, expected)
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  def test_stack_level_name(self, multiindex_dataframe_random_data, future_stack):
      """Stacking by the level name "exp" must match stacking by position."""
      frame = multiindex_dataframe_random_data

      unstacked = frame.unstack("second")
      result = unstacked.stack("exp", future_stack=future_stack)
      expected = frame.unstack().stack(0, future_stack=future_stack)
      tm.assert_frame_equal(result, expected)

      result = frame.stack("exp", future_stack=future_stack)
      expected = frame.stack(future_stack=future_stack)
      tm.assert_series_equal(result, expected)
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  def test_stack_unstack_multiple(
      self, multiindex_year_month_day_dataframe_random_data, future_stack
  ):
      """Unstack/stack several levels at once, by name and by position."""
      ymd = multiindex_year_month_day_dataframe_random_data

      unstacked = ymd.unstack(["year", "month"])
      expected = ymd.unstack("year").unstack("month")
      tm.assert_frame_equal(unstacked, expected)
      assert unstacked.columns.names == expected.columns.names

      # series
      s = ymd["A"]
      s_unstacked = s.unstack(["year", "month"])
      tm.assert_frame_equal(s_unstacked, expected["A"])

      restacked = unstacked.stack(["year", "month"], future_stack=future_stack)
      if future_stack:
          # NA values in unstacked persist to restacked in version 3
          restacked = restacked.dropna(how="all")
      # Reorder the index levels and sort before comparing with ``ymd``.
      restacked = restacked.swaplevel(0, 1).swaplevel(1, 2)
      restacked = restacked.sort_index(level=0)

      tm.assert_frame_equal(restacked, ymd)
      assert restacked.index.names == ymd.index.names

      # GH #451
      unstacked = ymd.unstack([1, 2])
      expected = ymd.unstack(1).unstack(1).dropna(axis=1, how="all")
      tm.assert_frame_equal(unstacked, expected)

      unstacked = ymd.unstack([2, 1])
      expected = ymd.unstack(2).unstack(1).dropna(axis=1, how="all")
      tm.assert_frame_equal(unstacked, expected.loc[:, unstacked.columns])
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  def test_stack_names_and_numbers(
      self, multiindex_year_month_day_dataframe_random_data, future_stack
  ):
      """Mixing a level number and a level name in ``stack`` must raise."""
      ymd = multiindex_year_month_day_dataframe_random_data

      unstacked = ymd.unstack(["year", "month"])

      # Can't use mixture of names and numbers to stack
      with pytest.raises(ValueError, match="level should contain"):
          unstacked.stack([0, "month"], future_stack=future_stack)
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  def test_stack_multiple_out_of_bounds(
      self, multiindex_year_month_day_dataframe_random_data, future_stack
  ):
      """Level numbers outside the valid range must raise ``IndexError``."""
      # nlevels == 3
      ymd = multiindex_year_month_day_dataframe_random_data

      unstacked = ymd.unstack(["year", "month"])

      with pytest.raises(IndexError, match="Too many levels"):
          unstacked.stack([2, 3], future_stack=future_stack)
      with pytest.raises(IndexError, match="not a valid level number"):
          unstacked.stack([-4, -3], future_stack=future_stack)
  1717. def test_unstack_period_series(self):
  1718. # GH4342
  1719. idx1 = pd.PeriodIndex(
  1720. ["2013-01", "2013-01", "2013-02", "2013-02", "2013-03", "2013-03"],
  1721. freq="M",
  1722. name="period",
  1723. )
  1724. idx2 = Index(["A", "B"] * 3, name="str")
  1725. value = [1, 2, 3, 4, 5, 6]
  1726. idx = MultiIndex.from_arrays([idx1, idx2])
  1727. s = Series(value, index=idx)
  1728. result1 = s.unstack()
  1729. result2 = s.unstack(level=1)
  1730. result3 = s.unstack(level=0)
  1731. e_idx = pd.PeriodIndex(
  1732. ["2013-01", "2013-02", "2013-03"], freq="M", name="period"
  1733. )
  1734. expected = DataFrame(
  1735. {"A": [1, 3, 5], "B": [2, 4, 6]}, index=e_idx, columns=["A", "B"]
  1736. )
  1737. expected.columns.name = "str"
  1738. tm.assert_frame_equal(result1, expected)
  1739. tm.assert_frame_equal(result2, expected)
  1740. tm.assert_frame_equal(result3, expected.T)
  1741. idx1 = pd.PeriodIndex(
  1742. ["2013-01", "2013-01", "2013-02", "2013-02", "2013-03", "2013-03"],
  1743. freq="M",
  1744. name="period1",
  1745. )
  1746. idx2 = pd.PeriodIndex(
  1747. ["2013-12", "2013-11", "2013-10", "2013-09", "2013-08", "2013-07"],
  1748. freq="M",
  1749. name="period2",
  1750. )
  1751. idx = MultiIndex.from_arrays([idx1, idx2])
  1752. s = Series(value, index=idx)
  1753. result1 = s.unstack()
  1754. result2 = s.unstack(level=1)
  1755. result3 = s.unstack(level=0)
  1756. e_idx = pd.PeriodIndex(
  1757. ["2013-01", "2013-02", "2013-03"], freq="M", name="period1"
  1758. )
  1759. e_cols = pd.PeriodIndex(
  1760. ["2013-07", "2013-08", "2013-09", "2013-10", "2013-11", "2013-12"],
  1761. freq="M",
  1762. name="period2",
  1763. )
  1764. expected = DataFrame(
  1765. [
  1766. [np.nan, np.nan, np.nan, np.nan, 2, 1],
  1767. [np.nan, np.nan, 4, 3, np.nan, np.nan],
  1768. [6, 5, np.nan, np.nan, np.nan, np.nan],
  1769. ],
  1770. index=e_idx,
  1771. columns=e_cols,
  1772. )
  1773. tm.assert_frame_equal(result1, expected)
  1774. tm.assert_frame_equal(result2, expected)
  1775. tm.assert_frame_equal(result3, expected.T)
  1776. def test_unstack_period_frame(self):
  1777. # GH4342
  1778. idx1 = pd.PeriodIndex(
  1779. ["2014-01", "2014-02", "2014-02", "2014-02", "2014-01", "2014-01"],
  1780. freq="M",
  1781. name="period1",
  1782. )
  1783. idx2 = pd.PeriodIndex(
  1784. ["2013-12", "2013-12", "2014-02", "2013-10", "2013-10", "2014-02"],
  1785. freq="M",
  1786. name="period2",
  1787. )
  1788. value = {"A": [1, 2, 3, 4, 5, 6], "B": [6, 5, 4, 3, 2, 1]}
  1789. idx = MultiIndex.from_arrays([idx1, idx2])
  1790. df = DataFrame(value, index=idx)
  1791. result1 = df.unstack()
  1792. result2 = df.unstack(level=1)
  1793. result3 = df.unstack(level=0)
  1794. e_1 = pd.PeriodIndex(["2014-01", "2014-02"], freq="M", name="period1")
  1795. e_2 = pd.PeriodIndex(
  1796. ["2013-10", "2013-12", "2014-02", "2013-10", "2013-12", "2014-02"],
  1797. freq="M",
  1798. name="period2",
  1799. )
  1800. e_cols = MultiIndex.from_arrays(["A A A B B B".split(), e_2])
  1801. expected = DataFrame(
  1802. [[5, 1, 6, 2, 6, 1], [4, 2, 3, 3, 5, 4]], index=e_1, columns=e_cols
  1803. )
  1804. tm.assert_frame_equal(result1, expected)
  1805. tm.assert_frame_equal(result2, expected)
  1806. e_1 = pd.PeriodIndex(
  1807. ["2014-01", "2014-02", "2014-01", "2014-02"], freq="M", name="period1"
  1808. )
  1809. e_2 = pd.PeriodIndex(
  1810. ["2013-10", "2013-12", "2014-02"], freq="M", name="period2"
  1811. )
  1812. e_cols = MultiIndex.from_arrays(["A A B B".split(), e_1])
  1813. expected = DataFrame(
  1814. [[5, 4, 2, 3], [1, 2, 6, 5], [6, 3, 1, 4]], index=e_2, columns=e_cols
  1815. )
  1816. tm.assert_frame_equal(result3, expected)
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  def test_stack_multiple_bug(self, future_stack, using_infer_string):
      """Resample an unstacked frame and stack the "ID" level back."""
      # bug when some uniques are not present in the data GH#3170
      id_col = ([1] * 3) + ([2] * 3)
      name = (["a"] * 3) + (["b"] * 3)
      date = pd.to_datetime(["2013-01-03", "2013-01-04", "2013-01-05"] * 2)
      var1 = np.random.default_rng(2).integers(0, 100, 6)
      df = DataFrame({"ID": id_col, "NAME": name, "DATE": date, "VAR1": var1})

      multi = df.set_index(["DATE", "ID"])
      multi.columns.name = "Params"
      unst = multi.unstack("ID")
      # The expected error message depends on ``using_infer_string``.
      msg = re.escape("agg function failed [how->mean,dtype->")
      if using_infer_string:
          msg = "dtype 'str' does not support operation 'mean'"
      with pytest.raises(TypeError, match=msg):
          unst.resample("W-THU").mean()
      down = unst.resample("W-THU").mean(numeric_only=True)
      rs = down.stack("ID", future_stack=future_stack)
      xp = (
          unst.loc[:, ["VAR1"]]
          .resample("W-THU")
          .mean()
          .stack("ID", future_stack=future_stack)
      )
      xp.columns.name = "Params"
      tm.assert_frame_equal(rs, xp)
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  def test_stack_dropna(self, future_stack):
      """Check how ``stack`` treats the ``dropna`` argument.

      With ``future_stack`` truthy, passing ``dropna=True`` is expected to
      raise ``ValueError``; otherwise it must drop the NA rows.
      """
      # GH#3997
      df = DataFrame({"A": ["a1", "a2"], "B": ["b1", "b2"], "C": [1, 1]})
      df = df.set_index(["A", "B"])

      # ``dropna`` is left at ``lib.no_default`` when ``future_stack`` is truthy.
      dropna = False if not future_stack else lib.no_default
      stacked = df.unstack().stack(dropna=dropna, future_stack=future_stack)
      assert len(stacked) > len(stacked.dropna())

      if future_stack:
          with pytest.raises(ValueError, match="dropna must be unspecified"):
              df.unstack().stack(dropna=True, future_stack=future_stack)
      else:
          stacked = df.unstack().stack(dropna=True, future_stack=future_stack)
          tm.assert_frame_equal(stacked, stacked.dropna())
  1861. def test_unstack_multiple_hierarchical(self, future_stack):
  1862. df = DataFrame(
  1863. index=[
  1864. [0, 0, 0, 0, 1, 1, 1, 1],
  1865. [0, 0, 1, 1, 0, 0, 1, 1],
  1866. [0, 1, 0, 1, 0, 1, 0, 1],
  1867. ],
  1868. columns=[[0, 0, 1, 1], [0, 1, 0, 1]],
  1869. )
  1870. df.index.names = ["a", "b", "c"]
  1871. df.columns.names = ["d", "e"]
  1872. # it works!
  1873. df.unstack(["b", "c"])
  1874. def test_unstack_sparse_keyspace(self):
  1875. # memory problems with naive impl GH#2278
  1876. # Generate Long File & Test Pivot
  1877. NUM_ROWS = 1000
  1878. df = DataFrame(
  1879. {
  1880. "A": np.random.default_rng(2).integers(100, size=NUM_ROWS),
  1881. "B": np.random.default_rng(3).integers(300, size=NUM_ROWS),
  1882. "C": np.random.default_rng(4).integers(-7, 7, size=NUM_ROWS),
  1883. "D": np.random.default_rng(5).integers(-19, 19, size=NUM_ROWS),
  1884. "E": np.random.default_rng(6).integers(3000, size=NUM_ROWS),
  1885. "F": np.random.default_rng(7).standard_normal(NUM_ROWS),
  1886. }
  1887. )
  1888. idf = df.set_index(["A", "B", "C", "D", "E"])
  1889. # it works! is sufficient
  1890. idf.unstack("E")
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  def test_unstack_unobserved_keys(self, future_stack):
      """Unstack an index whose second level has values no code refers to."""
      # related to GH#2278 refactoring
      # Only codes 0 and 2 of the four second-level values are used.
      levels = [[0, 1], [0, 1, 2, 3]]
      codes = [[0, 0, 1, 1], [0, 2, 0, 2]]

      index = MultiIndex(levels, codes)

      df = DataFrame(np.random.default_rng(2).standard_normal((4, 2)), index=index)

      result = df.unstack()
      assert len(result.columns) == 4

      recons = result.stack(future_stack=future_stack)
      tm.assert_frame_equal(recons, df)
  @pytest.mark.slow
  def test_unstack_number_of_levels_larger_than_int32(self, monkeypatch):
      """Unstacking a 2**16 x 2**16 index must emit a ``PerformanceWarning``.

      ``_Unstacker`` is patched with a subclass that raises right after the
      parent ``__init__`` so the result itself is never computed.
      """
      # GH#20601
      # GH 26314: Change ValueError to PerformanceWarning

      class MockUnstacker(reshape_lib._Unstacker):
          def __init__(self, *args, **kwargs) -> None:
              # __init__ will raise the warning
              super().__init__(*args, **kwargs)
              raise Exception("Don't compute final result.")

      with monkeypatch.context() as m:
          m.setattr(reshape_lib, "_Unstacker", MockUnstacker)
          df = DataFrame(
              np.zeros((2**16, 2)),
              index=[np.arange(2**16), np.arange(2**16)],
          )
          msg = "The following operation may generate"
          with tm.assert_produces_warning(PerformanceWarning, match=msg):
              with pytest.raises(Exception, match="Don't compute final result."):
                  df.unstack()
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  @pytest.mark.parametrize(
      "levels",
      itertools.chain.from_iterable(
          itertools.product(itertools.permutations([0, 1, 2], width), repeat=2)
          for width in [2, 3]
      ),
  )
  @pytest.mark.parametrize("stack_lev", range(2))
  @pytest.mark.parametrize("sort", [True, False])
  def test_stack_order_with_unsorted_levels(
      self, levels, stack_lev, sort, future_stack
  ):
      """Every cell of a one-row frame must be found at its stacked location.

      ``levels`` supplies the (possibly unsorted) column levels and
      ``stack_lev`` selects which of the two column levels is stacked.
      """
      # GH#16323
      # deep check for 1-row case
      columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
      df = DataFrame(columns=columns, data=[range(4)])
      # ``sort`` is only forwarded when ``future_stack`` is falsy.
      kwargs = {} if future_stack else {"sort": sort}
      df_stacked = df.stack(stack_lev, future_stack=future_stack, **kwargs)
      for row in df.index:
          for col in df.columns:
              expected = df.loc[row, col]
              # The stacked level moves into the row key; the other level
              # stays as the column key.
              result_row = row, col[stack_lev]
              result_col = col[1 - stack_lev]
              result = df_stacked.loc[result_row, result_col]
              assert result == expected
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  def test_stack_order_with_unsorted_levels_multi_row(self, future_stack):
      """Every cell of a five-row frame must be found at its stacked location."""
      # GH#16323
      # check multi-row case
      mi = MultiIndex(
          levels=[["A", "C", "B"], ["B", "A", "C"]],
          codes=[np.repeat(range(3), 3), np.tile(range(3), 3)],
      )
      df = DataFrame(
          columns=mi, index=range(5), data=np.arange(5 * len(mi)).reshape(5, -1)
      )
      # Stack once up front; the result does not depend on the cell being
      # checked, so it need not be recomputed for every (row, col) pair.
      stacked = df.stack(0, future_stack=future_stack)
      assert all(
          df.loc[row, col] == stacked.loc[(row, col[0]), col[1]]
          for row in df.index
          for col in df.columns
      )
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  def test_stack_order_with_unsorted_levels_multi_row_2(self, future_stack):
      """Stack level 1 of a four-row frame with an unsorted second column level."""
      # GH#53636
      levels = ((0, 1), (1, 0))
      stack_lev = 1
      columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
      df = DataFrame(columns=columns, data=[range(4)], index=[1, 0, 2, 3])
      # ``sort`` is only forwarded when ``future_stack`` is falsy.
      kwargs = {} if future_stack else {"sort": True}
      result = df.stack(stack_lev, future_stack=future_stack, **kwargs)
      expected_index = MultiIndex(
          levels=[[0, 1, 2, 3], [0, 1]],
          codes=[[1, 1, 0, 0, 2, 2, 3, 3], [1, 0, 1, 0, 1, 0, 1, 0]],
      )
      expected = DataFrame(
          {
              0: [0, 1, 0, 1, 0, 1, 0, 1],
              1: [2, 3, 2, 3, 2, 3, 2, 3],
          },
          index=expected_index,
      )
      tm.assert_frame_equal(result, expected)
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  def test_stack_unstack_unordered_multiindex(self, future_stack):
      """Stack then unstack both column levels of a reindexed two-level frame."""
      # GH# 18265
      values = np.arange(5)
      data = np.vstack(
          [
              [f"b{x}" for x in values],  # b0, b1, ..
              [f"a{x}" for x in values],  # a0, a1, ..
          ]
      )
      df = DataFrame(data.T, columns=["b", "a"])
      df.columns.name = "first"
      second_level_dict = {"x": df}
      multi_level_df = pd.concat(second_level_dict, axis=1)
      multi_level_df.columns.names = ["second", "first"]
      # Reorder the columns by their sorted labels before the round trip.
      df = multi_level_df.reindex(sorted(multi_level_df.columns), axis=1)
      result = df.stack(["first", "second"], future_stack=future_stack).unstack(
          ["first", "second"]
      )
      expected = DataFrame(
          [["a0", "b0"], ["a1", "b1"], ["a2", "b2"], ["a3", "b3"], ["a4", "b4"]],
          index=[0, 1, 2, 3, 4],
          columns=MultiIndex.from_tuples(
              [("a", "x"), ("b", "x")], names=["first", "second"]
          ),
      )
      tm.assert_frame_equal(result, expected)
  2022. def test_unstack_preserve_types(
  2023. self, multiindex_year_month_day_dataframe_random_data, using_infer_string
  2024. ):
  2025. # GH#403
  2026. ymd = multiindex_year_month_day_dataframe_random_data
  2027. ymd["E"] = "foo"
  2028. ymd["F"] = 2
  2029. unstacked = ymd.unstack("month")
  2030. assert unstacked["A", 1].dtype == np.float64
  2031. assert (
  2032. unstacked["E", 1].dtype == np.object_
  2033. if not using_infer_string
  2034. else "string"
  2035. )
  2036. assert unstacked["F", 1].dtype == np.float64
  # ``stack`` is called with the parametrized ``future_stack``; sibling
  # tests that do so ignore the stack deprecation message, so this test
  # does too.
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  def test_unstack_group_index_overflow(self, future_stack):
      """Unstack a two-valued level out of a nine-level index of 1000 rows.

      The two-valued level is placed last, first and in the middle; each
      time the result must have shape (500, 2).
      """
      codes = np.tile(np.arange(500), 2)
      level = np.arange(500)

      index = MultiIndex(
          levels=[level] * 8 + [[0, 1]],
          codes=[codes] * 8 + [np.arange(2).repeat(500)],
      )

      s = Series(np.arange(1000), index=index)
      result = s.unstack()
      assert result.shape == (500, 2)

      # test roundtrip
      stacked = result.stack(future_stack=future_stack)
      tm.assert_series_equal(s, stacked.reindex(s.index))

      # put it at beginning
      index = MultiIndex(
          levels=[[0, 1]] + [level] * 8,
          codes=[np.arange(2).repeat(500)] + [codes] * 8,
      )

      s = Series(np.arange(1000), index=index)
      result = s.unstack(0)
      assert result.shape == (500, 2)

      # put it in middle
      index = MultiIndex(
          levels=[level] * 4 + [[0, 1]] + [level] * 4,
          codes=([codes] * 4 + [np.arange(2).repeat(500)] + [codes] * 4),
      )

      s = Series(np.arange(1000), index=index)
      result = s.unstack(4)
      assert result.shape == (500, 2)
  2066. def test_unstack_with_missing_int_cast_to_float(self, using_array_manager):
  2067. # https://github.com/pandas-dev/pandas/issues/37115
  2068. df = DataFrame(
  2069. {
  2070. "a": ["A", "A", "B"],
  2071. "b": ["ca", "cb", "cb"],
  2072. "v": [10] * 3,
  2073. }
  2074. ).set_index(["a", "b"])
  2075. # add another int column to get 2 blocks
  2076. df["is_"] = 1
  2077. if not using_array_manager:
  2078. assert len(df._mgr.blocks) == 2
  2079. result = df.unstack("b")
  2080. result[("is_", "ca")] = result[("is_", "ca")].fillna(0)
  2081. expected = DataFrame(
  2082. [[10.0, 10.0, 1.0, 1.0], [np.nan, 10.0, 0.0, 1.0]],
  2083. index=Index(["A", "B"], name="a"),
  2084. columns=MultiIndex.from_tuples(
  2085. [("v", "ca"), ("v", "cb"), ("is_", "ca"), ("is_", "cb")],
  2086. names=[None, "b"],
  2087. ),
  2088. )
  2089. if using_array_manager:
  2090. # INFO(ArrayManager) with ArrayManager preserve dtype where possible
  2091. expected[("v", "cb")] = expected[("v", "cb")].astype("int64")
  2092. expected[("is_", "cb")] = expected[("is_", "cb")].astype("int64")
  2093. tm.assert_frame_equal(result, expected)
  2094. def test_unstack_with_level_has_nan(self):
  2095. # GH 37510
  2096. df1 = DataFrame(
  2097. {
  2098. "L1": [1, 2, 3, 4],
  2099. "L2": [3, 4, 1, 2],
  2100. "L3": [1, 1, 1, 1],
  2101. "x": [1, 2, 3, 4],
  2102. }
  2103. )
  2104. df1 = df1.set_index(["L1", "L2", "L3"])
  2105. new_levels = ["n1", "n2", "n3", None]
  2106. df1.index = df1.index.set_levels(levels=new_levels, level="L1")
  2107. df1.index = df1.index.set_levels(levels=new_levels, level="L2")
  2108. result = df1.unstack("L3")[("x", 1)].sort_index().index
  2109. expected = MultiIndex(
  2110. levels=[["n1", "n2", "n3", None], ["n1", "n2", "n3", None]],
  2111. codes=[[0, 1, 2, 3], [2, 3, 0, 1]],
  2112. names=["L1", "L2"],
  2113. )
  2114. tm.assert_index_equal(result, expected)
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  def test_stack_nan_in_multiindex_columns(self, future_stack):
      """Stack level 2 of three-level columns whose first tuple contains None."""
      # GH#39481
      df = DataFrame(
          np.zeros([1, 5]),
          columns=MultiIndex.from_tuples(
              [
                  (0, None, None),
                  (0, 2, 0),
                  (0, 2, 1),
                  (0, 3, 0),
                  (0, 3, 1),
              ],
          ),
      )
      result = df.stack(2, future_stack=future_stack)
      # The expected index/columns are built differently per ``future_stack``.
      if future_stack:
          index = MultiIndex(levels=[[0], [0.0, 1.0]], codes=[[0, 0, 0], [-1, 0, 1]])
          columns = MultiIndex(levels=[[0], [2, 3]], codes=[[0, 0, 0], [-1, 0, 1]])
      else:
          index = Index([(0, None), (0, 0), (0, 1)])
          columns = Index([(0, None), (0, 2), (0, 3)])

      expected = DataFrame(
          [[0.0, np.nan, np.nan], [np.nan, 0.0, 0.0], [np.nan, 0.0, 0.0]],
          index=index,
          columns=columns,
      )
      tm.assert_frame_equal(result, expected)
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  def test_multi_level_stack_categorical(self, future_stack):
      """Stack two categorical column levels of a 2x4 frame.

      The expected row order differs between the two ``future_stack``
      settings.
      """
      # GH 15239
      midx = MultiIndex.from_arrays(
          [
              ["A"] * 2 + ["B"] * 2,
              pd.Categorical(list("abab")),
              pd.Categorical(list("ccdd")),
          ]
      )
      df = DataFrame(np.arange(8).reshape(2, 4), columns=midx)
      result = df.stack([1, 2], future_stack=future_stack)
      if future_stack:
          expected = DataFrame(
              [
                  [0, np.nan],
                  [1, np.nan],
                  [np.nan, 2],
                  [np.nan, 3],
                  [4, np.nan],
                  [5, np.nan],
                  [np.nan, 6],
                  [np.nan, 7],
              ],
              columns=["A", "B"],
              index=MultiIndex.from_arrays(
                  [
                      [0] * 4 + [1] * 4,
                      pd.Categorical(list("abababab")),
                      pd.Categorical(list("ccddccdd")),
                  ]
              ),
          )
      else:
          expected = DataFrame(
              [
                  [0, np.nan],
                  [np.nan, 2],
                  [1, np.nan],
                  [np.nan, 3],
                  [4, np.nan],
                  [np.nan, 6],
                  [5, np.nan],
                  [np.nan, 7],
              ],
              columns=["A", "B"],
              index=MultiIndex.from_arrays(
                  [
                      [0] * 4 + [1] * 4,
                      pd.Categorical(list("aabbaabb")),
                      pd.Categorical(list("cdcdcdcd")),
                  ]
              ),
          )

      tm.assert_frame_equal(result, expected)
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  def test_stack_nan_level(self, future_stack):
      """Stack columns whose "Lower" level contains a NaN label."""
      # GH 9406
      level_names = ["Num", "Lower"]
      nan_columns = MultiIndex.from_tuples(
          [("A", np.nan), ("B", "b")], names=["Upper", "Lower"]
      )
      df_nan = DataFrame(
          np.arange(4).reshape(2, 2),
          columns=nan_columns,
          index=Index([0, 1], name="Num"),
          dtype=np.float64,
      )

      stacked = df_nan.stack(future_stack=future_stack)

      if not future_stack:
          # future_stack=False: expected index built from (Num, Lower) tuples.
          expected_index = MultiIndex.from_tuples(
              [(num, lower) for num in (0, 1) for lower in (np.nan, "b")],
              names=level_names,
          )
      else:
          # future_stack=True: expected index built from explicit levels/codes.
          expected_index = MultiIndex(
              levels=[[0, 1], [np.nan, "b"]],
              codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
              names=level_names,
          )

      expected_values = [[0.0, np.nan], [np.nan, 1], [2.0, np.nan], [np.nan, 3.0]]
      expected = DataFrame(
          expected_values,
          columns=Index(["A", "B"], name="Upper"),
          index=expected_index,
      )
      tm.assert_frame_equal(stacked, expected)
  2232. def test_unstack_categorical_columns(self):
  2233. # GH 14018
  2234. idx = MultiIndex.from_product([["A"], [0, 1]])
  2235. df = DataFrame({"cat": pd.Categorical(["a", "b"])}, index=idx)
  2236. result = df.unstack()
  2237. expected = DataFrame(
  2238. {
  2239. 0: pd.Categorical(["a"], categories=["a", "b"]),
  2240. 1: pd.Categorical(["b"], categories=["a", "b"]),
  2241. },
  2242. index=["A"],
  2243. )
  2244. expected.columns = MultiIndex.from_tuples([("cat", 0), ("cat", 1)])
  2245. tm.assert_frame_equal(result, expected)
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  def test_stack_unsorted(self, future_stack):
      """Stacking as-is and stacking after sorting the columns must agree."""
      # GH 16925
      pae_labels = ["ITA", "FRA"]
      var_labels = ["A1", "A2"]
      typ_labels = ["CRT", "DBT", "NET"]
      row_index = MultiIndex.from_product(
          [pae_labels, var_labels, typ_labels], names=["PAE", "VAR", "TYP"]
      )
      values = list(range(len(row_index)))

      long_frame = DataFrame(data=values, index=row_index, columns=["VALUE"])
      wide_frame = long_frame.unstack(["VAR", "TYP"])
      wide_frame.columns = wide_frame.columns.droplevel(0)
      # Assign a column under a "VAR" label ("A0") that is not in var_labels.
      wide_frame.loc[:, ("A0", "NET")] = 9999

      result = wide_frame.stack(["VAR", "TYP"], future_stack=future_stack)
      result = result.sort_index()

      column_sorted = wide_frame.sort_index(axis=1)
      expected = column_sorted.stack(["VAR", "TYP"], future_stack=future_stack)
      expected = expected.sort_index()

      tm.assert_series_equal(result, expected)
  @pytest.mark.filterwarnings(
      "ignore:The previous implementation of stack is deprecated"
  )
  def test_stack_nullable_dtype(self, future_stack):
      """Stack the "station" level of Int64 / Float64 extension-dtype columns."""
      # GH#43561
      time_index = Index([1, 2, 3], name="time")
      station_element = MultiIndex.from_product(
          [["54511", "54515"], ["r", "t_mean"]], names=["station", "element"]
      )
      raw = np.array([[50, 226, 10, 215], [10, 215, 9, 220], [305, 232, 111, 220]])
      frame = DataFrame(
          raw, columns=station_element, index=time_index, dtype=pd.Int64Dtype()
      )

      # homogeneous case: every column is Int64, so the expectation is the
      # numpy-int64 stack result cast back to Int64
      result = frame.stack("station", future_stack=future_stack)
      numpy_backed = frame.astype(np.int64)
      expected = numpy_backed.stack("station", future_stack=future_stack)
      expected = expected.astype(pd.Int64Dtype())
      tm.assert_frame_equal(result, expected)

      # non-homogeneous case: the first column becomes Float64
      first_column = frame.columns[0]
      frame[first_column] = frame[first_column].astype(pd.Float64Dtype())
      result = frame.stack("station", future_stack=future_stack)

      r_values = pd.array(
          [50.0, 10.0, 10.0, 9.0, 305.0, 111.0], dtype=pd.Float64Dtype()
      )
      t_mean_values = pd.array(
          [226, 215, 215, 220, 232, 220], dtype=pd.Int64Dtype()
      )
      expected_index = MultiIndex.from_product(
          [time_index, station_element.levels[0]]
      )
      expected = DataFrame(
          {"r": r_values, "t_mean": t_mean_values}, index=expected_index
      )
      expected.columns.name = "element"
      tm.assert_frame_equal(result, expected)
  2301. def test_unstack_mixed_level_names(self):
  2302. # GH#48763
  2303. arrays = [["a", "a"], [1, 2], ["red", "blue"]]
  2304. idx = MultiIndex.from_arrays(arrays, names=("x", 0, "y"))
  2305. df = DataFrame({"m": [1, 2]}, index=idx)
  2306. result = df.unstack("x")
  2307. expected = DataFrame(
  2308. [[1], [2]],
  2309. columns=MultiIndex.from_tuples([("m", "a")], names=[None, "x"]),
  2310. index=MultiIndex.from_tuples([(1, "red"), (2, "blue")], names=[0, "y"]),
  2311. )
  2312. tm.assert_frame_equal(result, expected)
  2313. def test_stack_tuple_columns(future_stack):
  2314. # GH#54948 - test stack when the input has a non-MultiIndex with tuples
  2315. df = DataFrame(
  2316. [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=[("a", 1), ("a", 2), ("b", 1)]
  2317. )
  2318. result = df.stack(future_stack=future_stack)
  2319. expected = Series(
  2320. [1, 2, 3, 4, 5, 6, 7, 8, 9],
  2321. index=MultiIndex(
  2322. levels=[[0, 1, 2], [("a", 1), ("a", 2), ("b", 1)]],
  2323. codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
  2324. ),
  2325. )
  2326. tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize(
      "dtype, na_value",
      [
          ("float64", np.nan),
          ("Float64", np.nan),
          ("Float64", pd.NA),
          ("Int64", pd.NA),
      ],
  )
  @pytest.mark.parametrize("test_multiindex", [True, False])
  def test_stack_preserves_na(dtype, na_value, test_multiindex):
      """Stack a one-row frame whose index label(s) are a missing value."""
      # GH#56573

      def na_level():
          # A fresh single-element index holding only the missing value.
          return Index([na_value], dtype=dtype)

      n_index_levels = 2 if test_multiindex else 1

      if test_multiindex:
          index = MultiIndex.from_arrays([na_level(), na_level()])
      else:
          index = na_level()
      frame = DataFrame({"a": [1]}, index=index)

      stacked = frame.stack(future_stack=True)

      # Expected index: the original NA level(s) followed by the column label.
      expected_levels = [na_level() for _ in range(n_index_levels)]
      expected_levels.append(Index(["a"]))
      expected = Series(1, index=MultiIndex.from_arrays(expected_levels))
      tm.assert_series_equal(stacked, expected)
  2361. tm.assert_series_equal(result, expected)