test_melt.py 42 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258
  1. import re
  2. import numpy as np
  3. import pytest
  4. import pandas as pd
  5. from pandas import (
  6. DataFrame,
  7. Index,
  8. date_range,
  9. lreshape,
  10. melt,
  11. wide_to_long,
  12. )
  13. import pandas._testing as tm
  14. @pytest.fixture
  15. def df():
  16. res = DataFrame(
  17. np.random.default_rng(2).standard_normal((10, 4)),
  18. columns=Index(list("ABCD")),
  19. index=date_range("2000-01-01", periods=10, freq="B"),
  20. )
  21. res["id1"] = (res["A"] > 0).astype(np.int64)
  22. res["id2"] = (res["B"] > 0).astype(np.int64)
  23. return res
  24. @pytest.fixture
  25. def df1():
  26. res = DataFrame(
  27. [
  28. [1.067683, -1.110463, 0.20867],
  29. [-1.321405, 0.368915, -1.055342],
  30. [-0.807333, 0.08298, -0.873361],
  31. ]
  32. )
  33. res.columns = [list("ABC"), list("abc")]
  34. res.columns.names = ["CAP", "low"]
  35. return res
  36. @pytest.fixture
  37. def var_name():
  38. return "var"
  39. @pytest.fixture
  40. def value_name():
  41. return "val"
  42. class TestMelt:
  43. def test_top_level_method(self, df):
  44. result = melt(df)
  45. assert result.columns.tolist() == ["variable", "value"]
  46. def test_method_signatures(self, df, df1, var_name, value_name):
  47. tm.assert_frame_equal(df.melt(), melt(df))
  48. tm.assert_frame_equal(
  49. df.melt(id_vars=["id1", "id2"], value_vars=["A", "B"]),
  50. melt(df, id_vars=["id1", "id2"], value_vars=["A", "B"]),
  51. )
  52. tm.assert_frame_equal(
  53. df.melt(var_name=var_name, value_name=value_name),
  54. melt(df, var_name=var_name, value_name=value_name),
  55. )
  56. tm.assert_frame_equal(df1.melt(col_level=0), melt(df1, col_level=0))
  57. def test_default_col_names(self, df):
  58. result = df.melt()
  59. assert result.columns.tolist() == ["variable", "value"]
  60. result1 = df.melt(id_vars=["id1"])
  61. assert result1.columns.tolist() == ["id1", "variable", "value"]
  62. result2 = df.melt(id_vars=["id1", "id2"])
  63. assert result2.columns.tolist() == ["id1", "id2", "variable", "value"]
  64. def test_value_vars(self, df):
  65. result3 = df.melt(id_vars=["id1", "id2"], value_vars="A")
  66. assert len(result3) == 10
  67. result4 = df.melt(id_vars=["id1", "id2"], value_vars=["A", "B"])
  68. expected4 = DataFrame(
  69. {
  70. "id1": df["id1"].tolist() * 2,
  71. "id2": df["id2"].tolist() * 2,
  72. "variable": ["A"] * 10 + ["B"] * 10,
  73. "value": (df["A"].tolist() + df["B"].tolist()),
  74. },
  75. columns=["id1", "id2", "variable", "value"],
  76. )
  77. tm.assert_frame_equal(result4, expected4)
  78. @pytest.mark.parametrize("type_", (tuple, list, np.array))
  79. def test_value_vars_types(self, type_, df):
  80. # GH 15348
  81. expected = DataFrame(
  82. {
  83. "id1": df["id1"].tolist() * 2,
  84. "id2": df["id2"].tolist() * 2,
  85. "variable": ["A"] * 10 + ["B"] * 10,
  86. "value": (df["A"].tolist() + df["B"].tolist()),
  87. },
  88. columns=["id1", "id2", "variable", "value"],
  89. )
  90. result = df.melt(id_vars=["id1", "id2"], value_vars=type_(("A", "B")))
  91. tm.assert_frame_equal(result, expected)
  92. def test_vars_work_with_multiindex(self, df1):
  93. expected = DataFrame(
  94. {
  95. ("A", "a"): df1[("A", "a")],
  96. "CAP": ["B"] * len(df1),
  97. "low": ["b"] * len(df1),
  98. "value": df1[("B", "b")],
  99. },
  100. columns=[("A", "a"), "CAP", "low", "value"],
  101. )
  102. result = df1.melt(id_vars=[("A", "a")], value_vars=[("B", "b")])
  103. tm.assert_frame_equal(result, expected)
  104. @pytest.mark.parametrize(
  105. "id_vars, value_vars, col_level, expected",
  106. [
  107. (
  108. ["A"],
  109. ["B"],
  110. 0,
  111. DataFrame(
  112. {
  113. "A": {0: 1.067683, 1: -1.321405, 2: -0.807333},
  114. "CAP": {0: "B", 1: "B", 2: "B"},
  115. "value": {0: -1.110463, 1: 0.368915, 2: 0.08298},
  116. }
  117. ),
  118. ),
  119. (
  120. ["a"],
  121. ["b"],
  122. 1,
  123. DataFrame(
  124. {
  125. "a": {0: 1.067683, 1: -1.321405, 2: -0.807333},
  126. "low": {0: "b", 1: "b", 2: "b"},
  127. "value": {0: -1.110463, 1: 0.368915, 2: 0.08298},
  128. }
  129. ),
  130. ),
  131. ],
  132. )
  133. def test_single_vars_work_with_multiindex(
  134. self, id_vars, value_vars, col_level, expected, df1
  135. ):
  136. result = df1.melt(id_vars, value_vars, col_level=col_level)
  137. tm.assert_frame_equal(result, expected)
  138. @pytest.mark.parametrize(
  139. "id_vars, value_vars",
  140. [
  141. [("A", "a"), [("B", "b")]],
  142. [[("A", "a")], ("B", "b")],
  143. [("A", "a"), ("B", "b")],
  144. ],
  145. )
  146. def test_tuple_vars_fail_with_multiindex(self, id_vars, value_vars, df1):
  147. # melt should fail with an informative error message if
  148. # the columns have a MultiIndex and a tuple is passed
  149. # for id_vars or value_vars.
  150. msg = r"(id|value)_vars must be a list of tuples when columns are a MultiIndex"
  151. with pytest.raises(ValueError, match=msg):
  152. df1.melt(id_vars=id_vars, value_vars=value_vars)
  153. def test_custom_var_name(self, df, var_name):
  154. result5 = df.melt(var_name=var_name)
  155. assert result5.columns.tolist() == ["var", "value"]
  156. result6 = df.melt(id_vars=["id1"], var_name=var_name)
  157. assert result6.columns.tolist() == ["id1", "var", "value"]
  158. result7 = df.melt(id_vars=["id1", "id2"], var_name=var_name)
  159. assert result7.columns.tolist() == ["id1", "id2", "var", "value"]
  160. result8 = df.melt(id_vars=["id1", "id2"], value_vars="A", var_name=var_name)
  161. assert result8.columns.tolist() == ["id1", "id2", "var", "value"]
  162. result9 = df.melt(
  163. id_vars=["id1", "id2"], value_vars=["A", "B"], var_name=var_name
  164. )
  165. expected9 = DataFrame(
  166. {
  167. "id1": df["id1"].tolist() * 2,
  168. "id2": df["id2"].tolist() * 2,
  169. var_name: ["A"] * 10 + ["B"] * 10,
  170. "value": (df["A"].tolist() + df["B"].tolist()),
  171. },
  172. columns=["id1", "id2", var_name, "value"],
  173. )
  174. tm.assert_frame_equal(result9, expected9)
  175. def test_custom_value_name(self, df, value_name):
  176. result10 = df.melt(value_name=value_name)
  177. assert result10.columns.tolist() == ["variable", "val"]
  178. result11 = df.melt(id_vars=["id1"], value_name=value_name)
  179. assert result11.columns.tolist() == ["id1", "variable", "val"]
  180. result12 = df.melt(id_vars=["id1", "id2"], value_name=value_name)
  181. assert result12.columns.tolist() == ["id1", "id2", "variable", "val"]
  182. result13 = df.melt(
  183. id_vars=["id1", "id2"], value_vars="A", value_name=value_name
  184. )
  185. assert result13.columns.tolist() == ["id1", "id2", "variable", "val"]
  186. result14 = df.melt(
  187. id_vars=["id1", "id2"], value_vars=["A", "B"], value_name=value_name
  188. )
  189. expected14 = DataFrame(
  190. {
  191. "id1": df["id1"].tolist() * 2,
  192. "id2": df["id2"].tolist() * 2,
  193. "variable": ["A"] * 10 + ["B"] * 10,
  194. value_name: (df["A"].tolist() + df["B"].tolist()),
  195. },
  196. columns=["id1", "id2", "variable", value_name],
  197. )
  198. tm.assert_frame_equal(result14, expected14)
  199. def test_custom_var_and_value_name(self, df, value_name, var_name):
  200. result15 = df.melt(var_name=var_name, value_name=value_name)
  201. assert result15.columns.tolist() == ["var", "val"]
  202. result16 = df.melt(id_vars=["id1"], var_name=var_name, value_name=value_name)
  203. assert result16.columns.tolist() == ["id1", "var", "val"]
  204. result17 = df.melt(
  205. id_vars=["id1", "id2"], var_name=var_name, value_name=value_name
  206. )
  207. assert result17.columns.tolist() == ["id1", "id2", "var", "val"]
  208. result18 = df.melt(
  209. id_vars=["id1", "id2"],
  210. value_vars="A",
  211. var_name=var_name,
  212. value_name=value_name,
  213. )
  214. assert result18.columns.tolist() == ["id1", "id2", "var", "val"]
  215. result19 = df.melt(
  216. id_vars=["id1", "id2"],
  217. value_vars=["A", "B"],
  218. var_name=var_name,
  219. value_name=value_name,
  220. )
  221. expected19 = DataFrame(
  222. {
  223. "id1": df["id1"].tolist() * 2,
  224. "id2": df["id2"].tolist() * 2,
  225. var_name: ["A"] * 10 + ["B"] * 10,
  226. value_name: (df["A"].tolist() + df["B"].tolist()),
  227. },
  228. columns=["id1", "id2", var_name, value_name],
  229. )
  230. tm.assert_frame_equal(result19, expected19)
  231. df20 = df.copy()
  232. df20.columns.name = "foo"
  233. result20 = df20.melt()
  234. assert result20.columns.tolist() == ["foo", "value"]
  235. @pytest.mark.parametrize("col_level", [0, "CAP"])
  236. def test_col_level(self, col_level, df1):
  237. res = df1.melt(col_level=col_level)
  238. assert res.columns.tolist() == ["CAP", "value"]
  239. def test_multiindex(self, df1):
  240. res = df1.melt()
  241. assert res.columns.tolist() == ["CAP", "low", "value"]
  242. @pytest.mark.parametrize(
  243. "col",
  244. [
  245. pd.Series(date_range("2010", periods=5, tz="US/Pacific")),
  246. pd.Series(["a", "b", "c", "a", "d"], dtype="category"),
  247. pd.Series([0, 1, 0, 0, 0]),
  248. ],
  249. )
  250. def test_pandas_dtypes(self, col):
  251. # GH 15785
  252. df = DataFrame(
  253. {"klass": range(5), "col": col, "attr1": [1, 0, 0, 0, 0], "attr2": col}
  254. )
  255. expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col], ignore_index=True)
  256. result = melt(
  257. df, id_vars=["klass", "col"], var_name="attribute", value_name="value"
  258. )
  259. expected = DataFrame(
  260. {
  261. 0: list(range(5)) * 2,
  262. 1: pd.concat([col] * 2, ignore_index=True),
  263. 2: ["attr1"] * 5 + ["attr2"] * 5,
  264. 3: expected_value,
  265. }
  266. )
  267. expected.columns = ["klass", "col", "attribute", "value"]
  268. tm.assert_frame_equal(result, expected)
  269. def test_preserve_category(self):
  270. # GH 15853
  271. data = DataFrame({"A": [1, 2], "B": pd.Categorical(["X", "Y"])})
  272. result = melt(data, ["B"], ["A"])
  273. expected = DataFrame(
  274. {"B": pd.Categorical(["X", "Y"]), "variable": ["A", "A"], "value": [1, 2]}
  275. )
  276. tm.assert_frame_equal(result, expected)
  277. def test_melt_missing_columns_raises(self):
  278. # GH-23575
  279. # This test is to ensure that pandas raises an error if melting is
  280. # attempted with column names absent from the dataframe
  281. # Generate data
  282. df = DataFrame(
  283. np.random.default_rng(2).standard_normal((5, 4)), columns=list("abcd")
  284. )
  285. # Try to melt with missing `value_vars` column name
  286. msg = "The following id_vars or value_vars are not present in the DataFrame:"
  287. with pytest.raises(KeyError, match=msg):
  288. df.melt(["a", "b"], ["C", "d"])
  289. # Try to melt with missing `id_vars` column name
  290. with pytest.raises(KeyError, match=msg):
  291. df.melt(["A", "b"], ["c", "d"])
  292. # Multiple missing
  293. with pytest.raises(
  294. KeyError,
  295. match=msg,
  296. ):
  297. df.melt(["a", "b", "not_here", "or_there"], ["c", "d"])
  298. # Multiindex melt fails if column is missing from multilevel melt
  299. multi = df.copy()
  300. multi.columns = [list("ABCD"), list("abcd")]
  301. with pytest.raises(KeyError, match=msg):
  302. multi.melt([("E", "a")], [("B", "b")])
  303. # Multiindex fails if column is missing from single level melt
  304. with pytest.raises(KeyError, match=msg):
  305. multi.melt(["A"], ["F"], col_level=0)
  306. def test_melt_mixed_int_str_id_vars(self):
  307. # GH 29718
  308. df = DataFrame({0: ["foo"], "a": ["bar"], "b": [1], "d": [2]})
  309. result = melt(df, id_vars=[0, "a"], value_vars=["b", "d"])
  310. expected = DataFrame(
  311. {0: ["foo"] * 2, "a": ["bar"] * 2, "variable": list("bd"), "value": [1, 2]}
  312. )
  313. # the df's columns are mixed type and thus object -> preserves object dtype
  314. expected["variable"] = expected["variable"].astype(object)
  315. tm.assert_frame_equal(result, expected)
  316. def test_melt_mixed_int_str_value_vars(self):
  317. # GH 29718
  318. df = DataFrame({0: ["foo"], "a": ["bar"]})
  319. result = melt(df, value_vars=[0, "a"])
  320. expected = DataFrame({"variable": [0, "a"], "value": ["foo", "bar"]})
  321. tm.assert_frame_equal(result, expected)
  322. def test_ignore_index(self):
  323. # GH 17440
  324. df = DataFrame({"foo": [0], "bar": [1]}, index=["first"])
  325. result = melt(df, ignore_index=False)
  326. expected = DataFrame(
  327. {"variable": ["foo", "bar"], "value": [0, 1]}, index=["first", "first"]
  328. )
  329. tm.assert_frame_equal(result, expected)
  330. def test_ignore_multiindex(self):
  331. # GH 17440
  332. index = pd.MultiIndex.from_tuples(
  333. [("first", "second"), ("first", "third")], names=["baz", "foobar"]
  334. )
  335. df = DataFrame({"foo": [0, 1], "bar": [2, 3]}, index=index)
  336. result = melt(df, ignore_index=False)
  337. expected_index = pd.MultiIndex.from_tuples(
  338. [("first", "second"), ("first", "third")] * 2, names=["baz", "foobar"]
  339. )
  340. expected = DataFrame(
  341. {"variable": ["foo"] * 2 + ["bar"] * 2, "value": [0, 1, 2, 3]},
  342. index=expected_index,
  343. )
  344. tm.assert_frame_equal(result, expected)
  345. def test_ignore_index_name_and_type(self):
  346. # GH 17440
  347. index = Index(["foo", "bar"], dtype="category", name="baz")
  348. df = DataFrame({"x": [0, 1], "y": [2, 3]}, index=index)
  349. result = melt(df, ignore_index=False)
  350. expected_index = Index(["foo", "bar"] * 2, dtype="category", name="baz")
  351. expected = DataFrame(
  352. {"variable": ["x", "x", "y", "y"], "value": [0, 1, 2, 3]},
  353. index=expected_index,
  354. )
  355. tm.assert_frame_equal(result, expected)
  356. def test_melt_with_duplicate_columns(self):
  357. # GH#41951
  358. df = DataFrame([["id", 2, 3]], columns=["a", "b", "b"])
  359. result = df.melt(id_vars=["a"], value_vars=["b"])
  360. expected = DataFrame(
  361. [["id", "b", 2], ["id", "b", 3]], columns=["a", "variable", "value"]
  362. )
  363. tm.assert_frame_equal(result, expected)
  364. @pytest.mark.parametrize("dtype", ["Int8", "Int64"])
  365. def test_melt_ea_dtype(self, dtype):
  366. # GH#41570
  367. df = DataFrame(
  368. {
  369. "a": pd.Series([1, 2], dtype="Int8"),
  370. "b": pd.Series([3, 4], dtype=dtype),
  371. }
  372. )
  373. result = df.melt()
  374. expected = DataFrame(
  375. {
  376. "variable": ["a", "a", "b", "b"],
  377. "value": pd.Series([1, 2, 3, 4], dtype=dtype),
  378. }
  379. )
  380. tm.assert_frame_equal(result, expected)
  381. def test_melt_ea_columns(self):
  382. # GH 54297
  383. df = DataFrame(
  384. {
  385. "A": {0: "a", 1: "b", 2: "c"},
  386. "B": {0: 1, 1: 3, 2: 5},
  387. "C": {0: 2, 1: 4, 2: 6},
  388. }
  389. )
  390. df.columns = df.columns.astype("string[python]")
  391. result = df.melt(id_vars=["A"], value_vars=["B"])
  392. expected = DataFrame(
  393. {
  394. "A": list("abc"),
  395. "variable": pd.Series(["B"] * 3, dtype="string[python]"),
  396. "value": [1, 3, 5],
  397. }
  398. )
  399. tm.assert_frame_equal(result, expected)
  400. def test_melt_preserves_datetime(self):
  401. df = DataFrame(
  402. data=[
  403. {
  404. "type": "A0",
  405. "start_date": pd.Timestamp("2023/03/01", tz="Asia/Tokyo"),
  406. "end_date": pd.Timestamp("2023/03/10", tz="Asia/Tokyo"),
  407. },
  408. {
  409. "type": "A1",
  410. "start_date": pd.Timestamp("2023/03/01", tz="Asia/Tokyo"),
  411. "end_date": pd.Timestamp("2023/03/11", tz="Asia/Tokyo"),
  412. },
  413. ],
  414. index=["aaaa", "bbbb"],
  415. )
  416. result = df.melt(
  417. id_vars=["type"],
  418. value_vars=["start_date", "end_date"],
  419. var_name="start/end",
  420. value_name="date",
  421. )
  422. expected = DataFrame(
  423. {
  424. "type": {0: "A0", 1: "A1", 2: "A0", 3: "A1"},
  425. "start/end": {
  426. 0: "start_date",
  427. 1: "start_date",
  428. 2: "end_date",
  429. 3: "end_date",
  430. },
  431. "date": {
  432. 0: pd.Timestamp("2023-03-01 00:00:00+0900", tz="Asia/Tokyo"),
  433. 1: pd.Timestamp("2023-03-01 00:00:00+0900", tz="Asia/Tokyo"),
  434. 2: pd.Timestamp("2023-03-10 00:00:00+0900", tz="Asia/Tokyo"),
  435. 3: pd.Timestamp("2023-03-11 00:00:00+0900", tz="Asia/Tokyo"),
  436. },
  437. }
  438. )
  439. tm.assert_frame_equal(result, expected)
  440. def test_melt_allows_non_scalar_id_vars(self):
  441. df = DataFrame(
  442. data={"a": [1, 2, 3], "b": [4, 5, 6]},
  443. index=["11", "22", "33"],
  444. )
  445. result = df.melt(
  446. id_vars="a",
  447. var_name=0,
  448. value_name=1,
  449. )
  450. expected = DataFrame({"a": [1, 2, 3], 0: ["b"] * 3, 1: [4, 5, 6]})
  451. tm.assert_frame_equal(result, expected)
  452. def test_melt_allows_non_string_var_name(self):
  453. df = DataFrame(
  454. data={"a": [1, 2, 3], "b": [4, 5, 6]},
  455. index=["11", "22", "33"],
  456. )
  457. result = df.melt(
  458. id_vars=["a"],
  459. var_name=0,
  460. value_name=1,
  461. )
  462. expected = DataFrame({"a": [1, 2, 3], 0: ["b"] * 3, 1: [4, 5, 6]})
  463. tm.assert_frame_equal(result, expected)
  464. def test_melt_non_scalar_var_name_raises(self):
  465. df = DataFrame(
  466. data={"a": [1, 2, 3], "b": [4, 5, 6]},
  467. index=["11", "22", "33"],
  468. )
  469. with pytest.raises(ValueError, match=r".* must be a scalar."):
  470. df.melt(id_vars=["a"], var_name=[1, 2])
  471. class TestLreshape:
  472. def test_pairs(self):
  473. data = {
  474. "birthdt": [
  475. "08jan2009",
  476. "20dec2008",
  477. "30dec2008",
  478. "21dec2008",
  479. "11jan2009",
  480. ],
  481. "birthwt": [1766, 3301, 1454, 3139, 4133],
  482. "id": [101, 102, 103, 104, 105],
  483. "sex": ["Male", "Female", "Female", "Female", "Female"],
  484. "visitdt1": [
  485. "11jan2009",
  486. "22dec2008",
  487. "04jan2009",
  488. "29dec2008",
  489. "20jan2009",
  490. ],
  491. "visitdt2": ["21jan2009", np.nan, "22jan2009", "31dec2008", "03feb2009"],
  492. "visitdt3": ["05feb2009", np.nan, np.nan, "02jan2009", "15feb2009"],
  493. "wt1": [1823, 3338, 1549, 3298, 4306],
  494. "wt2": [2011.0, np.nan, 1892.0, 3338.0, 4575.0],
  495. "wt3": [2293.0, np.nan, np.nan, 3377.0, 4805.0],
  496. }
  497. df = DataFrame(data)
  498. spec = {
  499. "visitdt": [f"visitdt{i:d}" for i in range(1, 4)],
  500. "wt": [f"wt{i:d}" for i in range(1, 4)],
  501. }
  502. result = lreshape(df, spec)
  503. exp_data = {
  504. "birthdt": [
  505. "08jan2009",
  506. "20dec2008",
  507. "30dec2008",
  508. "21dec2008",
  509. "11jan2009",
  510. "08jan2009",
  511. "30dec2008",
  512. "21dec2008",
  513. "11jan2009",
  514. "08jan2009",
  515. "21dec2008",
  516. "11jan2009",
  517. ],
  518. "birthwt": [
  519. 1766,
  520. 3301,
  521. 1454,
  522. 3139,
  523. 4133,
  524. 1766,
  525. 1454,
  526. 3139,
  527. 4133,
  528. 1766,
  529. 3139,
  530. 4133,
  531. ],
  532. "id": [101, 102, 103, 104, 105, 101, 103, 104, 105, 101, 104, 105],
  533. "sex": [
  534. "Male",
  535. "Female",
  536. "Female",
  537. "Female",
  538. "Female",
  539. "Male",
  540. "Female",
  541. "Female",
  542. "Female",
  543. "Male",
  544. "Female",
  545. "Female",
  546. ],
  547. "visitdt": [
  548. "11jan2009",
  549. "22dec2008",
  550. "04jan2009",
  551. "29dec2008",
  552. "20jan2009",
  553. "21jan2009",
  554. "22jan2009",
  555. "31dec2008",
  556. "03feb2009",
  557. "05feb2009",
  558. "02jan2009",
  559. "15feb2009",
  560. ],
  561. "wt": [
  562. 1823.0,
  563. 3338.0,
  564. 1549.0,
  565. 3298.0,
  566. 4306.0,
  567. 2011.0,
  568. 1892.0,
  569. 3338.0,
  570. 4575.0,
  571. 2293.0,
  572. 3377.0,
  573. 4805.0,
  574. ],
  575. }
  576. exp = DataFrame(exp_data, columns=result.columns)
  577. tm.assert_frame_equal(result, exp)
  578. result = lreshape(df, spec, dropna=False)
  579. exp_data = {
  580. "birthdt": [
  581. "08jan2009",
  582. "20dec2008",
  583. "30dec2008",
  584. "21dec2008",
  585. "11jan2009",
  586. "08jan2009",
  587. "20dec2008",
  588. "30dec2008",
  589. "21dec2008",
  590. "11jan2009",
  591. "08jan2009",
  592. "20dec2008",
  593. "30dec2008",
  594. "21dec2008",
  595. "11jan2009",
  596. ],
  597. "birthwt": [
  598. 1766,
  599. 3301,
  600. 1454,
  601. 3139,
  602. 4133,
  603. 1766,
  604. 3301,
  605. 1454,
  606. 3139,
  607. 4133,
  608. 1766,
  609. 3301,
  610. 1454,
  611. 3139,
  612. 4133,
  613. ],
  614. "id": [
  615. 101,
  616. 102,
  617. 103,
  618. 104,
  619. 105,
  620. 101,
  621. 102,
  622. 103,
  623. 104,
  624. 105,
  625. 101,
  626. 102,
  627. 103,
  628. 104,
  629. 105,
  630. ],
  631. "sex": [
  632. "Male",
  633. "Female",
  634. "Female",
  635. "Female",
  636. "Female",
  637. "Male",
  638. "Female",
  639. "Female",
  640. "Female",
  641. "Female",
  642. "Male",
  643. "Female",
  644. "Female",
  645. "Female",
  646. "Female",
  647. ],
  648. "visitdt": [
  649. "11jan2009",
  650. "22dec2008",
  651. "04jan2009",
  652. "29dec2008",
  653. "20jan2009",
  654. "21jan2009",
  655. np.nan,
  656. "22jan2009",
  657. "31dec2008",
  658. "03feb2009",
  659. "05feb2009",
  660. np.nan,
  661. np.nan,
  662. "02jan2009",
  663. "15feb2009",
  664. ],
  665. "wt": [
  666. 1823.0,
  667. 3338.0,
  668. 1549.0,
  669. 3298.0,
  670. 4306.0,
  671. 2011.0,
  672. np.nan,
  673. 1892.0,
  674. 3338.0,
  675. 4575.0,
  676. 2293.0,
  677. np.nan,
  678. np.nan,
  679. 3377.0,
  680. 4805.0,
  681. ],
  682. }
  683. exp = DataFrame(exp_data, columns=result.columns)
  684. tm.assert_frame_equal(result, exp)
  685. spec = {
  686. "visitdt": [f"visitdt{i:d}" for i in range(1, 3)],
  687. "wt": [f"wt{i:d}" for i in range(1, 4)],
  688. }
  689. msg = "All column lists must be same length"
  690. with pytest.raises(ValueError, match=msg):
  691. lreshape(df, spec)
  692. class TestWideToLong:
  693. def test_simple(self):
  694. x = np.random.default_rng(2).standard_normal(3)
  695. df = DataFrame(
  696. {
  697. "A1970": {0: "a", 1: "b", 2: "c"},
  698. "A1980": {0: "d", 1: "e", 2: "f"},
  699. "B1970": {0: 2.5, 1: 1.2, 2: 0.7},
  700. "B1980": {0: 3.2, 1: 1.3, 2: 0.1},
  701. "X": dict(zip(range(3), x)),
  702. }
  703. )
  704. df["id"] = df.index
  705. exp_data = {
  706. "X": x.tolist() + x.tolist(),
  707. "A": ["a", "b", "c", "d", "e", "f"],
  708. "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
  709. "year": [1970, 1970, 1970, 1980, 1980, 1980],
  710. "id": [0, 1, 2, 0, 1, 2],
  711. }
  712. expected = DataFrame(exp_data)
  713. expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
  714. result = wide_to_long(df, ["A", "B"], i="id", j="year")
  715. tm.assert_frame_equal(result, expected)
  716. def test_stubs(self):
  717. # GH9204 wide_to_long call should not modify 'stubs' list
  718. df = DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]])
  719. df.columns = ["id", "inc1", "inc2", "edu1", "edu2"]
  720. stubs = ["inc", "edu"]
  721. wide_to_long(df, stubs, i="id", j="age")
  722. assert stubs == ["inc", "edu"]
  723. def test_separating_character(self):
  724. # GH14779
  725. x = np.random.default_rng(2).standard_normal(3)
  726. df = DataFrame(
  727. {
  728. "A.1970": {0: "a", 1: "b", 2: "c"},
  729. "A.1980": {0: "d", 1: "e", 2: "f"},
  730. "B.1970": {0: 2.5, 1: 1.2, 2: 0.7},
  731. "B.1980": {0: 3.2, 1: 1.3, 2: 0.1},
  732. "X": dict(zip(range(3), x)),
  733. }
  734. )
  735. df["id"] = df.index
  736. exp_data = {
  737. "X": x.tolist() + x.tolist(),
  738. "A": ["a", "b", "c", "d", "e", "f"],
  739. "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
  740. "year": [1970, 1970, 1970, 1980, 1980, 1980],
  741. "id": [0, 1, 2, 0, 1, 2],
  742. }
  743. expected = DataFrame(exp_data)
  744. expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
  745. result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".")
  746. tm.assert_frame_equal(result, expected)
  747. def test_escapable_characters(self):
  748. x = np.random.default_rng(2).standard_normal(3)
  749. df = DataFrame(
  750. {
  751. "A(quarterly)1970": {0: "a", 1: "b", 2: "c"},
  752. "A(quarterly)1980": {0: "d", 1: "e", 2: "f"},
  753. "B(quarterly)1970": {0: 2.5, 1: 1.2, 2: 0.7},
  754. "B(quarterly)1980": {0: 3.2, 1: 1.3, 2: 0.1},
  755. "X": dict(zip(range(3), x)),
  756. }
  757. )
  758. df["id"] = df.index
  759. exp_data = {
  760. "X": x.tolist() + x.tolist(),
  761. "A(quarterly)": ["a", "b", "c", "d", "e", "f"],
  762. "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
  763. "year": [1970, 1970, 1970, 1980, 1980, 1980],
  764. "id": [0, 1, 2, 0, 1, 2],
  765. }
  766. expected = DataFrame(exp_data)
  767. expected = expected.set_index(["id", "year"])[
  768. ["X", "A(quarterly)", "B(quarterly)"]
  769. ]
  770. result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"], i="id", j="year")
  771. tm.assert_frame_equal(result, expected)
  772. def test_unbalanced(self):
  773. # test that we can have a varying amount of time variables
  774. df = DataFrame(
  775. {
  776. "A2010": [1.0, 2.0],
  777. "A2011": [3.0, 4.0],
  778. "B2010": [5.0, 6.0],
  779. "X": ["X1", "X2"],
  780. }
  781. )
  782. df["id"] = df.index
  783. exp_data = {
  784. "X": ["X1", "X2", "X1", "X2"],
  785. "A": [1.0, 2.0, 3.0, 4.0],
  786. "B": [5.0, 6.0, np.nan, np.nan],
  787. "id": [0, 1, 0, 1],
  788. "year": [2010, 2010, 2011, 2011],
  789. }
  790. expected = DataFrame(exp_data)
  791. expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
  792. result = wide_to_long(df, ["A", "B"], i="id", j="year")
  793. tm.assert_frame_equal(result, expected)
  794. def test_character_overlap(self):
  795. # Test we handle overlapping characters in both id_vars and value_vars
  796. df = DataFrame(
  797. {
  798. "A11": ["a11", "a22", "a33"],
  799. "A12": ["a21", "a22", "a23"],
  800. "B11": ["b11", "b12", "b13"],
  801. "B12": ["b21", "b22", "b23"],
  802. "BB11": [1, 2, 3],
  803. "BB12": [4, 5, 6],
  804. "BBBX": [91, 92, 93],
  805. "BBBZ": [91, 92, 93],
  806. }
  807. )
  808. df["id"] = df.index
  809. expected = DataFrame(
  810. {
  811. "BBBX": [91, 92, 93, 91, 92, 93],
  812. "BBBZ": [91, 92, 93, 91, 92, 93],
  813. "A": ["a11", "a22", "a33", "a21", "a22", "a23"],
  814. "B": ["b11", "b12", "b13", "b21", "b22", "b23"],
  815. "BB": [1, 2, 3, 4, 5, 6],
  816. "id": [0, 1, 2, 0, 1, 2],
  817. "year": [11, 11, 11, 12, 12, 12],
  818. }
  819. )
  820. expected = expected.set_index(["id", "year"])[["BBBX", "BBBZ", "A", "B", "BB"]]
  821. result = wide_to_long(df, ["A", "B", "BB"], i="id", j="year")
  822. tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
  823. def test_invalid_separator(self):
  824. # if an invalid separator is supplied a empty data frame is returned
  825. sep = "nope!"
  826. df = DataFrame(
  827. {
  828. "A2010": [1.0, 2.0],
  829. "A2011": [3.0, 4.0],
  830. "B2010": [5.0, 6.0],
  831. "X": ["X1", "X2"],
  832. }
  833. )
  834. df["id"] = df.index
  835. exp_data = {
  836. "X": "",
  837. "A2010": [],
  838. "A2011": [],
  839. "B2010": [],
  840. "id": [],
  841. "year": [],
  842. "A": [],
  843. "B": [],
  844. }
  845. expected = DataFrame(exp_data).astype({"year": np.int64})
  846. expected = expected.set_index(["id", "year"])[
  847. ["X", "A2010", "A2011", "B2010", "A", "B"]
  848. ]
  849. expected.index = expected.index.set_levels([0, 1], level=0)
  850. result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=sep)
  851. tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
  852. def test_num_string_disambiguation(self):
  853. # Test that we can disambiguate number value_vars from
  854. # string value_vars
  855. df = DataFrame(
  856. {
  857. "A11": ["a11", "a22", "a33"],
  858. "A12": ["a21", "a22", "a23"],
  859. "B11": ["b11", "b12", "b13"],
  860. "B12": ["b21", "b22", "b23"],
  861. "BB11": [1, 2, 3],
  862. "BB12": [4, 5, 6],
  863. "Arating": [91, 92, 93],
  864. "Arating_old": [91, 92, 93],
  865. }
  866. )
  867. df["id"] = df.index
  868. expected = DataFrame(
  869. {
  870. "Arating": [91, 92, 93, 91, 92, 93],
  871. "Arating_old": [91, 92, 93, 91, 92, 93],
  872. "A": ["a11", "a22", "a33", "a21", "a22", "a23"],
  873. "B": ["b11", "b12", "b13", "b21", "b22", "b23"],
  874. "BB": [1, 2, 3, 4, 5, 6],
  875. "id": [0, 1, 2, 0, 1, 2],
  876. "year": [11, 11, 11, 12, 12, 12],
  877. }
  878. )
  879. expected = expected.set_index(["id", "year"])[
  880. ["Arating", "Arating_old", "A", "B", "BB"]
  881. ]
  882. result = wide_to_long(df, ["A", "B", "BB"], i="id", j="year")
  883. tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
  884. def test_invalid_suffixtype(self):
  885. # If all stubs names end with a string, but a numeric suffix is
  886. # assumed, an empty data frame is returned
  887. df = DataFrame(
  888. {
  889. "Aone": [1.0, 2.0],
  890. "Atwo": [3.0, 4.0],
  891. "Bone": [5.0, 6.0],
  892. "X": ["X1", "X2"],
  893. }
  894. )
  895. df["id"] = df.index
  896. exp_data = {
  897. "X": "",
  898. "Aone": [],
  899. "Atwo": [],
  900. "Bone": [],
  901. "id": [],
  902. "year": [],
  903. "A": [],
  904. "B": [],
  905. }
  906. expected = DataFrame(exp_data).astype({"year": np.int64})
  907. expected = expected.set_index(["id", "year"])
  908. expected.index = expected.index.set_levels([0, 1], level=0)
  909. result = wide_to_long(df, ["A", "B"], i="id", j="year")
  910. tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
  911. def test_multiple_id_columns(self):
  912. # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm
  913. df = DataFrame(
  914. {
  915. "famid": [1, 1, 1, 2, 2, 2, 3, 3, 3],
  916. "birth": [1, 2, 3, 1, 2, 3, 1, 2, 3],
  917. "ht1": [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
  918. "ht2": [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9],
  919. }
  920. )
  921. expected = DataFrame(
  922. {
  923. "ht": [
  924. 2.8,
  925. 3.4,
  926. 2.9,
  927. 3.8,
  928. 2.2,
  929. 2.9,
  930. 2.0,
  931. 3.2,
  932. 1.8,
  933. 2.8,
  934. 1.9,
  935. 2.4,
  936. 2.2,
  937. 3.3,
  938. 2.3,
  939. 3.4,
  940. 2.1,
  941. 2.9,
  942. ],
  943. "famid": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3],
  944. "birth": [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3],
  945. "age": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
  946. }
  947. )
  948. expected = expected.set_index(["famid", "birth", "age"])[["ht"]]
  949. result = wide_to_long(df, "ht", i=["famid", "birth"], j="age")
  950. tm.assert_frame_equal(result, expected)
  951. def test_non_unique_idvars(self):
  952. # GH16382
  953. # Raise an error message if non unique id vars (i) are passed
  954. df = DataFrame(
  955. {"A_A1": [1, 2, 3, 4, 5], "B_B1": [1, 2, 3, 4, 5], "x": [1, 1, 1, 1, 1]}
  956. )
  957. msg = "the id variables need to uniquely identify each row"
  958. with pytest.raises(ValueError, match=msg):
  959. wide_to_long(df, ["A_A", "B_B"], i="x", j="colname")
  960. def test_cast_j_int(self):
  961. df = DataFrame(
  962. {
  963. "actor_1": ["CCH Pounder", "Johnny Depp", "Christoph Waltz"],
  964. "actor_2": ["Joel David Moore", "Orlando Bloom", "Rory Kinnear"],
  965. "actor_fb_likes_1": [1000.0, 40000.0, 11000.0],
  966. "actor_fb_likes_2": [936.0, 5000.0, 393.0],
  967. "title": ["Avatar", "Pirates of the Caribbean", "Spectre"],
  968. }
  969. )
  970. expected = DataFrame(
  971. {
  972. "actor": [
  973. "CCH Pounder",
  974. "Johnny Depp",
  975. "Christoph Waltz",
  976. "Joel David Moore",
  977. "Orlando Bloom",
  978. "Rory Kinnear",
  979. ],
  980. "actor_fb_likes": [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0],
  981. "num": [1, 1, 1, 2, 2, 2],
  982. "title": [
  983. "Avatar",
  984. "Pirates of the Caribbean",
  985. "Spectre",
  986. "Avatar",
  987. "Pirates of the Caribbean",
  988. "Spectre",
  989. ],
  990. }
  991. ).set_index(["title", "num"])
  992. result = wide_to_long(
  993. df, ["actor", "actor_fb_likes"], i="title", j="num", sep="_"
  994. )
  995. tm.assert_frame_equal(result, expected)
  996. def test_identical_stubnames(self):
  997. df = DataFrame(
  998. {
  999. "A2010": [1.0, 2.0],
  1000. "A2011": [3.0, 4.0],
  1001. "B2010": [5.0, 6.0],
  1002. "A": ["X1", "X2"],
  1003. }
  1004. )
  1005. msg = "stubname can't be identical to a column name"
  1006. with pytest.raises(ValueError, match=msg):
  1007. wide_to_long(df, ["A", "B"], i="A", j="colname")
  1008. def test_nonnumeric_suffix(self):
  1009. df = DataFrame(
  1010. {
  1011. "treatment_placebo": [1.0, 2.0],
  1012. "treatment_test": [3.0, 4.0],
  1013. "result_placebo": [5.0, 6.0],
  1014. "A": ["X1", "X2"],
  1015. }
  1016. )
  1017. expected = DataFrame(
  1018. {
  1019. "A": ["X1", "X2", "X1", "X2"],
  1020. "colname": ["placebo", "placebo", "test", "test"],
  1021. "result": [5.0, 6.0, np.nan, np.nan],
  1022. "treatment": [1.0, 2.0, 3.0, 4.0],
  1023. }
  1024. )
  1025. expected = expected.set_index(["A", "colname"])
  1026. result = wide_to_long(
  1027. df, ["result", "treatment"], i="A", j="colname", suffix="[a-z]+", sep="_"
  1028. )
  1029. tm.assert_frame_equal(result, expected)
  1030. def test_mixed_type_suffix(self):
  1031. df = DataFrame(
  1032. {
  1033. "A": ["X1", "X2"],
  1034. "result_1": [0, 9],
  1035. "result_foo": [5.0, 6.0],
  1036. "treatment_1": [1.0, 2.0],
  1037. "treatment_foo": [3.0, 4.0],
  1038. }
  1039. )
  1040. expected = DataFrame(
  1041. {
  1042. "A": ["X1", "X2", "X1", "X2"],
  1043. "colname": ["1", "1", "foo", "foo"],
  1044. "result": [0.0, 9.0, 5.0, 6.0],
  1045. "treatment": [1.0, 2.0, 3.0, 4.0],
  1046. }
  1047. ).set_index(["A", "colname"])
  1048. result = wide_to_long(
  1049. df, ["result", "treatment"], i="A", j="colname", suffix=".+", sep="_"
  1050. )
  1051. tm.assert_frame_equal(result, expected)
  1052. def test_float_suffix(self):
  1053. df = DataFrame(
  1054. {
  1055. "treatment_1.1": [1.0, 2.0],
  1056. "treatment_2.1": [3.0, 4.0],
  1057. "result_1.2": [5.0, 6.0],
  1058. "result_1": [0, 9],
  1059. "A": ["X1", "X2"],
  1060. }
  1061. )
  1062. expected = DataFrame(
  1063. {
  1064. "A": ["X1", "X2", "X1", "X2", "X1", "X2", "X1", "X2"],
  1065. "colname": [1.2, 1.2, 1.0, 1.0, 1.1, 1.1, 2.1, 2.1],
  1066. "result": [5.0, 6.0, 0.0, 9.0, np.nan, np.nan, np.nan, np.nan],
  1067. "treatment": [np.nan, np.nan, np.nan, np.nan, 1.0, 2.0, 3.0, 4.0],
  1068. }
  1069. )
  1070. expected = expected.set_index(["A", "colname"])
  1071. result = wide_to_long(
  1072. df, ["result", "treatment"], i="A", j="colname", suffix="[0-9.]+", sep="_"
  1073. )
  1074. tm.assert_frame_equal(result, expected)
  1075. def test_col_substring_of_stubname(self):
  1076. # GH22468
  1077. # Don't raise ValueError when a column name is a substring
  1078. # of a stubname that's been passed as a string
  1079. wide_data = {
  1080. "node_id": {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
  1081. "A": {0: 0.80, 1: 0.0, 2: 0.25, 3: 1.0, 4: 0.81},
  1082. "PA0": {0: 0.74, 1: 0.56, 2: 0.56, 3: 0.98, 4: 0.6},
  1083. "PA1": {0: 0.77, 1: 0.64, 2: 0.52, 3: 0.98, 4: 0.67},
  1084. "PA3": {0: 0.34, 1: 0.70, 2: 0.52, 3: 0.98, 4: 0.67},
  1085. }
  1086. wide_df = DataFrame.from_dict(wide_data)
  1087. expected = wide_to_long(wide_df, stubnames=["PA"], i=["node_id", "A"], j="time")
  1088. result = wide_to_long(wide_df, stubnames="PA", i=["node_id", "A"], j="time")
  1089. tm.assert_frame_equal(result, expected)
  1090. def test_raise_of_column_name_value(self):
  1091. # GH34731, enforced in 2.0
  1092. # raise a ValueError if the resultant value column name matches
  1093. # a name in the dataframe already (default name is "value")
  1094. df = DataFrame({"col": list("ABC"), "value": range(10, 16, 2)})
  1095. with pytest.raises(
  1096. ValueError, match=re.escape("value_name (value) cannot match")
  1097. ):
  1098. df.melt(id_vars="value", value_name="value")
  1099. def test_missing_stubname(self, request, any_string_dtype, using_infer_string):
  1100. if using_infer_string and any_string_dtype == "object":
  1101. # triggers object dtype inference warning of dtype=object
  1102. request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
  1103. # GH46044
  1104. df = DataFrame({"id": ["1", "2"], "a-1": [100, 200], "a-2": [300, 400]})
  1105. df = df.astype({"id": any_string_dtype})
  1106. result = wide_to_long(
  1107. df,
  1108. stubnames=["a", "b"],
  1109. i="id",
  1110. j="num",
  1111. sep="-",
  1112. )
  1113. index = Index(
  1114. [("1", 1), ("2", 1), ("1", 2), ("2", 2)],
  1115. name=("id", "num"),
  1116. )
  1117. expected = DataFrame(
  1118. {"a": [100, 200, 300, 400], "b": [np.nan] * 4},
  1119. index=index,
  1120. )
  1121. new_level = expected.index.levels[0].astype(any_string_dtype)
  1122. if any_string_dtype == "object":
  1123. new_level = expected.index.levels[0].astype("str")
  1124. expected.index = expected.index.set_levels(new_level, level=0)
  1125. tm.assert_frame_equal(result, expected)
  1126. def test_wide_to_long_string_columns(string_storage):
  1127. # GH 57066
  1128. string_dtype = pd.StringDtype(string_storage, na_value=np.nan)
  1129. df = DataFrame(
  1130. {
  1131. "ID": {0: 1},
  1132. "R_test1": {0: 1},
  1133. "R_test2": {0: 1},
  1134. "R_test3": {0: 2},
  1135. "D": {0: 1},
  1136. }
  1137. )
  1138. df.columns = df.columns.astype(string_dtype)
  1139. result = wide_to_long(
  1140. df, stubnames="R", i="ID", j="UNPIVOTED", sep="_", suffix=".*"
  1141. )
  1142. expected = DataFrame(
  1143. [[1, 1], [1, 1], [1, 2]],
  1144. columns=Index(["D", "R"]),
  1145. index=pd.MultiIndex.from_arrays(
  1146. [
  1147. [1, 1, 1],
  1148. Index(["test1", "test2", "test3"], dtype=string_dtype),
  1149. ],
  1150. names=["ID", "UNPIVOTED"],
  1151. ),
  1152. )
  1153. tm.assert_frame_equal(result, expected)