# test_concat.py
from collections import (
    abc,
    deque,
)
from collections.abc import Iterator
from datetime import datetime
from decimal import Decimal

import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.errors import InvalidIndexError
import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    PeriodIndex,
    Series,
    concat,
    date_range,
)
import pandas._testing as tm
from pandas.core.arrays import SparseArray
from pandas.tests.extension.decimal import to_decimal


  26. class TestConcatenate:
  27. def test_append_concat(self):
  28. # GH#1815
  29. d1 = date_range("12/31/1990", "12/31/1999", freq="YE-DEC")
  30. d2 = date_range("12/31/2000", "12/31/2009", freq="YE-DEC")
  31. s1 = Series(np.random.default_rng(2).standard_normal(10), d1)
  32. s2 = Series(np.random.default_rng(2).standard_normal(10), d2)
  33. s1 = s1.to_period()
  34. s2 = s2.to_period()
  35. # drops index
  36. result = concat([s1, s2])
  37. assert isinstance(result.index, PeriodIndex)
  38. assert result.index[0] == s1.index[0]
  39. # test is not written to work with string dtype (checks .base)
  40. @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
  41. def test_concat_copy(self, using_array_manager, using_copy_on_write):
  42. df = DataFrame(np.random.default_rng(2).standard_normal((4, 3)))
  43. df2 = DataFrame(np.random.default_rng(2).integers(0, 10, size=4).reshape(4, 1))
  44. df3 = DataFrame({5: "foo"}, index=range(4))
  45. # These are actual copies.
  46. result = concat([df, df2, df3], axis=1, copy=True)
  47. if not using_copy_on_write:
  48. for arr in result._mgr.arrays:
  49. assert not any(
  50. np.shares_memory(arr, y)
  51. for x in [df, df2, df3]
  52. for y in x._mgr.arrays
  53. )
  54. else:
  55. for arr in result._mgr.arrays:
  56. assert arr.base is not None
  57. # These are the same.
  58. result = concat([df, df2, df3], axis=1, copy=False)
  59. for arr in result._mgr.arrays:
  60. if arr.dtype.kind == "f":
  61. assert arr.base is df._mgr.arrays[0].base
  62. elif arr.dtype.kind in ["i", "u"]:
  63. assert arr.base is df2._mgr.arrays[0].base
  64. elif arr.dtype == object:
  65. if using_array_manager:
  66. # we get the same array object, which has no base
  67. assert arr is df3._mgr.arrays[0]
  68. else:
  69. assert arr.base is not None
  70. assert arr.base is not None
  71. # Float block was consolidated.
  72. df4 = DataFrame(np.random.default_rng(2).standard_normal((4, 1)))
  73. result = concat([df, df2, df3, df4], axis=1, copy=False)
  74. for arr in result._mgr.arrays:
  75. if arr.dtype.kind == "f":
  76. if using_array_manager or using_copy_on_write:
  77. # this is a view on some array in either df or df4
  78. assert any(
  79. np.shares_memory(arr, other)
  80. for other in df._mgr.arrays + df4._mgr.arrays
  81. )
  82. else:
  83. # the block was consolidated, so we got a copy anyway
  84. assert arr.base is None
  85. elif arr.dtype.kind in ["i", "u"]:
  86. assert arr.base is df2._mgr.arrays[0].base
  87. elif arr.dtype == object:
  88. # this is a view on df3
  89. assert any(np.shares_memory(arr, other) for other in df3._mgr.arrays)
  90. def test_concat_with_group_keys(self):
  91. # axis=0
  92. df = DataFrame(np.random.default_rng(2).standard_normal((3, 4)))
  93. df2 = DataFrame(np.random.default_rng(2).standard_normal((4, 4)))
  94. result = concat([df, df2], keys=[0, 1])
  95. exp_index = MultiIndex.from_arrays(
  96. [[0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 0, 1, 2, 3]]
  97. )
  98. expected = DataFrame(np.r_[df.values, df2.values], index=exp_index)
  99. tm.assert_frame_equal(result, expected)
  100. result = concat([df, df], keys=[0, 1])
  101. exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
  102. expected = DataFrame(np.r_[df.values, df.values], index=exp_index2)
  103. tm.assert_frame_equal(result, expected)
  104. # axis=1
  105. df = DataFrame(np.random.default_rng(2).standard_normal((4, 3)))
  106. df2 = DataFrame(np.random.default_rng(2).standard_normal((4, 4)))
  107. result = concat([df, df2], keys=[0, 1], axis=1)
  108. expected = DataFrame(np.c_[df.values, df2.values], columns=exp_index)
  109. tm.assert_frame_equal(result, expected)
  110. result = concat([df, df], keys=[0, 1], axis=1)
  111. expected = DataFrame(np.c_[df.values, df.values], columns=exp_index2)
  112. tm.assert_frame_equal(result, expected)
  113. def test_concat_keys_specific_levels(self):
  114. df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
  115. pieces = [df.iloc[:, [0, 1]], df.iloc[:, [2]], df.iloc[:, [3]]]
  116. level = ["three", "two", "one", "zero"]
  117. result = concat(
  118. pieces,
  119. axis=1,
  120. keys=["one", "two", "three"],
  121. levels=[level],
  122. names=["group_key"],
  123. )
  124. tm.assert_index_equal(result.columns.levels[0], Index(level, name="group_key"))
  125. tm.assert_index_equal(result.columns.levels[1], Index([0, 1, 2, 3]))
  126. assert result.columns.names == ["group_key", None]
  127. @pytest.mark.parametrize("mapping", ["mapping", "dict"])
  128. def test_concat_mapping(self, mapping, non_dict_mapping_subclass):
  129. constructor = dict if mapping == "dict" else non_dict_mapping_subclass
  130. frames = constructor(
  131. {
  132. "foo": DataFrame(np.random.default_rng(2).standard_normal((4, 3))),
  133. "bar": DataFrame(np.random.default_rng(2).standard_normal((4, 3))),
  134. "baz": DataFrame(np.random.default_rng(2).standard_normal((4, 3))),
  135. "qux": DataFrame(np.random.default_rng(2).standard_normal((4, 3))),
  136. }
  137. )
  138. sorted_keys = list(frames.keys())
  139. result = concat(frames)
  140. expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys)
  141. tm.assert_frame_equal(result, expected)
  142. result = concat(frames, axis=1)
  143. expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys, axis=1)
  144. tm.assert_frame_equal(result, expected)
  145. keys = ["baz", "foo", "bar"]
  146. result = concat(frames, keys=keys)
  147. expected = concat([frames[k] for k in keys], keys=keys)
  148. tm.assert_frame_equal(result, expected)
  149. def test_concat_keys_and_levels(self):
  150. df = DataFrame(np.random.default_rng(2).standard_normal((1, 3)))
  151. df2 = DataFrame(np.random.default_rng(2).standard_normal((1, 4)))
  152. levels = [["foo", "baz"], ["one", "two"]]
  153. names = ["first", "second"]
  154. result = concat(
  155. [df, df2, df, df2],
  156. keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")],
  157. levels=levels,
  158. names=names,
  159. )
  160. expected = concat([df, df2, df, df2])
  161. exp_index = MultiIndex(
  162. levels=levels + [[0]],
  163. codes=[[0, 0, 1, 1], [0, 1, 0, 1], [0, 0, 0, 0]],
  164. names=names + [None],
  165. )
  166. expected.index = exp_index
  167. tm.assert_frame_equal(result, expected)
  168. # no names
  169. result = concat(
  170. [df, df2, df, df2],
  171. keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")],
  172. levels=levels,
  173. )
  174. assert result.index.names == (None,) * 3
  175. # no levels
  176. result = concat(
  177. [df, df2, df, df2],
  178. keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")],
  179. names=["first", "second"],
  180. )
  181. assert result.index.names == ("first", "second", None)
  182. tm.assert_index_equal(
  183. result.index.levels[0], Index(["baz", "foo"], name="first")
  184. )
  185. def test_concat_keys_levels_no_overlap(self):
  186. # GH #1406
  187. df = DataFrame(np.random.default_rng(2).standard_normal((1, 3)), index=["a"])
  188. df2 = DataFrame(np.random.default_rng(2).standard_normal((1, 4)), index=["b"])
  189. msg = "Values not found in passed level"
  190. with pytest.raises(ValueError, match=msg):
  191. concat([df, df], keys=["one", "two"], levels=[["foo", "bar", "baz"]])
  192. msg = "Key one not in level"
  193. with pytest.raises(ValueError, match=msg):
  194. concat([df, df2], keys=["one", "two"], levels=[["foo", "bar", "baz"]])
  195. def test_crossed_dtypes_weird_corner(self):
  196. columns = ["A", "B", "C", "D"]
  197. df1 = DataFrame(
  198. {
  199. "A": np.array([1, 2, 3, 4], dtype="f8"),
  200. "B": np.array([1, 2, 3, 4], dtype="i8"),
  201. "C": np.array([1, 2, 3, 4], dtype="f8"),
  202. "D": np.array([1, 2, 3, 4], dtype="i8"),
  203. },
  204. columns=columns,
  205. )
  206. df2 = DataFrame(
  207. {
  208. "A": np.array([1, 2, 3, 4], dtype="i8"),
  209. "B": np.array([1, 2, 3, 4], dtype="f8"),
  210. "C": np.array([1, 2, 3, 4], dtype="i8"),
  211. "D": np.array([1, 2, 3, 4], dtype="f8"),
  212. },
  213. columns=columns,
  214. )
  215. appended = concat([df1, df2], ignore_index=True)
  216. expected = DataFrame(
  217. np.concatenate([df1.values, df2.values], axis=0), columns=columns
  218. )
  219. tm.assert_frame_equal(appended, expected)
  220. df = DataFrame(np.random.default_rng(2).standard_normal((1, 3)), index=["a"])
  221. df2 = DataFrame(np.random.default_rng(2).standard_normal((1, 4)), index=["b"])
  222. result = concat([df, df2], keys=["one", "two"], names=["first", "second"])
  223. assert result.index.names == ("first", "second")
  224. def test_with_mixed_tuples(self, sort):
  225. # 10697
  226. # columns have mixed tuples, so handle properly
  227. df1 = DataFrame({"A": "foo", ("B", 1): "bar"}, index=range(2))
  228. df2 = DataFrame({"B": "foo", ("B", 1): "bar"}, index=range(2))
  229. # it works
  230. concat([df1, df2], sort=sort)
  231. def test_concat_mixed_objs_columns(self):
  232. # Test column-wise concat for mixed series/frames (axis=1)
  233. # G2385
  234. index = date_range("01-Jan-2013", periods=10, freq="h")
  235. arr = np.arange(10, dtype="int64")
  236. s1 = Series(arr, index=index)
  237. s2 = Series(arr, index=index)
  238. df = DataFrame(arr.reshape(-1, 1), index=index)
  239. expected = DataFrame(
  240. np.repeat(arr, 2).reshape(-1, 2), index=index, columns=[0, 0]
  241. )
  242. result = concat([df, df], axis=1)
  243. tm.assert_frame_equal(result, expected)
  244. expected = DataFrame(
  245. np.repeat(arr, 2).reshape(-1, 2), index=index, columns=[0, 1]
  246. )
  247. result = concat([s1, s2], axis=1)
  248. tm.assert_frame_equal(result, expected)
  249. expected = DataFrame(
  250. np.repeat(arr, 3).reshape(-1, 3), index=index, columns=[0, 1, 2]
  251. )
  252. result = concat([s1, s2, s1], axis=1)
  253. tm.assert_frame_equal(result, expected)
  254. expected = DataFrame(
  255. np.repeat(arr, 5).reshape(-1, 5), index=index, columns=[0, 0, 1, 2, 3]
  256. )
  257. result = concat([s1, df, s2, s2, s1], axis=1)
  258. tm.assert_frame_equal(result, expected)
  259. # with names
  260. s1.name = "foo"
  261. expected = DataFrame(
  262. np.repeat(arr, 3).reshape(-1, 3), index=index, columns=["foo", 0, 0]
  263. )
  264. result = concat([s1, df, s2], axis=1)
  265. tm.assert_frame_equal(result, expected)
  266. s2.name = "bar"
  267. expected = DataFrame(
  268. np.repeat(arr, 3).reshape(-1, 3), index=index, columns=["foo", 0, "bar"]
  269. )
  270. result = concat([s1, df, s2], axis=1)
  271. tm.assert_frame_equal(result, expected)
  272. # ignore index
  273. expected = DataFrame(
  274. np.repeat(arr, 3).reshape(-1, 3), index=index, columns=[0, 1, 2]
  275. )
  276. result = concat([s1, df, s2], axis=1, ignore_index=True)
  277. tm.assert_frame_equal(result, expected)
  278. def test_concat_mixed_objs_index(self):
  279. # Test row-wise concat for mixed series/frames with a common name
  280. # GH2385, GH15047
  281. index = date_range("01-Jan-2013", periods=10, freq="h")
  282. arr = np.arange(10, dtype="int64")
  283. s1 = Series(arr, index=index)
  284. s2 = Series(arr, index=index)
  285. df = DataFrame(arr.reshape(-1, 1), index=index)
  286. expected = DataFrame(
  287. np.tile(arr, 3).reshape(-1, 1), index=index.tolist() * 3, columns=[0]
  288. )
  289. result = concat([s1, df, s2])
  290. tm.assert_frame_equal(result, expected)
  291. def test_concat_mixed_objs_index_names(self):
  292. # Test row-wise concat for mixed series/frames with distinct names
  293. # GH2385, GH15047
  294. index = date_range("01-Jan-2013", periods=10, freq="h")
  295. arr = np.arange(10, dtype="int64")
  296. s1 = Series(arr, index=index, name="foo")
  297. s2 = Series(arr, index=index, name="bar")
  298. df = DataFrame(arr.reshape(-1, 1), index=index)
  299. expected = DataFrame(
  300. np.kron(np.where(np.identity(3) == 1, 1, np.nan), arr).T,
  301. index=index.tolist() * 3,
  302. columns=["foo", 0, "bar"],
  303. )
  304. result = concat([s1, df, s2])
  305. tm.assert_frame_equal(result, expected)
  306. # Rename all series to 0 when ignore_index=True
  307. expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), columns=[0])
  308. result = concat([s1, df, s2], ignore_index=True)
  309. tm.assert_frame_equal(result, expected)
  310. def test_dtype_coercion(self):
  311. # 12411
  312. df = DataFrame({"date": [pd.Timestamp("20130101").tz_localize("UTC"), pd.NaT]})
  313. result = concat([df.iloc[[0]], df.iloc[[1]]])
  314. tm.assert_series_equal(result.dtypes, df.dtypes)
  315. # 12045
  316. df = DataFrame({"date": [datetime(2012, 1, 1), datetime(1012, 1, 2)]})
  317. result = concat([df.iloc[[0]], df.iloc[[1]]])
  318. tm.assert_series_equal(result.dtypes, df.dtypes)
  319. # 11594
  320. df = DataFrame({"text": ["some words"] + [None] * 9})
  321. result = concat([df.iloc[[0]], df.iloc[[1]]])
  322. tm.assert_series_equal(result.dtypes, df.dtypes)
  323. def test_concat_single_with_key(self):
  324. df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
  325. result = concat([df], keys=["foo"])
  326. expected = concat([df, df], keys=["foo", "bar"])
  327. tm.assert_frame_equal(result, expected[:10])
  328. def test_concat_no_items_raises(self):
  329. with pytest.raises(ValueError, match="No objects to concatenate"):
  330. concat([])
  331. def test_concat_exclude_none(self):
  332. df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
  333. pieces = [df[:5], None, None, df[5:]]
  334. result = concat(pieces)
  335. tm.assert_frame_equal(result, df)
  336. with pytest.raises(ValueError, match="All objects passed were None"):
  337. concat([None, None])
  338. def test_concat_keys_with_none(self):
  339. # #1649
  340. df0 = DataFrame([[10, 20, 30], [10, 20, 30], [10, 20, 30]])
  341. result = concat({"a": None, "b": df0, "c": df0[:2], "d": df0[:1], "e": df0})
  342. expected = concat({"b": df0, "c": df0[:2], "d": df0[:1], "e": df0})
  343. tm.assert_frame_equal(result, expected)
  344. result = concat(
  345. [None, df0, df0[:2], df0[:1], df0], keys=["a", "b", "c", "d", "e"]
  346. )
  347. expected = concat([df0, df0[:2], df0[:1], df0], keys=["b", "c", "d", "e"])
  348. tm.assert_frame_equal(result, expected)
  349. def test_concat_bug_1719(self):
  350. ts1 = Series(
  351. np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
  352. )
  353. ts2 = ts1.copy()[::2]
  354. # to join with union
  355. # these two are of different length!
  356. left = concat([ts1, ts2], join="outer", axis=1)
  357. right = concat([ts2, ts1], join="outer", axis=1)
  358. assert len(left) == len(right)
  359. def test_concat_bug_2972(self):
  360. ts0 = Series(np.zeros(5))
  361. ts1 = Series(np.ones(5))
  362. ts0.name = ts1.name = "same name"
  363. result = concat([ts0, ts1], axis=1)
  364. expected = DataFrame({0: ts0, 1: ts1})
  365. expected.columns = ["same name", "same name"]
  366. tm.assert_frame_equal(result, expected)
  367. def test_concat_bug_3602(self):
  368. # GH 3602, duplicate columns
  369. df1 = DataFrame(
  370. {
  371. "firmNo": [0, 0, 0, 0],
  372. "prc": [6, 6, 6, 6],
  373. "stringvar": ["rrr", "rrr", "rrr", "rrr"],
  374. }
  375. )
  376. df2 = DataFrame(
  377. {"C": [9, 10, 11, 12], "misc": [1, 2, 3, 4], "prc": [6, 6, 6, 6]}
  378. )
  379. expected = DataFrame(
  380. [
  381. [0, 6, "rrr", 9, 1, 6],
  382. [0, 6, "rrr", 10, 2, 6],
  383. [0, 6, "rrr", 11, 3, 6],
  384. [0, 6, "rrr", 12, 4, 6],
  385. ]
  386. )
  387. expected.columns = ["firmNo", "prc", "stringvar", "C", "misc", "prc"]
  388. result = concat([df1, df2], axis=1)
  389. tm.assert_frame_equal(result, expected)
  390. def test_concat_iterables(self):
  391. # GH8645 check concat works with tuples, list, generators, and weird
  392. # stuff like deque and custom iterables
  393. df1 = DataFrame([1, 2, 3])
  394. df2 = DataFrame([4, 5, 6])
  395. expected = DataFrame([1, 2, 3, 4, 5, 6])
  396. tm.assert_frame_equal(concat((df1, df2), ignore_index=True), expected)
  397. tm.assert_frame_equal(concat([df1, df2], ignore_index=True), expected)
  398. tm.assert_frame_equal(
  399. concat((df for df in (df1, df2)), ignore_index=True), expected
  400. )
  401. tm.assert_frame_equal(concat(deque((df1, df2)), ignore_index=True), expected)
  402. class CustomIterator1:
  403. def __len__(self) -> int:
  404. return 2
  405. def __getitem__(self, index):
  406. try:
  407. return {0: df1, 1: df2}[index]
  408. except KeyError as err:
  409. raise IndexError from err
  410. tm.assert_frame_equal(concat(CustomIterator1(), ignore_index=True), expected)
  411. class CustomIterator2(abc.Iterable):
  412. def __iter__(self) -> Iterator:
  413. yield df1
  414. yield df2
  415. tm.assert_frame_equal(concat(CustomIterator2(), ignore_index=True), expected)
  416. def test_concat_order(self):
  417. # GH 17344, GH#47331
  418. dfs = [DataFrame(index=range(3), columns=["a", 1, None])]
  419. dfs += [DataFrame(index=range(3), columns=[None, 1, "a"]) for _ in range(100)]
  420. result = concat(dfs, sort=True).columns
  421. expected = Index([1, "a", None])
  422. tm.assert_index_equal(result, expected)
  423. def test_concat_different_extension_dtypes_upcasts(self):
  424. a = Series(pd.array([1, 2], dtype="Int64"))
  425. b = Series(to_decimal([1, 2]))
  426. result = concat([a, b], ignore_index=True)
  427. expected = Series([1, 2, Decimal(1), Decimal(2)], dtype=object)
  428. tm.assert_series_equal(result, expected)
  429. def test_concat_ordered_dict(self):
  430. # GH 21510
  431. expected = concat(
  432. [Series(range(3)), Series(range(4))], keys=["First", "Another"]
  433. )
  434. result = concat({"First": Series(range(3)), "Another": Series(range(4))})
  435. tm.assert_series_equal(result, expected)
  436. def test_concat_duplicate_indices_raise(self):
  437. # GH 45888: test raise for concat DataFrames with duplicate indices
  438. # https://github.com/pandas-dev/pandas/issues/36263
  439. df1 = DataFrame(
  440. np.random.default_rng(2).standard_normal(5),
  441. index=[0, 1, 2, 3, 3],
  442. columns=["a"],
  443. )
  444. df2 = DataFrame(
  445. np.random.default_rng(2).standard_normal(5),
  446. index=[0, 1, 2, 2, 4],
  447. columns=["b"],
  448. )
  449. msg = "Reindexing only valid with uniquely valued Index objects"
  450. with pytest.raises(InvalidIndexError, match=msg):
  451. concat([df1, df2], axis=1)
  452. def test_concat_no_unnecessary_upcast(float_numpy_dtype, frame_or_series):
  453. # GH 13247
  454. dims = frame_or_series(dtype=object).ndim
  455. dt = float_numpy_dtype
  456. dfs = [
  457. frame_or_series(np.array([1], dtype=dt, ndmin=dims)),
  458. frame_or_series(np.array([np.nan], dtype=dt, ndmin=dims)),
  459. frame_or_series(np.array([5], dtype=dt, ndmin=dims)),
  460. ]
  461. x = concat(dfs)
  462. assert x.values.dtype == dt
  463. @pytest.mark.parametrize("pdt", [Series, DataFrame])
  464. def test_concat_will_upcast(pdt, any_signed_int_numpy_dtype):
  465. dt = any_signed_int_numpy_dtype
  466. dims = pdt().ndim
  467. dfs = [
  468. pdt(np.array([1], dtype=dt, ndmin=dims)),
  469. pdt(np.array([np.nan], ndmin=dims)),
  470. pdt(np.array([5], dtype=dt, ndmin=dims)),
  471. ]
  472. x = concat(dfs)
  473. assert x.values.dtype == "float64"
  474. def test_concat_empty_and_non_empty_frame_regression():
  475. # GH 18178 regression test
  476. df1 = DataFrame({"foo": [1]})
  477. df2 = DataFrame({"foo": []})
  478. expected = DataFrame({"foo": [1.0]})
  479. result = concat([df1, df2])
  480. tm.assert_frame_equal(result, expected)
  481. def test_concat_sparse():
  482. # GH 23557
  483. a = Series(SparseArray([0, 1, 2]))
  484. expected = DataFrame(data=[[0, 0], [1, 1], [2, 2]]).astype(
  485. pd.SparseDtype(np.int64, 0)
  486. )
  487. result = concat([a, a], axis=1)
  488. tm.assert_frame_equal(result, expected)
  489. def test_concat_dense_sparse():
  490. # GH 30668
  491. dtype = pd.SparseDtype(np.float64, None)
  492. a = Series(pd.arrays.SparseArray([1, None]), dtype=dtype)
  493. b = Series([1], dtype=float)
  494. expected = Series(data=[1, None, 1], index=[0, 1, 0]).astype(dtype)
  495. result = concat([a, b], axis=0)
  496. tm.assert_series_equal(result, expected)
  497. @pytest.mark.parametrize("keys", [["e", "f", "f"], ["f", "e", "f"]])
  498. def test_duplicate_keys(keys):
  499. # GH 33654
  500. df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
  501. s1 = Series([7, 8, 9], name="c")
  502. s2 = Series([10, 11, 12], name="d")
  503. result = concat([df, s1, s2], axis=1, keys=keys)
  504. expected_values = [[1, 4, 7, 10], [2, 5, 8, 11], [3, 6, 9, 12]]
  505. expected_columns = MultiIndex.from_tuples(
  506. [(keys[0], "a"), (keys[0], "b"), (keys[1], "c"), (keys[2], "d")]
  507. )
  508. expected = DataFrame(expected_values, columns=expected_columns)
  509. tm.assert_frame_equal(result, expected)
  510. def test_duplicate_keys_same_frame():
  511. # GH 43595
  512. keys = ["e", "e"]
  513. df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
  514. result = concat([df, df], axis=1, keys=keys)
  515. expected_values = [[1, 4, 1, 4], [2, 5, 2, 5], [3, 6, 3, 6]]
  516. expected_columns = MultiIndex.from_tuples(
  517. [(keys[0], "a"), (keys[0], "b"), (keys[1], "a"), (keys[1], "b")]
  518. )
  519. expected = DataFrame(expected_values, columns=expected_columns)
  520. tm.assert_frame_equal(result, expected)
  521. @pytest.mark.filterwarnings(
  522. "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning"
  523. )
  524. @pytest.mark.parametrize(
  525. "obj",
  526. [
  527. tm.SubclassedDataFrame({"A": np.arange(0, 10)}),
  528. tm.SubclassedSeries(np.arange(0, 10), name="A"),
  529. ],
  530. )
  531. def test_concat_preserves_subclass(obj):
  532. # GH28330 -- preserve subclass
  533. result = concat([obj, obj])
  534. assert isinstance(result, type(obj))
  535. def test_concat_frame_axis0_extension_dtypes():
  536. # preserve extension dtype (through common_dtype mechanism)
  537. df1 = DataFrame({"a": pd.array([1, 2, 3], dtype="Int64")})
  538. df2 = DataFrame({"a": np.array([4, 5, 6])})
  539. result = concat([df1, df2], ignore_index=True)
  540. expected = DataFrame({"a": [1, 2, 3, 4, 5, 6]}, dtype="Int64")
  541. tm.assert_frame_equal(result, expected)
  542. result = concat([df2, df1], ignore_index=True)
  543. expected = DataFrame({"a": [4, 5, 6, 1, 2, 3]}, dtype="Int64")
  544. tm.assert_frame_equal(result, expected)
  545. def test_concat_preserves_extension_int64_dtype():
  546. # GH 24768
  547. df_a = DataFrame({"a": [-1]}, dtype="Int64")
  548. df_b = DataFrame({"b": [1]}, dtype="Int64")
  549. result = concat([df_a, df_b], ignore_index=True)
  550. expected = DataFrame({"a": [-1, None], "b": [None, 1]}, dtype="Int64")
  551. tm.assert_frame_equal(result, expected)
  552. @pytest.mark.parametrize(
  553. "dtype1,dtype2,expected_dtype",
  554. [
  555. ("bool", "bool", "bool"),
  556. ("boolean", "bool", "boolean"),
  557. ("bool", "boolean", "boolean"),
  558. ("boolean", "boolean", "boolean"),
  559. ],
  560. )
  561. def test_concat_bool_types(dtype1, dtype2, expected_dtype):
  562. # GH 42800
  563. ser1 = Series([True, False], dtype=dtype1)
  564. ser2 = Series([False, True], dtype=dtype2)
  565. result = concat([ser1, ser2], ignore_index=True)
  566. expected = Series([True, False, False, True], dtype=expected_dtype)
  567. tm.assert_series_equal(result, expected)
  568. @pytest.mark.parametrize(
  569. ("keys", "integrity"),
  570. [
  571. (["red"] * 3, True),
  572. (["red"] * 3, False),
  573. (["red", "blue", "red"], False),
  574. (["red", "blue", "red"], True),
  575. ],
  576. )
  577. def test_concat_repeated_keys(keys, integrity):
  578. # GH: 20816
  579. series_list = [Series({"a": 1}), Series({"b": 2}), Series({"c": 3})]
  580. result = concat(series_list, keys=keys, verify_integrity=integrity)
  581. tuples = list(zip(keys, ["a", "b", "c"]))
  582. expected = Series([1, 2, 3], index=MultiIndex.from_tuples(tuples))
  583. tm.assert_series_equal(result, expected)
  584. def test_concat_null_object_with_dti():
  585. # GH#40841
  586. dti = pd.DatetimeIndex(
  587. ["2021-04-08 21:21:14+00:00"], dtype="datetime64[ns, UTC]", name="Time (UTC)"
  588. )
  589. right = DataFrame(data={"C": [0.5274]}, index=dti)
  590. idx = Index([None], dtype="object", name="Maybe Time (UTC)")
  591. left = DataFrame(data={"A": [None], "B": [np.nan]}, index=idx)
  592. result = concat([left, right], axis="columns")
  593. exp_index = Index([None, dti[0]], dtype=object)
  594. expected = DataFrame(
  595. {
  596. "A": np.array([None, np.nan], dtype=object),
  597. "B": [np.nan, np.nan],
  598. "C": [np.nan, 0.5274],
  599. },
  600. index=exp_index,
  601. )
  602. tm.assert_frame_equal(result, expected)
  603. def test_concat_multiindex_with_empty_rangeindex():
  604. # GH#41234
  605. mi = MultiIndex.from_tuples([("B", 1), ("C", 1)])
  606. df1 = DataFrame([[1, 2]], columns=mi)
  607. df2 = DataFrame(index=[1], columns=pd.RangeIndex(0))
  608. result = concat([df1, df2])
  609. expected = DataFrame([[1, 2], [np.nan, np.nan]], columns=mi)
  610. tm.assert_frame_equal(result, expected)
  611. @pytest.mark.parametrize(
  612. "data",
  613. [
  614. Series(data=[1, 2]),
  615. DataFrame(
  616. data={
  617. "col1": [1, 2],
  618. }
  619. ),
  620. DataFrame(dtype=float),
  621. Series(dtype=float),
  622. ],
  623. )
  624. def test_concat_drop_attrs(data):
  625. # GH#41828
  626. df1 = data.copy()
  627. df1.attrs = {1: 1}
  628. df2 = data.copy()
  629. df2.attrs = {1: 2}
  630. df = concat([df1, df2])
  631. assert len(df.attrs) == 0
  632. @pytest.mark.parametrize(
  633. "data",
  634. [
  635. Series(data=[1, 2]),
  636. DataFrame(
  637. data={
  638. "col1": [1, 2],
  639. }
  640. ),
  641. DataFrame(dtype=float),
  642. Series(dtype=float),
  643. ],
  644. )
  645. def test_concat_retain_attrs(data):
  646. # GH#41828
  647. df1 = data.copy()
  648. df1.attrs = {1: 1}
  649. df2 = data.copy()
  650. df2.attrs = {1: 1}
  651. df = concat([df1, df2])
  652. assert df.attrs[1] == 1
  653. @td.skip_array_manager_invalid_test
  654. @pytest.mark.parametrize("df_dtype", ["float64", "int64", "datetime64[ns]"])
  655. @pytest.mark.parametrize("empty_dtype", [None, "float64", "object"])
  656. def test_concat_ignore_empty_object_float(empty_dtype, df_dtype):
  657. # https://github.com/pandas-dev/pandas/issues/45637
  658. df = DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype=df_dtype)
  659. empty = DataFrame(columns=["foo", "bar"], dtype=empty_dtype)
  660. msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
  661. warn = None
  662. if df_dtype == "datetime64[ns]" or (
  663. df_dtype == "float64" and empty_dtype != "float64"
  664. ):
  665. warn = FutureWarning
  666. with tm.assert_produces_warning(warn, match=msg):
  667. result = concat([empty, df])
  668. expected = df
  669. if df_dtype == "int64":
  670. # TODO what exact behaviour do we want for integer eventually?
  671. if empty_dtype == "float64":
  672. expected = df.astype("float64")
  673. else:
  674. expected = df.astype("object")
  675. tm.assert_frame_equal(result, expected)
  676. @td.skip_array_manager_invalid_test
  677. @pytest.mark.parametrize("df_dtype", ["float64", "int64", "datetime64[ns]"])
  678. @pytest.mark.parametrize("empty_dtype", [None, "float64", "object"])
  679. def test_concat_ignore_all_na_object_float(empty_dtype, df_dtype):
  680. df = DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype=df_dtype)
  681. empty = DataFrame({"foo": [np.nan], "bar": [np.nan]}, dtype=empty_dtype)
  682. if df_dtype == "int64":
  683. # TODO what exact behaviour do we want for integer eventually?
  684. if empty_dtype == "object":
  685. df_dtype = "object"
  686. else:
  687. df_dtype = "float64"
  688. msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
  689. warn = None
  690. if empty_dtype != df_dtype and empty_dtype is not None:
  691. warn = FutureWarning
  692. elif df_dtype == "datetime64[ns]":
  693. warn = FutureWarning
  694. with tm.assert_produces_warning(warn, match=msg):
  695. result = concat([empty, df], ignore_index=True)
  696. expected = DataFrame({"foo": [np.nan, 1, 2], "bar": [np.nan, 1, 2]}, dtype=df_dtype)
  697. tm.assert_frame_equal(result, expected)
  698. @td.skip_array_manager_invalid_test
  699. def test_concat_ignore_empty_from_reindex():
  700. # https://github.com/pandas-dev/pandas/pull/43507#issuecomment-920375856
  701. df1 = DataFrame({"a": [1], "b": [pd.Timestamp("2012-01-01")]})
  702. df2 = DataFrame({"a": [2]})
  703. aligned = df2.reindex(columns=df1.columns)
  704. msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
  705. with tm.assert_produces_warning(FutureWarning, match=msg):
  706. result = concat([df1, aligned], ignore_index=True)
  707. expected = df1 = DataFrame({"a": [1, 2], "b": [pd.Timestamp("2012-01-01"), pd.NaT]})
  708. tm.assert_frame_equal(result, expected)
  709. def test_concat_mismatched_keys_length():
  710. # GH#43485
  711. ser = Series(range(5))
  712. sers = [ser + n for n in range(4)]
  713. keys = ["A", "B", "C"]
  714. msg = r"The behavior of pd.concat with len\(keys\) != len\(objs\) is deprecated"
  715. with tm.assert_produces_warning(FutureWarning, match=msg):
  716. concat(sers, keys=keys, axis=1)
  717. with tm.assert_produces_warning(FutureWarning, match=msg):
  718. concat(sers, keys=keys, axis=0)
  719. with tm.assert_produces_warning(FutureWarning, match=msg):
  720. concat((x for x in sers), keys=(y for y in keys), axis=1)
  721. with tm.assert_produces_warning(FutureWarning, match=msg):
  722. concat((x for x in sers), keys=(y for y in keys), axis=0)
  723. def test_concat_multiindex_with_category():
  724. df1 = DataFrame(
  725. {
  726. "c1": Series(list("abc"), dtype="category"),
  727. "c2": Series(list("eee"), dtype="category"),
  728. "i2": Series([1, 2, 3]),
  729. }
  730. )
  731. df1 = df1.set_index(["c1", "c2"])
  732. df2 = DataFrame(
  733. {
  734. "c1": Series(list("abc"), dtype="category"),
  735. "c2": Series(list("eee"), dtype="category"),
  736. "i2": Series([4, 5, 6]),
  737. }
  738. )
  739. df2 = df2.set_index(["c1", "c2"])
  740. result = concat([df1, df2])
  741. expected = DataFrame(
  742. {
  743. "c1": Series(list("abcabc"), dtype="category"),
  744. "c2": Series(list("eeeeee"), dtype="category"),
  745. "i2": Series([1, 2, 3, 4, 5, 6]),
  746. }
  747. )
  748. expected = expected.set_index(["c1", "c2"])
  749. tm.assert_frame_equal(result, expected)
  750. def test_concat_ea_upcast():
  751. # GH#54848
  752. df1 = DataFrame(["a"], dtype="string")
  753. df2 = DataFrame([1], dtype="Int64")
  754. result = concat([df1, df2])
  755. expected = DataFrame(["a", 1], index=[0, 0])
  756. tm.assert_frame_equal(result, expected)
  757. def test_concat_none_with_timezone_timestamp():
  758. # GH#52093
  759. df1 = DataFrame([{"A": None}])
  760. df2 = DataFrame([{"A": pd.Timestamp("1990-12-20 00:00:00+00:00")}])
  761. msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
  762. with tm.assert_produces_warning(FutureWarning, match=msg):
  763. result = concat([df1, df2], ignore_index=True)
  764. expected = DataFrame({"A": [None, pd.Timestamp("1990-12-20 00:00:00+00:00")]})
  765. tm.assert_frame_equal(result, expected)