test_empty.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299
  1. import numpy as np
  2. import pytest
  3. from pandas._config import using_string_dtype
  4. import pandas as pd
  5. from pandas import (
  6. DataFrame,
  7. RangeIndex,
  8. Series,
  9. concat,
  10. date_range,
  11. )
  12. import pandas._testing as tm
  13. class TestEmptyConcat:
  14. def test_handle_empty_objects(self, sort, using_infer_string):
  15. df = DataFrame(
  16. np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd")
  17. )
  18. dfcopy = df[:5].copy()
  19. dfcopy["foo"] = "bar"
  20. empty = df[5:5]
  21. frames = [dfcopy, empty, empty, df[5:]]
  22. concatted = concat(frames, axis=0, sort=sort)
  23. expected = df.reindex(columns=["a", "b", "c", "d", "foo"])
  24. expected["foo"] = expected["foo"].astype(
  25. object if not using_infer_string else "str"
  26. )
  27. expected.loc[0:4, "foo"] = "bar"
  28. tm.assert_frame_equal(concatted, expected)
  29. # empty as first element with time series
  30. # GH3259
  31. df = DataFrame(
  32. {"A": range(10000)}, index=date_range("20130101", periods=10000, freq="s")
  33. )
  34. empty = DataFrame()
  35. result = concat([df, empty], axis=1)
  36. tm.assert_frame_equal(result, df)
  37. result = concat([empty, df], axis=1)
  38. tm.assert_frame_equal(result, df)
  39. result = concat([df, empty])
  40. tm.assert_frame_equal(result, df)
  41. result = concat([empty, df])
  42. tm.assert_frame_equal(result, df)
  43. def test_concat_empty_series(self):
  44. # GH 11082
  45. s1 = Series([1, 2, 3], name="x")
  46. s2 = Series(name="y", dtype="float64")
  47. res = concat([s1, s2], axis=1)
  48. exp = DataFrame(
  49. {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan]},
  50. index=RangeIndex(3),
  51. )
  52. tm.assert_frame_equal(res, exp)
  53. s1 = Series([1, 2, 3], name="x")
  54. s2 = Series(name="y", dtype="float64")
  55. msg = "The behavior of array concatenation with empty entries is deprecated"
  56. with tm.assert_produces_warning(FutureWarning, match=msg):
  57. res = concat([s1, s2], axis=0)
  58. # name will be reset
  59. exp = Series([1, 2, 3])
  60. tm.assert_series_equal(res, exp)
  61. # empty Series with no name
  62. s1 = Series([1, 2, 3], name="x")
  63. s2 = Series(name=None, dtype="float64")
  64. res = concat([s1, s2], axis=1)
  65. exp = DataFrame(
  66. {"x": [1, 2, 3], 0: [np.nan, np.nan, np.nan]},
  67. columns=["x", 0],
  68. index=RangeIndex(3),
  69. )
  70. tm.assert_frame_equal(res, exp)
  71. @pytest.mark.parametrize("tz", [None, "UTC"])
  72. @pytest.mark.parametrize("values", [[], [1, 2, 3]])
  73. def test_concat_empty_series_timelike(self, tz, values):
  74. # GH 18447
  75. first = Series([], dtype="M8[ns]").dt.tz_localize(tz)
  76. dtype = None if values else np.float64
  77. second = Series(values, dtype=dtype)
  78. expected = DataFrame(
  79. {
  80. 0: Series([pd.NaT] * len(values), dtype="M8[ns]").dt.tz_localize(tz),
  81. 1: values,
  82. }
  83. )
  84. result = concat([first, second], axis=1)
  85. tm.assert_frame_equal(result, expected)
  86. @pytest.mark.parametrize(
  87. "left,right,expected",
  88. [
  89. # booleans
  90. (np.bool_, np.int32, np.object_), # changed from int32 in 2.0 GH#39817
  91. (np.bool_, np.float32, np.object_),
  92. # datetime-like
  93. ("m8[ns]", np.bool_, np.object_),
  94. ("m8[ns]", np.int64, np.object_),
  95. ("M8[ns]", np.bool_, np.object_),
  96. ("M8[ns]", np.int64, np.object_),
  97. # categorical
  98. ("category", "category", "category"),
  99. ("category", "object", "object"),
  100. ],
  101. )
  102. def test_concat_empty_series_dtypes(self, left, right, expected):
  103. # GH#39817, GH#45101
  104. result = concat([Series(dtype=left), Series(dtype=right)])
  105. assert result.dtype == expected
  106. @pytest.mark.parametrize(
  107. "dtype", ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"]
  108. )
  109. def test_concat_empty_series_dtypes_match_roundtrips(self, dtype):
  110. dtype = np.dtype(dtype)
  111. result = concat([Series(dtype=dtype)])
  112. assert result.dtype == dtype
  113. result = concat([Series(dtype=dtype), Series(dtype=dtype)])
  114. assert result.dtype == dtype
  115. @pytest.mark.parametrize("dtype", ["float64", "int8", "uint8", "m8[ns]", "M8[ns]"])
  116. @pytest.mark.parametrize(
  117. "dtype2",
  118. ["float64", "int8", "uint8", "m8[ns]", "M8[ns]"],
  119. )
  120. def test_concat_empty_series_dtypes_roundtrips(self, dtype, dtype2):
  121. # round-tripping with self & like self
  122. if dtype == dtype2:
  123. pytest.skip("same dtype is not applicable for test")
  124. def int_result_type(dtype, dtype2):
  125. typs = {dtype.kind, dtype2.kind}
  126. if not len(typs - {"i", "u", "b"}) and (
  127. dtype.kind == "i" or dtype2.kind == "i"
  128. ):
  129. return "i"
  130. elif not len(typs - {"u", "b"}) and (
  131. dtype.kind == "u" or dtype2.kind == "u"
  132. ):
  133. return "u"
  134. return None
  135. def float_result_type(dtype, dtype2):
  136. typs = {dtype.kind, dtype2.kind}
  137. if not len(typs - {"f", "i", "u"}) and (
  138. dtype.kind == "f" or dtype2.kind == "f"
  139. ):
  140. return "f"
  141. return None
  142. def get_result_type(dtype, dtype2):
  143. result = float_result_type(dtype, dtype2)
  144. if result is not None:
  145. return result
  146. result = int_result_type(dtype, dtype2)
  147. if result is not None:
  148. return result
  149. return "O"
  150. dtype = np.dtype(dtype)
  151. dtype2 = np.dtype(dtype2)
  152. expected = get_result_type(dtype, dtype2)
  153. result = concat([Series(dtype=dtype), Series(dtype=dtype2)]).dtype
  154. assert result.kind == expected
  155. def test_concat_empty_series_dtypes_triple(self):
  156. assert (
  157. concat(
  158. [Series(dtype="M8[ns]"), Series(dtype=np.bool_), Series(dtype=np.int64)]
  159. ).dtype
  160. == np.object_
  161. )
  162. def test_concat_empty_series_dtype_category_with_array(self):
  163. # GH#18515
  164. assert (
  165. concat(
  166. [Series(np.array([]), dtype="category"), Series(dtype="float64")]
  167. ).dtype
  168. == "float64"
  169. )
  170. def test_concat_empty_series_dtypes_sparse(self):
  171. result = concat(
  172. [
  173. Series(dtype="float64").astype("Sparse"),
  174. Series(dtype="float64").astype("Sparse"),
  175. ]
  176. )
  177. assert result.dtype == "Sparse[float64]"
  178. result = concat(
  179. [Series(dtype="float64").astype("Sparse"), Series(dtype="float64")]
  180. )
  181. expected = pd.SparseDtype(np.float64)
  182. assert result.dtype == expected
  183. result = concat(
  184. [Series(dtype="float64").astype("Sparse"), Series(dtype="object")]
  185. )
  186. expected = pd.SparseDtype("object")
  187. assert result.dtype == expected
  188. def test_concat_empty_df_object_dtype(self):
  189. # GH 9149
  190. df_1 = DataFrame({"Row": [0, 1, 1], "EmptyCol": np.nan, "NumberCol": [1, 2, 3]})
  191. df_2 = DataFrame(columns=df_1.columns)
  192. result = concat([df_1, df_2], axis=0)
  193. expected = df_1.astype(object)
  194. tm.assert_frame_equal(result, expected)
  195. def test_concat_empty_dataframe_dtypes(self):
  196. df = DataFrame(columns=list("abc"))
  197. df["a"] = df["a"].astype(np.bool_)
  198. df["b"] = df["b"].astype(np.int32)
  199. df["c"] = df["c"].astype(np.float64)
  200. result = concat([df, df])
  201. assert result["a"].dtype == np.bool_
  202. assert result["b"].dtype == np.int32
  203. assert result["c"].dtype == np.float64
  204. result = concat([df, df.astype(np.float64)])
  205. assert result["a"].dtype == np.object_
  206. assert result["b"].dtype == np.float64
  207. assert result["c"].dtype == np.float64
  208. # triggers warning about empty entries
  209. @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
  210. def test_concat_inner_join_empty(self):
  211. # GH 15328
  212. df_empty = DataFrame()
  213. df_a = DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64")
  214. df_expected = DataFrame({"a": []}, index=RangeIndex(0), dtype="int64")
  215. result = concat([df_a, df_empty], axis=1, join="inner")
  216. tm.assert_frame_equal(result, df_expected)
  217. result = concat([df_a, df_empty], axis=1, join="outer")
  218. tm.assert_frame_equal(result, df_a)
  219. def test_empty_dtype_coerce(self):
  220. # xref to #12411
  221. # xref to #12045
  222. # xref to #11594
  223. # see below
  224. # 10571
  225. df1 = DataFrame(data=[[1, None], [2, None]], columns=["a", "b"])
  226. df2 = DataFrame(data=[[3, None], [4, None]], columns=["a", "b"])
  227. result = concat([df1, df2])
  228. expected = df1.dtypes
  229. tm.assert_series_equal(result.dtypes, expected)
  230. def test_concat_empty_dataframe(self):
  231. # 39037
  232. df1 = DataFrame(columns=["a", "b"])
  233. df2 = DataFrame(columns=["b", "c"])
  234. result = concat([df1, df2, df1])
  235. expected = DataFrame(columns=["a", "b", "c"])
  236. tm.assert_frame_equal(result, expected)
  237. df3 = DataFrame(columns=["a", "b"])
  238. df4 = DataFrame(columns=["b"])
  239. result = concat([df3, df4])
  240. expected = DataFrame(columns=["a", "b"])
  241. tm.assert_frame_equal(result, expected)
  242. def test_concat_empty_dataframe_different_dtypes(self, using_infer_string):
  243. # 39037
  244. df1 = DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
  245. df2 = DataFrame({"a": [1, 2, 3]})
  246. result = concat([df1[:0], df2[:0]])
  247. assert result["a"].dtype == np.int64
  248. assert result["b"].dtype == np.object_ if not using_infer_string else "str"
  249. def test_concat_to_empty_ea(self):
  250. """48510 `concat` to an empty EA should maintain type EA dtype."""
  251. df_empty = DataFrame({"a": pd.array([], dtype=pd.Int64Dtype())})
  252. df_new = DataFrame({"a": pd.array([1, 2, 3], dtype=pd.Int64Dtype())})
  253. expected = df_new.copy()
  254. result = concat([df_empty, df_new])
  255. tm.assert_frame_equal(result, expected)