test_append.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389
  1. import datetime as dt
  2. from itertools import combinations
  3. import dateutil
  4. import numpy as np
  5. import pytest
  6. import pandas as pd
  7. from pandas import (
  8. DataFrame,
  9. Index,
  10. Series,
  11. Timestamp,
  12. concat,
  13. isna,
  14. )
  15. import pandas._testing as tm
  16. class TestAppend:
  17. def test_append(self, sort, float_frame):
  18. mixed_frame = float_frame.copy()
  19. mixed_frame["foo"] = "bar"
  20. begin_index = float_frame.index[:5]
  21. end_index = float_frame.index[5:]
  22. begin_frame = float_frame.reindex(begin_index)
  23. end_frame = float_frame.reindex(end_index)
  24. appended = begin_frame._append(end_frame)
  25. tm.assert_almost_equal(appended["A"], float_frame["A"])
  26. del end_frame["A"]
  27. partial_appended = begin_frame._append(end_frame, sort=sort)
  28. assert "A" in partial_appended
  29. partial_appended = end_frame._append(begin_frame, sort=sort)
  30. assert "A" in partial_appended
  31. # mixed type handling
  32. appended = mixed_frame[:5]._append(mixed_frame[5:])
  33. tm.assert_frame_equal(appended, mixed_frame)
  34. # what to test here
  35. mixed_appended = mixed_frame[:5]._append(float_frame[5:], sort=sort)
  36. mixed_appended2 = float_frame[:5]._append(mixed_frame[5:], sort=sort)
  37. # all equal except 'foo' column
  38. tm.assert_frame_equal(
  39. mixed_appended.reindex(columns=["A", "B", "C", "D"]),
  40. mixed_appended2.reindex(columns=["A", "B", "C", "D"]),
  41. )
  42. def test_append_empty(self, float_frame):
  43. empty = DataFrame()
  44. appended = float_frame._append(empty)
  45. tm.assert_frame_equal(float_frame, appended)
  46. assert appended is not float_frame
  47. appended = empty._append(float_frame)
  48. tm.assert_frame_equal(float_frame, appended)
  49. assert appended is not float_frame
  50. def test_append_overlap_raises(self, float_frame):
  51. msg = "Indexes have overlapping values"
  52. with pytest.raises(ValueError, match=msg):
  53. float_frame._append(float_frame, verify_integrity=True)
  54. def test_append_new_columns(self):
  55. # see gh-6129: new columns
  56. df = DataFrame({"a": {"x": 1, "y": 2}, "b": {"x": 3, "y": 4}})
  57. row = Series([5, 6, 7], index=["a", "b", "c"], name="z")
  58. expected = DataFrame(
  59. {
  60. "a": {"x": 1, "y": 2, "z": 5},
  61. "b": {"x": 3, "y": 4, "z": 6},
  62. "c": {"z": 7},
  63. }
  64. )
  65. result = df._append(row)
  66. tm.assert_frame_equal(result, expected)
  67. def test_append_length0_frame(self, sort):
  68. df = DataFrame(columns=["A", "B", "C"])
  69. df3 = DataFrame(index=[0, 1], columns=["A", "B"])
  70. df5 = df._append(df3, sort=sort)
  71. expected = DataFrame(index=[0, 1], columns=["A", "B", "C"])
  72. tm.assert_frame_equal(df5, expected)
  73. def test_append_records(self):
  74. arr1 = np.zeros((2,), dtype=("i4,f4,S10"))
  75. arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
  76. arr2 = np.zeros((3,), dtype=("i4,f4,S10"))
  77. arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")]
  78. df1 = DataFrame(arr1)
  79. df2 = DataFrame(arr2)
  80. result = df1._append(df2, ignore_index=True)
  81. expected = DataFrame(np.concatenate((arr1, arr2)))
  82. tm.assert_frame_equal(result, expected)
  83. # rewrite sort fixture, since we also want to test default of None
  84. def test_append_sorts(self, sort):
  85. df1 = DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"])
  86. df2 = DataFrame({"a": [1, 2], "c": [3, 4]}, index=[2, 3])
  87. result = df1._append(df2, sort=sort)
  88. # for None / True
  89. expected = DataFrame(
  90. {"b": [1, 2, None, None], "a": [1, 2, 1, 2], "c": [None, None, 3, 4]},
  91. columns=["a", "b", "c"],
  92. )
  93. if sort is False:
  94. expected = expected[["b", "a", "c"]]
  95. tm.assert_frame_equal(result, expected)
  96. def test_append_different_columns(self, sort):
  97. df = DataFrame(
  98. {
  99. "bools": np.random.default_rng(2).standard_normal(10) > 0,
  100. "ints": np.random.default_rng(2).integers(0, 10, 10),
  101. "floats": np.random.default_rng(2).standard_normal(10),
  102. "strings": ["foo", "bar"] * 5,
  103. }
  104. )
  105. a = df[:5].loc[:, ["bools", "ints", "floats"]]
  106. b = df[5:].loc[:, ["strings", "ints", "floats"]]
  107. appended = a._append(b, sort=sort)
  108. assert isna(appended["strings"][0:4]).all()
  109. assert isna(appended["bools"][5:]).all()
  110. def test_append_many(self, sort, float_frame):
  111. chunks = [
  112. float_frame[:5],
  113. float_frame[5:10],
  114. float_frame[10:15],
  115. float_frame[15:],
  116. ]
  117. result = chunks[0]._append(chunks[1:])
  118. tm.assert_frame_equal(result, float_frame)
  119. chunks[-1] = chunks[-1].copy()
  120. chunks[-1]["foo"] = "bar"
  121. result = chunks[0]._append(chunks[1:], sort=sort)
  122. tm.assert_frame_equal(result.loc[:, float_frame.columns], float_frame)
  123. assert (result["foo"][15:] == "bar").all()
  124. assert result["foo"][:15].isna().all()
  125. def test_append_preserve_index_name(self):
  126. # #980
  127. df1 = DataFrame(columns=["A", "B", "C"])
  128. df1 = df1.set_index(["A"])
  129. df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"])
  130. df2 = df2.set_index(["A"])
  131. msg = "The behavior of array concatenation with empty entries is deprecated"
  132. with tm.assert_produces_warning(FutureWarning, match=msg):
  133. result = df1._append(df2)
  134. assert result.index.name == "A"
  135. indexes_can_append = [
  136. pd.RangeIndex(3),
  137. Index([4, 5, 6]),
  138. Index([4.5, 5.5, 6.5]),
  139. Index(list("abc")),
  140. pd.CategoricalIndex("A B C".split()),
  141. pd.CategoricalIndex("D E F".split(), ordered=True),
  142. pd.IntervalIndex.from_breaks([7, 8, 9, 10]),
  143. pd.DatetimeIndex(
  144. [
  145. dt.datetime(2013, 1, 3, 0, 0),
  146. dt.datetime(2013, 1, 3, 6, 10),
  147. dt.datetime(2013, 1, 3, 7, 12),
  148. ]
  149. ),
  150. pd.MultiIndex.from_arrays(["A B C".split(), "D E F".split()]),
  151. ]
  152. @pytest.mark.parametrize(
  153. "index", indexes_can_append, ids=lambda x: type(x).__name__
  154. )
  155. def test_append_same_columns_type(self, index):
  156. # GH18359
  157. # df wider than ser
  158. df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=index)
  159. ser_index = index[:2]
  160. ser = Series([7, 8], index=ser_index, name=2)
  161. result = df._append(ser)
  162. expected = DataFrame(
  163. [[1, 2, 3.0], [4, 5, 6], [7, 8, np.nan]], index=[0, 1, 2], columns=index
  164. )
  165. # integer dtype is preserved for columns present in ser.index
  166. assert expected.dtypes.iloc[0].kind == "i"
  167. assert expected.dtypes.iloc[1].kind == "i"
  168. tm.assert_frame_equal(result, expected)
  169. # ser wider than df
  170. ser_index = index
  171. index = index[:2]
  172. df = DataFrame([[1, 2], [4, 5]], columns=index)
  173. ser = Series([7, 8, 9], index=ser_index, name=2)
  174. result = df._append(ser)
  175. expected = DataFrame(
  176. [[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]],
  177. index=[0, 1, 2],
  178. columns=ser_index,
  179. )
  180. tm.assert_frame_equal(result, expected)
  181. @pytest.mark.parametrize(
  182. "df_columns, series_index",
  183. combinations(indexes_can_append, r=2),
  184. ids=lambda x: type(x).__name__,
  185. )
  186. def test_append_different_columns_types(self, df_columns, series_index):
  187. # GH18359
  188. # See also test 'test_append_different_columns_types_raises' below
  189. # for errors raised when appending
  190. df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns)
  191. ser = Series([7, 8, 9], index=series_index, name=2)
  192. result = df._append(ser)
  193. idx_diff = ser.index.difference(df_columns)
  194. combined_columns = Index(df_columns.tolist()).append(idx_diff)
  195. expected = DataFrame(
  196. [
  197. [1.0, 2.0, 3.0, np.nan, np.nan, np.nan],
  198. [4, 5, 6, np.nan, np.nan, np.nan],
  199. [np.nan, np.nan, np.nan, 7, 8, 9],
  200. ],
  201. index=[0, 1, 2],
  202. columns=combined_columns,
  203. )
  204. tm.assert_frame_equal(result, expected)
  205. def test_append_dtype_coerce(self, sort):
  206. # GH 4993
  207. # appending with datetime will incorrectly convert datetime64
  208. df1 = DataFrame(
  209. index=[1, 2],
  210. data=[dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0)],
  211. columns=["start_time"],
  212. )
  213. df2 = DataFrame(
  214. index=[4, 5],
  215. data=[
  216. [dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 3, 6, 10)],
  217. [dt.datetime(2013, 1, 4, 0, 0), dt.datetime(2013, 1, 4, 7, 10)],
  218. ],
  219. columns=["start_time", "end_time"],
  220. )
  221. expected = concat(
  222. [
  223. Series(
  224. [
  225. pd.NaT,
  226. pd.NaT,
  227. dt.datetime(2013, 1, 3, 6, 10),
  228. dt.datetime(2013, 1, 4, 7, 10),
  229. ],
  230. name="end_time",
  231. ),
  232. Series(
  233. [
  234. dt.datetime(2013, 1, 1, 0, 0),
  235. dt.datetime(2013, 1, 2, 0, 0),
  236. dt.datetime(2013, 1, 3, 0, 0),
  237. dt.datetime(2013, 1, 4, 0, 0),
  238. ],
  239. name="start_time",
  240. ),
  241. ],
  242. axis=1,
  243. sort=sort,
  244. )
  245. result = df1._append(df2, ignore_index=True, sort=sort)
  246. if sort:
  247. expected = expected[["end_time", "start_time"]]
  248. else:
  249. expected = expected[["start_time", "end_time"]]
  250. tm.assert_frame_equal(result, expected)
  251. def test_append_missing_column_proper_upcast(self, sort):
  252. df1 = DataFrame({"A": np.array([1, 2, 3, 4], dtype="i8")})
  253. df2 = DataFrame({"B": np.array([True, False, True, False], dtype=bool)})
  254. appended = df1._append(df2, ignore_index=True, sort=sort)
  255. assert appended["A"].dtype == "f8"
  256. assert appended["B"].dtype == "O"
  257. def test_append_empty_frame_to_series_with_dateutil_tz(self):
  258. # GH 23682
  259. date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())
  260. ser = Series({"a": 1.0, "b": 2.0, "date": date})
  261. df = DataFrame(columns=["c", "d"])
  262. result_a = df._append(ser, ignore_index=True)
  263. expected = DataFrame(
  264. [[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"]
  265. )
  266. # These columns get cast to object after append
  267. expected["c"] = expected["c"].astype(object)
  268. expected["d"] = expected["d"].astype(object)
  269. tm.assert_frame_equal(result_a, expected)
  270. expected = DataFrame(
  271. [[np.nan, np.nan, 1.0, 2.0, date]] * 2, columns=["c", "d", "a", "b", "date"]
  272. )
  273. expected["c"] = expected["c"].astype(object)
  274. expected["d"] = expected["d"].astype(object)
  275. result_b = result_a._append(ser, ignore_index=True)
  276. tm.assert_frame_equal(result_b, expected)
  277. result = df._append([ser, ser], ignore_index=True)
  278. tm.assert_frame_equal(result, expected)
  279. def test_append_empty_tz_frame_with_datetime64ns(self, using_array_manager):
  280. # https://github.com/pandas-dev/pandas/issues/35460
  281. df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")
  282. # pd.NaT gets inferred as tz-naive, so append result is tz-naive
  283. result = df._append({"a": pd.NaT}, ignore_index=True)
  284. if using_array_manager:
  285. expected = DataFrame({"a": [pd.NaT]}, dtype=object)
  286. else:
  287. expected = DataFrame({"a": [np.nan]}, dtype=object)
  288. tm.assert_frame_equal(result, expected)
  289. # also test with typed value to append
  290. df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")
  291. other = Series({"a": pd.NaT}, dtype="datetime64[ns]")
  292. result = df._append(other, ignore_index=True)
  293. tm.assert_frame_equal(result, expected)
  294. # mismatched tz
  295. other = Series({"a": pd.NaT}, dtype="datetime64[ns, US/Pacific]")
  296. result = df._append(other, ignore_index=True)
  297. expected = DataFrame({"a": [pd.NaT]}).astype(object)
  298. tm.assert_frame_equal(result, expected)
  299. @pytest.mark.parametrize(
  300. "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
  301. )
  302. @pytest.mark.parametrize("val", [1, "NaT"])
  303. def test_append_empty_frame_with_timedelta64ns_nat(
  304. self, dtype_str, val, using_array_manager
  305. ):
  306. # https://github.com/pandas-dev/pandas/issues/35460
  307. df = DataFrame(columns=["a"]).astype(dtype_str)
  308. other = DataFrame({"a": [np.timedelta64(val, "ns")]})
  309. result = df._append(other, ignore_index=True)
  310. expected = other.astype(object)
  311. if isinstance(val, str) and dtype_str != "int64" and not using_array_manager:
  312. # TODO: expected used to be `other.astype(object)` which is a more
  313. # reasonable result. This was changed when tightening
  314. # assert_frame_equal's treatment of mismatched NAs to match the
  315. # existing behavior.
  316. expected = DataFrame({"a": [np.nan]}, dtype=object)
  317. tm.assert_frame_equal(result, expected)
  318. @pytest.mark.parametrize(
  319. "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
  320. )
  321. @pytest.mark.parametrize("val", [1, "NaT"])
  322. def test_append_frame_with_timedelta64ns_nat(self, dtype_str, val):
  323. # https://github.com/pandas-dev/pandas/issues/35460
  324. df = DataFrame({"a": pd.array([1], dtype=dtype_str)})
  325. other = DataFrame({"a": [np.timedelta64(val, "ns")]})
  326. result = df._append(other, ignore_index=True)
  327. expected = DataFrame({"a": [df.iloc[0, 0], other.iloc[0, 0]]}, dtype=object)
  328. tm.assert_frame_equal(result, expected)