# reshaping.py (14 KB)
import itertools

import numpy as np
import pytest

import pandas as pd
import pandas._testing as tm
from pandas.api.extensions import ExtensionArray
from pandas.core.internals.blocks import EABackedBlock
  8. class BaseReshapingTests:
  9. """Tests for reshaping and concatenation."""
  10. @pytest.mark.parametrize("in_frame", [True, False])
  11. def test_concat(self, data, in_frame):
  12. wrapped = pd.Series(data)
  13. if in_frame:
  14. wrapped = pd.DataFrame(wrapped)
  15. result = pd.concat([wrapped, wrapped], ignore_index=True)
  16. assert len(result) == len(data) * 2
  17. if in_frame:
  18. dtype = result.dtypes[0]
  19. else:
  20. dtype = result.dtype
  21. assert dtype == data.dtype
  22. if hasattr(result._mgr, "blocks"):
  23. assert isinstance(result._mgr.blocks[0], EABackedBlock)
  24. assert isinstance(result._mgr.arrays[0], ExtensionArray)
  25. @pytest.mark.parametrize("in_frame", [True, False])
  26. def test_concat_all_na_block(self, data_missing, in_frame):
  27. valid_block = pd.Series(data_missing.take([1, 1]), index=[0, 1])
  28. na_block = pd.Series(data_missing.take([0, 0]), index=[2, 3])
  29. if in_frame:
  30. valid_block = pd.DataFrame({"a": valid_block})
  31. na_block = pd.DataFrame({"a": na_block})
  32. result = pd.concat([valid_block, na_block])
  33. if in_frame:
  34. expected = pd.DataFrame({"a": data_missing.take([1, 1, 0, 0])})
  35. tm.assert_frame_equal(result, expected)
  36. else:
  37. expected = pd.Series(data_missing.take([1, 1, 0, 0]))
  38. tm.assert_series_equal(result, expected)
  39. def test_concat_mixed_dtypes(self, data):
  40. # https://github.com/pandas-dev/pandas/issues/20762
  41. df1 = pd.DataFrame({"A": data[:3]})
  42. df2 = pd.DataFrame({"A": [1, 2, 3]})
  43. df3 = pd.DataFrame({"A": ["a", "b", "c"]}).astype("category")
  44. dfs = [df1, df2, df3]
  45. # dataframes
  46. result = pd.concat(dfs)
  47. expected = pd.concat([x.astype(object) for x in dfs])
  48. tm.assert_frame_equal(result, expected)
  49. # series
  50. result = pd.concat([x["A"] for x in dfs])
  51. expected = pd.concat([x["A"].astype(object) for x in dfs])
  52. tm.assert_series_equal(result, expected)
  53. # simple test for just EA and one other
  54. result = pd.concat([df1, df2.astype(object)])
  55. expected = pd.concat([df1.astype("object"), df2.astype("object")])
  56. tm.assert_frame_equal(result, expected)
  57. result = pd.concat([df1["A"], df2["A"].astype(object)])
  58. expected = pd.concat([df1["A"].astype("object"), df2["A"].astype("object")])
  59. tm.assert_series_equal(result, expected)
  60. def test_concat_columns(self, data, na_value):
  61. df1 = pd.DataFrame({"A": data[:3]})
  62. df2 = pd.DataFrame({"B": [1, 2, 3]})
  63. expected = pd.DataFrame({"A": data[:3], "B": [1, 2, 3]})
  64. result = pd.concat([df1, df2], axis=1)
  65. tm.assert_frame_equal(result, expected)
  66. result = pd.concat([df1["A"], df2["B"]], axis=1)
  67. tm.assert_frame_equal(result, expected)
  68. # non-aligned
  69. df2 = pd.DataFrame({"B": [1, 2, 3]}, index=[1, 2, 3])
  70. expected = pd.DataFrame(
  71. {
  72. "A": data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype),
  73. "B": [np.nan, 1, 2, 3],
  74. }
  75. )
  76. result = pd.concat([df1, df2], axis=1)
  77. tm.assert_frame_equal(result, expected)
  78. result = pd.concat([df1["A"], df2["B"]], axis=1)
  79. tm.assert_frame_equal(result, expected)
  80. def test_concat_extension_arrays_copy_false(self, data, na_value):
  81. # GH 20756
  82. df1 = pd.DataFrame({"A": data[:3]})
  83. df2 = pd.DataFrame({"B": data[3:7]})
  84. expected = pd.DataFrame(
  85. {
  86. "A": data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype),
  87. "B": data[3:7],
  88. }
  89. )
  90. result = pd.concat([df1, df2], axis=1, copy=False)
  91. tm.assert_frame_equal(result, expected)
  92. def test_concat_with_reindex(self, data):
  93. # GH-33027
  94. a = pd.DataFrame({"a": data[:5]})
  95. b = pd.DataFrame({"b": data[:5]})
  96. result = pd.concat([a, b], ignore_index=True)
  97. expected = pd.DataFrame(
  98. {
  99. "a": data.take(list(range(5)) + ([-1] * 5), allow_fill=True),
  100. "b": data.take(([-1] * 5) + list(range(5)), allow_fill=True),
  101. }
  102. )
  103. tm.assert_frame_equal(result, expected)
  104. def test_align(self, data, na_value):
  105. a = data[:3]
  106. b = data[2:5]
  107. r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3]))
  108. # Assumes that the ctor can take a list of scalars of the type
  109. e1 = pd.Series(data._from_sequence(list(a) + [na_value], dtype=data.dtype))
  110. e2 = pd.Series(data._from_sequence([na_value] + list(b), dtype=data.dtype))
  111. tm.assert_series_equal(r1, e1)
  112. tm.assert_series_equal(r2, e2)
  113. def test_align_frame(self, data, na_value):
  114. a = data[:3]
  115. b = data[2:5]
  116. r1, r2 = pd.DataFrame({"A": a}).align(pd.DataFrame({"A": b}, index=[1, 2, 3]))
  117. # Assumes that the ctor can take a list of scalars of the type
  118. e1 = pd.DataFrame(
  119. {"A": data._from_sequence(list(a) + [na_value], dtype=data.dtype)}
  120. )
  121. e2 = pd.DataFrame(
  122. {"A": data._from_sequence([na_value] + list(b), dtype=data.dtype)}
  123. )
  124. tm.assert_frame_equal(r1, e1)
  125. tm.assert_frame_equal(r2, e2)
  126. def test_align_series_frame(self, data, na_value):
  127. # https://github.com/pandas-dev/pandas/issues/20576
  128. ser = pd.Series(data, name="a")
  129. df = pd.DataFrame({"col": np.arange(len(ser) + 1)})
  130. r1, r2 = ser.align(df)
  131. e1 = pd.Series(
  132. data._from_sequence(list(data) + [na_value], dtype=data.dtype),
  133. name=ser.name,
  134. )
  135. tm.assert_series_equal(r1, e1)
  136. tm.assert_frame_equal(r2, df)
  137. def test_set_frame_expand_regular_with_extension(self, data):
  138. df = pd.DataFrame({"A": [1] * len(data)})
  139. df["B"] = data
  140. expected = pd.DataFrame({"A": [1] * len(data), "B": data})
  141. tm.assert_frame_equal(df, expected)
  142. def test_set_frame_expand_extension_with_regular(self, data):
  143. df = pd.DataFrame({"A": data})
  144. df["B"] = [1] * len(data)
  145. expected = pd.DataFrame({"A": data, "B": [1] * len(data)})
  146. tm.assert_frame_equal(df, expected)
  147. def test_set_frame_overwrite_object(self, data):
  148. # https://github.com/pandas-dev/pandas/issues/20555
  149. df = pd.DataFrame({"A": [1] * len(data)}, dtype=object)
  150. df["A"] = data
  151. assert df.dtypes["A"] == data.dtype
  152. def test_merge(self, data, na_value):
  153. # GH-20743
  154. df1 = pd.DataFrame({"ext": data[:3], "int1": [1, 2, 3], "key": [0, 1, 2]})
  155. df2 = pd.DataFrame({"int2": [1, 2, 3, 4], "key": [0, 0, 1, 3]})
  156. res = pd.merge(df1, df2)
  157. exp = pd.DataFrame(
  158. {
  159. "int1": [1, 1, 2],
  160. "int2": [1, 2, 3],
  161. "key": [0, 0, 1],
  162. "ext": data._from_sequence(
  163. [data[0], data[0], data[1]], dtype=data.dtype
  164. ),
  165. }
  166. )
  167. tm.assert_frame_equal(res, exp[["ext", "int1", "key", "int2"]])
  168. res = pd.merge(df1, df2, how="outer")
  169. exp = pd.DataFrame(
  170. {
  171. "int1": [1, 1, 2, 3, np.nan],
  172. "int2": [1, 2, 3, np.nan, 4],
  173. "key": [0, 0, 1, 2, 3],
  174. "ext": data._from_sequence(
  175. [data[0], data[0], data[1], data[2], na_value], dtype=data.dtype
  176. ),
  177. }
  178. )
  179. tm.assert_frame_equal(res, exp[["ext", "int1", "key", "int2"]])
  180. def test_merge_on_extension_array(self, data):
  181. # GH 23020
  182. a, b = data[:2]
  183. key = type(data)._from_sequence([a, b], dtype=data.dtype)
  184. df = pd.DataFrame({"key": key, "val": [1, 2]})
  185. result = pd.merge(df, df, on="key")
  186. expected = pd.DataFrame({"key": key, "val_x": [1, 2], "val_y": [1, 2]})
  187. tm.assert_frame_equal(result, expected)
  188. # order
  189. result = pd.merge(df.iloc[[1, 0]], df, on="key")
  190. expected = expected.iloc[[1, 0]].reset_index(drop=True)
  191. tm.assert_frame_equal(result, expected)
  192. def test_merge_on_extension_array_duplicates(self, data):
  193. # GH 23020
  194. a, b = data[:2]
  195. key = type(data)._from_sequence([a, b, a], dtype=data.dtype)
  196. df1 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
  197. df2 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
  198. result = pd.merge(df1, df2, on="key")
  199. expected = pd.DataFrame(
  200. {
  201. "key": key.take([0, 0, 1, 2, 2]),
  202. "val_x": [1, 1, 2, 3, 3],
  203. "val_y": [1, 3, 2, 1, 3],
  204. }
  205. )
  206. tm.assert_frame_equal(result, expected)
  207. @pytest.mark.filterwarnings(
  208. "ignore:The previous implementation of stack is deprecated"
  209. )
  210. @pytest.mark.parametrize(
  211. "columns",
  212. [
  213. ["A", "B"],
  214. pd.MultiIndex.from_tuples(
  215. [("A", "a"), ("A", "b")], names=["outer", "inner"]
  216. ),
  217. ],
  218. )
  219. @pytest.mark.parametrize("future_stack", [True, False])
  220. def test_stack(self, data, columns, future_stack):
  221. df = pd.DataFrame({"A": data[:5], "B": data[:5]})
  222. df.columns = columns
  223. result = df.stack(future_stack=future_stack)
  224. expected = df.astype(object).stack(future_stack=future_stack)
  225. # we need a second astype(object), in case the constructor inferred
  226. # object -> specialized, as is done for period.
  227. expected = expected.astype(object)
  228. if isinstance(expected, pd.Series):
  229. assert result.dtype == df.iloc[:, 0].dtype
  230. else:
  231. assert all(result.dtypes == df.iloc[:, 0].dtype)
  232. result = result.astype(object)
  233. tm.assert_equal(result, expected)
  234. @pytest.mark.parametrize(
  235. "index",
  236. [
  237. # Two levels, uniform.
  238. pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]),
  239. # non-uniform
  240. pd.MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "b")]),
  241. # three levels, non-uniform
  242. pd.MultiIndex.from_product([("A", "B"), ("a", "b", "c"), (0, 1, 2)]),
  243. pd.MultiIndex.from_tuples(
  244. [
  245. ("A", "a", 1),
  246. ("A", "b", 0),
  247. ("A", "a", 0),
  248. ("B", "a", 0),
  249. ("B", "c", 1),
  250. ]
  251. ),
  252. ],
  253. )
  254. @pytest.mark.parametrize("obj", ["series", "frame"])
  255. def test_unstack(self, data, index, obj):
  256. data = data[: len(index)]
  257. if obj == "series":
  258. ser = pd.Series(data, index=index)
  259. else:
  260. ser = pd.DataFrame({"A": data, "B": data}, index=index)
  261. n = index.nlevels
  262. levels = list(range(n))
  263. # [0, 1, 2]
  264. # [(0,), (1,), (2,), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]
  265. combinations = itertools.chain.from_iterable(
  266. itertools.permutations(levels, i) for i in range(1, n)
  267. )
  268. for level in combinations:
  269. result = ser.unstack(level=level)
  270. assert all(
  271. isinstance(result[col].array, type(data)) for col in result.columns
  272. )
  273. if obj == "series":
  274. # We should get the same result with to_frame+unstack+droplevel
  275. df = ser.to_frame()
  276. alt = df.unstack(level=level).droplevel(0, axis=1)
  277. tm.assert_frame_equal(result, alt)
  278. obj_ser = ser.astype(object)
  279. expected = obj_ser.unstack(level=level, fill_value=data.dtype.na_value)
  280. if obj == "series":
  281. assert (expected.dtypes == object).all()
  282. result = result.astype(object)
  283. tm.assert_frame_equal(result, expected)
  284. def test_ravel(self, data):
  285. # as long as EA is 1D-only, ravel is a no-op
  286. result = data.ravel()
  287. assert type(result) == type(data)
  288. if data.dtype._is_immutable:
  289. pytest.skip(f"test_ravel assumes mutability and {data.dtype} is immutable")
  290. # Check that we have a view, not a copy
  291. result[0] = result[1]
  292. assert data[0] == data[1]
  293. def test_transpose(self, data):
  294. result = data.transpose()
  295. assert type(result) == type(data)
  296. # check we get a new object
  297. assert result is not data
  298. # If we ever _did_ support 2D, shape should be reversed
  299. assert result.shape == data.shape[::-1]
  300. if data.dtype._is_immutable:
  301. pytest.skip(
  302. f"test_transpose assumes mutability and {data.dtype} is immutable"
  303. )
  304. # Check that we have a view, not a copy
  305. result[0] = result[1]
  306. assert data[0] == data[1]
  307. def test_transpose_frame(self, data):
  308. df = pd.DataFrame({"A": data[:4], "B": data[:4]}, index=["a", "b", "c", "d"])
  309. result = df.T
  310. expected = pd.DataFrame(
  311. {
  312. "a": type(data)._from_sequence([data[0]] * 2, dtype=data.dtype),
  313. "b": type(data)._from_sequence([data[1]] * 2, dtype=data.dtype),
  314. "c": type(data)._from_sequence([data[2]] * 2, dtype=data.dtype),
  315. "d": type(data)._from_sequence([data[3]] * 2, dtype=data.dtype),
  316. },
  317. index=["A", "B"],
  318. )
  319. tm.assert_frame_equal(result, expected)
  320. tm.assert_frame_equal(np.transpose(np.transpose(df)), df)
  321. tm.assert_frame_equal(np.transpose(np.transpose(df[["A"]])), df[["A"]])