test_functions.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397
import numpy as np
import pytest

from pandas import (
    DataFrame,
    Index,
    Series,
    concat,
    merge,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
  12. def test_concat_frames(using_copy_on_write):
  13. df = DataFrame({"b": ["a"] * 3}, dtype=object)
  14. df2 = DataFrame({"a": ["a"] * 3}, dtype=object)
  15. df_orig = df.copy()
  16. result = concat([df, df2], axis=1)
  17. if using_copy_on_write:
  18. assert np.shares_memory(get_array(result, "b"), get_array(df, "b"))
  19. assert np.shares_memory(get_array(result, "a"), get_array(df2, "a"))
  20. else:
  21. assert not np.shares_memory(get_array(result, "b"), get_array(df, "b"))
  22. assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a"))
  23. result.iloc[0, 0] = "d"
  24. if using_copy_on_write:
  25. assert not np.shares_memory(get_array(result, "b"), get_array(df, "b"))
  26. assert np.shares_memory(get_array(result, "a"), get_array(df2, "a"))
  27. result.iloc[0, 1] = "d"
  28. if using_copy_on_write:
  29. assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a"))
  30. tm.assert_frame_equal(df, df_orig)
  31. def test_concat_frames_updating_input(using_copy_on_write):
  32. df = DataFrame({"b": ["a"] * 3}, dtype=object)
  33. df2 = DataFrame({"a": ["a"] * 3}, dtype=object)
  34. result = concat([df, df2], axis=1)
  35. if using_copy_on_write:
  36. assert np.shares_memory(get_array(result, "b"), get_array(df, "b"))
  37. assert np.shares_memory(get_array(result, "a"), get_array(df2, "a"))
  38. else:
  39. assert not np.shares_memory(get_array(result, "b"), get_array(df, "b"))
  40. assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a"))
  41. expected = result.copy()
  42. df.iloc[0, 0] = "d"
  43. if using_copy_on_write:
  44. assert not np.shares_memory(get_array(result, "b"), get_array(df, "b"))
  45. assert np.shares_memory(get_array(result, "a"), get_array(df2, "a"))
  46. df2.iloc[0, 0] = "d"
  47. if using_copy_on_write:
  48. assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a"))
  49. tm.assert_frame_equal(result, expected)
  50. def test_concat_series(using_copy_on_write):
  51. ser = Series([1, 2], name="a")
  52. ser2 = Series([3, 4], name="b")
  53. ser_orig = ser.copy()
  54. ser2_orig = ser2.copy()
  55. result = concat([ser, ser2], axis=1)
  56. if using_copy_on_write:
  57. assert np.shares_memory(get_array(result, "a"), ser.values)
  58. assert np.shares_memory(get_array(result, "b"), ser2.values)
  59. else:
  60. assert not np.shares_memory(get_array(result, "a"), ser.values)
  61. assert not np.shares_memory(get_array(result, "b"), ser2.values)
  62. result.iloc[0, 0] = 100
  63. if using_copy_on_write:
  64. assert not np.shares_memory(get_array(result, "a"), ser.values)
  65. assert np.shares_memory(get_array(result, "b"), ser2.values)
  66. result.iloc[0, 1] = 1000
  67. if using_copy_on_write:
  68. assert not np.shares_memory(get_array(result, "b"), ser2.values)
  69. tm.assert_series_equal(ser, ser_orig)
  70. tm.assert_series_equal(ser2, ser2_orig)
  71. def test_concat_frames_chained(using_copy_on_write):
  72. df1 = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
  73. df2 = DataFrame({"c": [4, 5, 6]})
  74. df3 = DataFrame({"d": [4, 5, 6]})
  75. result = concat([concat([df1, df2], axis=1), df3], axis=1)
  76. expected = result.copy()
  77. if using_copy_on_write:
  78. assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
  79. assert np.shares_memory(get_array(result, "c"), get_array(df2, "c"))
  80. assert np.shares_memory(get_array(result, "d"), get_array(df3, "d"))
  81. else:
  82. assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
  83. assert not np.shares_memory(get_array(result, "c"), get_array(df2, "c"))
  84. assert not np.shares_memory(get_array(result, "d"), get_array(df3, "d"))
  85. df1.iloc[0, 0] = 100
  86. if using_copy_on_write:
  87. assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
  88. tm.assert_frame_equal(result, expected)
  89. def test_concat_series_chained(using_copy_on_write):
  90. ser1 = Series([1, 2, 3], name="a")
  91. ser2 = Series([4, 5, 6], name="c")
  92. ser3 = Series([4, 5, 6], name="d")
  93. result = concat([concat([ser1, ser2], axis=1), ser3], axis=1)
  94. expected = result.copy()
  95. if using_copy_on_write:
  96. assert np.shares_memory(get_array(result, "a"), get_array(ser1, "a"))
  97. assert np.shares_memory(get_array(result, "c"), get_array(ser2, "c"))
  98. assert np.shares_memory(get_array(result, "d"), get_array(ser3, "d"))
  99. else:
  100. assert not np.shares_memory(get_array(result, "a"), get_array(ser1, "a"))
  101. assert not np.shares_memory(get_array(result, "c"), get_array(ser2, "c"))
  102. assert not np.shares_memory(get_array(result, "d"), get_array(ser3, "d"))
  103. ser1.iloc[0] = 100
  104. if using_copy_on_write:
  105. assert not np.shares_memory(get_array(result, "a"), get_array(ser1, "a"))
  106. tm.assert_frame_equal(result, expected)
  107. def test_concat_series_updating_input(using_copy_on_write):
  108. ser = Series([1, 2], name="a")
  109. ser2 = Series([3, 4], name="b")
  110. expected = DataFrame({"a": [1, 2], "b": [3, 4]})
  111. result = concat([ser, ser2], axis=1)
  112. if using_copy_on_write:
  113. assert np.shares_memory(get_array(result, "a"), get_array(ser, "a"))
  114. assert np.shares_memory(get_array(result, "b"), get_array(ser2, "b"))
  115. else:
  116. assert not np.shares_memory(get_array(result, "a"), get_array(ser, "a"))
  117. assert not np.shares_memory(get_array(result, "b"), get_array(ser2, "b"))
  118. ser.iloc[0] = 100
  119. if using_copy_on_write:
  120. assert not np.shares_memory(get_array(result, "a"), get_array(ser, "a"))
  121. assert np.shares_memory(get_array(result, "b"), get_array(ser2, "b"))
  122. tm.assert_frame_equal(result, expected)
  123. ser2.iloc[0] = 1000
  124. if using_copy_on_write:
  125. assert not np.shares_memory(get_array(result, "b"), get_array(ser2, "b"))
  126. tm.assert_frame_equal(result, expected)
  127. def test_concat_mixed_series_frame(using_copy_on_write):
  128. df = DataFrame({"a": [1, 2, 3], "c": 1})
  129. ser = Series([4, 5, 6], name="d")
  130. result = concat([df, ser], axis=1)
  131. expected = result.copy()
  132. if using_copy_on_write:
  133. assert np.shares_memory(get_array(result, "a"), get_array(df, "a"))
  134. assert np.shares_memory(get_array(result, "c"), get_array(df, "c"))
  135. assert np.shares_memory(get_array(result, "d"), get_array(ser, "d"))
  136. else:
  137. assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
  138. assert not np.shares_memory(get_array(result, "c"), get_array(df, "c"))
  139. assert not np.shares_memory(get_array(result, "d"), get_array(ser, "d"))
  140. ser.iloc[0] = 100
  141. if using_copy_on_write:
  142. assert not np.shares_memory(get_array(result, "d"), get_array(ser, "d"))
  143. df.iloc[0, 0] = 100
  144. if using_copy_on_write:
  145. assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
  146. tm.assert_frame_equal(result, expected)
  147. @pytest.mark.parametrize("copy", [True, None, False])
  148. def test_concat_copy_keyword(using_copy_on_write, copy):
  149. df = DataFrame({"a": [1, 2]})
  150. df2 = DataFrame({"b": [1.5, 2.5]})
  151. result = concat([df, df2], axis=1, copy=copy)
  152. if using_copy_on_write or copy is False:
  153. assert np.shares_memory(get_array(df, "a"), get_array(result, "a"))
  154. assert np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
  155. else:
  156. assert not np.shares_memory(get_array(df, "a"), get_array(result, "a"))
  157. assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
  158. @pytest.mark.parametrize(
  159. "func",
  160. [
  161. lambda df1, df2, **kwargs: df1.merge(df2, **kwargs),
  162. lambda df1, df2, **kwargs: merge(df1, df2, **kwargs),
  163. ],
  164. )
  165. def test_merge_on_key(using_copy_on_write, func):
  166. df1 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "a": [1, 2, 3]})
  167. df2 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "b": [4, 5, 6]})
  168. df1_orig = df1.copy()
  169. df2_orig = df2.copy()
  170. result = func(df1, df2, on="key")
  171. if using_copy_on_write:
  172. assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
  173. assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
  174. assert np.shares_memory(get_array(result, "key"), get_array(df1, "key"))
  175. assert not np.shares_memory(get_array(result, "key"), get_array(df2, "key"))
  176. else:
  177. assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
  178. assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
  179. result.iloc[0, 1] = 0
  180. if using_copy_on_write:
  181. assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
  182. assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
  183. result.iloc[0, 2] = 0
  184. if using_copy_on_write:
  185. assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
  186. tm.assert_frame_equal(df1, df1_orig)
  187. tm.assert_frame_equal(df2, df2_orig)
  188. def test_merge_on_index(using_copy_on_write):
  189. df1 = DataFrame({"a": [1, 2, 3]})
  190. df2 = DataFrame({"b": [4, 5, 6]})
  191. df1_orig = df1.copy()
  192. df2_orig = df2.copy()
  193. result = merge(df1, df2, left_index=True, right_index=True)
  194. if using_copy_on_write:
  195. assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
  196. assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
  197. else:
  198. assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
  199. assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
  200. result.iloc[0, 0] = 0
  201. if using_copy_on_write:
  202. assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
  203. assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
  204. result.iloc[0, 1] = 0
  205. if using_copy_on_write:
  206. assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
  207. tm.assert_frame_equal(df1, df1_orig)
  208. tm.assert_frame_equal(df2, df2_orig)
  209. @pytest.mark.parametrize(
  210. "func, how",
  211. [
  212. (lambda df1, df2, **kwargs: merge(df2, df1, on="key", **kwargs), "right"),
  213. (lambda df1, df2, **kwargs: merge(df1, df2, on="key", **kwargs), "left"),
  214. ],
  215. )
  216. def test_merge_on_key_enlarging_one(using_copy_on_write, func, how):
  217. df1 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "a": [1, 2, 3]})
  218. df2 = DataFrame({"key": Series(["a", "b"], dtype=object), "b": [4, 5]})
  219. df1_orig = df1.copy()
  220. df2_orig = df2.copy()
  221. result = func(df1, df2, how=how)
  222. if using_copy_on_write:
  223. assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
  224. assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
  225. assert df2._mgr._has_no_reference(1)
  226. assert df2._mgr._has_no_reference(0)
  227. assert np.shares_memory(get_array(result, "key"), get_array(df1, "key")) is (
  228. how == "left"
  229. )
  230. assert not np.shares_memory(get_array(result, "key"), get_array(df2, "key"))
  231. else:
  232. assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
  233. assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
  234. if how == "left":
  235. result.iloc[0, 1] = 0
  236. else:
  237. result.iloc[0, 2] = 0
  238. if using_copy_on_write:
  239. assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
  240. tm.assert_frame_equal(df1, df1_orig)
  241. tm.assert_frame_equal(df2, df2_orig)
  242. @pytest.mark.parametrize("copy", [True, None, False])
  243. def test_merge_copy_keyword(using_copy_on_write, copy):
  244. df = DataFrame({"a": [1, 2]})
  245. df2 = DataFrame({"b": [3, 4.5]})
  246. result = df.merge(df2, copy=copy, left_index=True, right_index=True)
  247. if using_copy_on_write or copy is False:
  248. assert np.shares_memory(get_array(df, "a"), get_array(result, "a"))
  249. assert np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
  250. else:
  251. assert not np.shares_memory(get_array(df, "a"), get_array(result, "a"))
  252. assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
  253. @pytest.mark.parametrize("dtype", [object, "str"])
  254. def test_join_on_key(dtype, using_copy_on_write):
  255. df_index = Index(["a", "b", "c"], name="key", dtype=dtype)
  256. df1 = DataFrame({"a": [1, 2, 3]}, index=df_index.copy(deep=True))
  257. df2 = DataFrame({"b": [4, 5, 6]}, index=df_index.copy(deep=True))
  258. df1_orig = df1.copy()
  259. df2_orig = df2.copy()
  260. result = df1.join(df2, on="key")
  261. if using_copy_on_write:
  262. assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
  263. assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
  264. assert tm.shares_memory(get_array(result.index), get_array(df1.index))
  265. assert not np.shares_memory(get_array(result.index), get_array(df2.index))
  266. else:
  267. assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
  268. assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
  269. result.iloc[0, 0] = 0
  270. if using_copy_on_write:
  271. assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
  272. assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
  273. result.iloc[0, 1] = 0
  274. if using_copy_on_write:
  275. assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
  276. tm.assert_frame_equal(df1, df1_orig)
  277. tm.assert_frame_equal(df2, df2_orig)
  278. def test_join_multiple_dataframes_on_key(using_copy_on_write):
  279. df_index = Index(["a", "b", "c"], name="key", dtype=object)
  280. df1 = DataFrame({"a": [1, 2, 3]}, index=df_index.copy(deep=True))
  281. dfs_list = [
  282. DataFrame({"b": [4, 5, 6]}, index=df_index.copy(deep=True)),
  283. DataFrame({"c": [7, 8, 9]}, index=df_index.copy(deep=True)),
  284. ]
  285. df1_orig = df1.copy()
  286. dfs_list_orig = [df.copy() for df in dfs_list]
  287. result = df1.join(dfs_list)
  288. if using_copy_on_write:
  289. assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
  290. assert np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b"))
  291. assert np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c"))
  292. assert np.shares_memory(get_array(result.index), get_array(df1.index))
  293. assert not np.shares_memory(
  294. get_array(result.index), get_array(dfs_list[0].index)
  295. )
  296. assert not np.shares_memory(
  297. get_array(result.index), get_array(dfs_list[1].index)
  298. )
  299. else:
  300. assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
  301. assert not np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b"))
  302. assert not np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c"))
  303. result.iloc[0, 0] = 0
  304. if using_copy_on_write:
  305. assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
  306. assert np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b"))
  307. assert np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c"))
  308. result.iloc[0, 1] = 0
  309. if using_copy_on_write:
  310. assert not np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b"))
  311. assert np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c"))
  312. result.iloc[0, 2] = 0
  313. if using_copy_on_write:
  314. assert not np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c"))
  315. tm.assert_frame_equal(df1, df1_orig)
  316. for df, df_orig in zip(dfs_list, dfs_list_orig):
  317. tm.assert_frame_equal(df, df_orig)