test_replace.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495
  1. import numpy as np
  2. import pytest
  3. from pandas.compat import WARNING_CHECK_DISABLED
  4. from pandas import (
  5. Categorical,
  6. DataFrame,
  7. option_context,
  8. )
  9. import pandas._testing as tm
  10. from pandas.tests.copy_view.util import get_array
  11. @pytest.mark.parametrize(
  12. "replace_kwargs",
  13. [
  14. {"to_replace": {"a": 1, "b": 4}, "value": -1},
  15. # Test CoW splits blocks to avoid copying unchanged columns
  16. {"to_replace": {"a": 1}, "value": -1},
  17. {"to_replace": {"b": 4}, "value": -1},
  18. {"to_replace": {"b": {4: 1}}},
  19. # TODO: Add these in a further optimization
  20. # We would need to see which columns got replaced in the mask
  21. # which could be expensive
  22. # {"to_replace": {"b": 1}},
  23. # 1
  24. ],
  25. )
  26. def test_replace(using_copy_on_write, replace_kwargs):
  27. df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
  28. df_orig = df.copy()
  29. df_replaced = df.replace(**replace_kwargs)
  30. if using_copy_on_write:
  31. if (df_replaced["b"] == df["b"]).all():
  32. assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
  33. assert tm.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
  34. # mutating squeezed df triggers a copy-on-write for that column/block
  35. df_replaced.loc[0, "c"] = -1
  36. if using_copy_on_write:
  37. assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
  38. if "a" in replace_kwargs["to_replace"]:
  39. arr = get_array(df_replaced, "a")
  40. df_replaced.loc[0, "a"] = 100
  41. assert np.shares_memory(get_array(df_replaced, "a"), arr)
  42. tm.assert_frame_equal(df, df_orig)
  43. def test_replace_regex_inplace_refs(using_copy_on_write, warn_copy_on_write):
  44. df = DataFrame({"a": ["aaa", "bbb"]})
  45. df_orig = df.copy()
  46. view = df[:]
  47. arr = get_array(df, "a")
  48. with tm.assert_cow_warning(warn_copy_on_write):
  49. df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
  50. if using_copy_on_write:
  51. assert not tm.shares_memory(arr, get_array(df, "a"))
  52. assert df._mgr._has_no_reference(0)
  53. tm.assert_frame_equal(view, df_orig)
  54. else:
  55. assert np.shares_memory(arr, get_array(df, "a"))
  56. def test_replace_regex_inplace(using_copy_on_write):
  57. df = DataFrame({"a": ["aaa", "bbb"]})
  58. arr = get_array(df, "a")
  59. df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
  60. if using_copy_on_write:
  61. assert df._mgr._has_no_reference(0)
  62. assert tm.shares_memory(arr, get_array(df, "a"))
  63. df_orig = df.copy()
  64. df2 = df.replace(to_replace=r"^b.*$", value="new", regex=True)
  65. tm.assert_frame_equal(df_orig, df)
  66. assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
  67. def test_replace_regex_inplace_no_op(using_copy_on_write):
  68. df = DataFrame({"a": [1, 2]})
  69. arr = get_array(df, "a")
  70. df.replace(to_replace=r"^a.$", value="new", inplace=True, regex=True)
  71. if using_copy_on_write:
  72. assert df._mgr._has_no_reference(0)
  73. assert np.shares_memory(arr, get_array(df, "a"))
  74. df_orig = df.copy()
  75. df2 = df.replace(to_replace=r"^x.$", value="new", regex=True)
  76. tm.assert_frame_equal(df_orig, df)
  77. if using_copy_on_write:
  78. assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
  79. else:
  80. assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
  81. def test_replace_mask_all_false_second_block(using_copy_on_write):
  82. df = DataFrame({"a": [1.5, 2, 3], "b": 100.5, "c": 1, "d": 2})
  83. df_orig = df.copy()
  84. df2 = df.replace(to_replace=1.5, value=55.5)
  85. if using_copy_on_write:
  86. # TODO: Block splitting would allow us to avoid copying b
  87. assert np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
  88. assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
  89. else:
  90. assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
  91. assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
  92. df2.loc[0, "c"] = 1
  93. tm.assert_frame_equal(df, df_orig) # Original is unchanged
  94. if using_copy_on_write:
  95. assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
  96. # TODO: This should split and not copy the whole block
  97. # assert np.shares_memory(get_array(df, "d"), get_array(df2, "d"))
  98. def test_replace_coerce_single_column(using_copy_on_write, using_array_manager):
  99. df = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
  100. df_orig = df.copy()
  101. df2 = df.replace(to_replace=1.5, value="a")
  102. if using_copy_on_write:
  103. assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
  104. assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
  105. elif not using_array_manager:
  106. assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
  107. assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
  108. if using_copy_on_write:
  109. df2.loc[0, "b"] = 0.5
  110. tm.assert_frame_equal(df, df_orig) # Original is unchanged
  111. assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
  112. def test_replace_to_replace_wrong_dtype(using_copy_on_write):
  113. df = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
  114. df_orig = df.copy()
  115. df2 = df.replace(to_replace="xxx", value=1.5)
  116. if using_copy_on_write:
  117. assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
  118. assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
  119. else:
  120. assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
  121. assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
  122. df2.loc[0, "b"] = 0.5
  123. tm.assert_frame_equal(df, df_orig) # Original is unchanged
  124. if using_copy_on_write:
  125. assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
  126. def test_replace_list_categorical(using_copy_on_write):
  127. df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
  128. arr = get_array(df, "a")
  129. msg = (
  130. r"The behavior of Series\.replace \(and DataFrame.replace\) "
  131. "with CategoricalDtype"
  132. )
  133. with tm.assert_produces_warning(FutureWarning, match=msg):
  134. df.replace(["c"], value="a", inplace=True)
  135. assert np.shares_memory(arr.codes, get_array(df, "a").codes)
  136. if using_copy_on_write:
  137. assert df._mgr._has_no_reference(0)
  138. df_orig = df.copy()
  139. with tm.assert_produces_warning(FutureWarning, match=msg):
  140. df2 = df.replace(["b"], value="a")
  141. assert not np.shares_memory(arr.codes, get_array(df2, "a").codes)
  142. tm.assert_frame_equal(df, df_orig)
  143. def test_replace_list_inplace_refs_categorical(using_copy_on_write):
  144. df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
  145. view = df[:]
  146. df_orig = df.copy()
  147. msg = (
  148. r"The behavior of Series\.replace \(and DataFrame.replace\) "
  149. "with CategoricalDtype"
  150. )
  151. with tm.assert_produces_warning(FutureWarning, match=msg):
  152. df.replace(["c"], value="a", inplace=True)
  153. if using_copy_on_write:
  154. assert not np.shares_memory(
  155. get_array(view, "a").codes, get_array(df, "a").codes
  156. )
  157. tm.assert_frame_equal(df_orig, view)
  158. else:
  159. # This could be inplace
  160. assert not np.shares_memory(
  161. get_array(view, "a").codes, get_array(df, "a").codes
  162. )
  163. @pytest.mark.parametrize("to_replace", [1.5, [1.5], []])
  164. def test_replace_inplace(using_copy_on_write, to_replace):
  165. df = DataFrame({"a": [1.5, 2, 3]})
  166. arr_a = get_array(df, "a")
  167. df.replace(to_replace=1.5, value=15.5, inplace=True)
  168. assert np.shares_memory(get_array(df, "a"), arr_a)
  169. if using_copy_on_write:
  170. assert df._mgr._has_no_reference(0)
  171. @pytest.mark.parametrize("to_replace", [1.5, [1.5]])
  172. def test_replace_inplace_reference(using_copy_on_write, to_replace, warn_copy_on_write):
  173. df = DataFrame({"a": [1.5, 2, 3]})
  174. arr_a = get_array(df, "a")
  175. view = df[:]
  176. with tm.assert_cow_warning(warn_copy_on_write):
  177. df.replace(to_replace=to_replace, value=15.5, inplace=True)
  178. if using_copy_on_write:
  179. assert not np.shares_memory(get_array(df, "a"), arr_a)
  180. assert df._mgr._has_no_reference(0)
  181. assert view._mgr._has_no_reference(0)
  182. else:
  183. assert np.shares_memory(get_array(df, "a"), arr_a)
  184. @pytest.mark.parametrize("to_replace", ["a", 100.5])
  185. def test_replace_inplace_reference_no_op(using_copy_on_write, to_replace):
  186. df = DataFrame({"a": [1.5, 2, 3]})
  187. arr_a = get_array(df, "a")
  188. view = df[:]
  189. df.replace(to_replace=to_replace, value=15.5, inplace=True)
  190. assert np.shares_memory(get_array(df, "a"), arr_a)
  191. if using_copy_on_write:
  192. assert not df._mgr._has_no_reference(0)
  193. assert not view._mgr._has_no_reference(0)
  194. @pytest.mark.parametrize("to_replace", [1, [1]])
  195. @pytest.mark.parametrize("val", [1, 1.5])
  196. def test_replace_categorical_inplace_reference(using_copy_on_write, val, to_replace):
  197. df = DataFrame({"a": Categorical([1, 2, 3])})
  198. df_orig = df.copy()
  199. arr_a = get_array(df, "a")
  200. view = df[:]
  201. msg = (
  202. r"The behavior of Series\.replace \(and DataFrame.replace\) "
  203. "with CategoricalDtype"
  204. )
  205. warn = FutureWarning if val == 1.5 else None
  206. with tm.assert_produces_warning(warn, match=msg):
  207. df.replace(to_replace=to_replace, value=val, inplace=True)
  208. if using_copy_on_write:
  209. assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes)
  210. assert df._mgr._has_no_reference(0)
  211. assert view._mgr._has_no_reference(0)
  212. tm.assert_frame_equal(view, df_orig)
  213. else:
  214. assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)
  215. @pytest.mark.parametrize("val", [1, 1.5])
  216. def test_replace_categorical_inplace(using_copy_on_write, val):
  217. df = DataFrame({"a": Categorical([1, 2, 3])})
  218. arr_a = get_array(df, "a")
  219. msg = (
  220. r"The behavior of Series\.replace \(and DataFrame.replace\) "
  221. "with CategoricalDtype"
  222. )
  223. warn = FutureWarning if val == 1.5 else None
  224. with tm.assert_produces_warning(warn, match=msg):
  225. df.replace(to_replace=1, value=val, inplace=True)
  226. assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)
  227. if using_copy_on_write:
  228. assert df._mgr._has_no_reference(0)
  229. expected = DataFrame({"a": Categorical([val, 2, 3])})
  230. tm.assert_frame_equal(df, expected)
  231. @pytest.mark.parametrize("val", [1, 1.5])
  232. def test_replace_categorical(using_copy_on_write, val):
  233. df = DataFrame({"a": Categorical([1, 2, 3])})
  234. df_orig = df.copy()
  235. msg = (
  236. r"The behavior of Series\.replace \(and DataFrame.replace\) "
  237. "with CategoricalDtype"
  238. )
  239. warn = FutureWarning if val == 1.5 else None
  240. with tm.assert_produces_warning(warn, match=msg):
  241. df2 = df.replace(to_replace=1, value=val)
  242. if using_copy_on_write:
  243. assert df._mgr._has_no_reference(0)
  244. assert df2._mgr._has_no_reference(0)
  245. assert not np.shares_memory(get_array(df, "a").codes, get_array(df2, "a").codes)
  246. tm.assert_frame_equal(df, df_orig)
  247. arr_a = get_array(df2, "a").codes
  248. df2.iloc[0, 0] = 2.0
  249. assert np.shares_memory(get_array(df2, "a").codes, arr_a)
  250. @pytest.mark.parametrize("method", ["where", "mask"])
  251. def test_masking_inplace(using_copy_on_write, method, warn_copy_on_write):
  252. df = DataFrame({"a": [1.5, 2, 3]})
  253. df_orig = df.copy()
  254. arr_a = get_array(df, "a")
  255. view = df[:]
  256. method = getattr(df, method)
  257. if warn_copy_on_write:
  258. with tm.assert_cow_warning():
  259. method(df["a"] > 1.6, -1, inplace=True)
  260. else:
  261. method(df["a"] > 1.6, -1, inplace=True)
  262. if using_copy_on_write:
  263. assert not np.shares_memory(get_array(df, "a"), arr_a)
  264. assert df._mgr._has_no_reference(0)
  265. assert view._mgr._has_no_reference(0)
  266. tm.assert_frame_equal(view, df_orig)
  267. else:
  268. assert np.shares_memory(get_array(df, "a"), arr_a)
  269. def test_replace_empty_list(using_copy_on_write):
  270. df = DataFrame({"a": [1, 2]})
  271. df2 = df.replace([], [])
  272. if using_copy_on_write:
  273. assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
  274. assert not df._mgr._has_no_reference(0)
  275. else:
  276. assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
  277. arr_a = get_array(df, "a")
  278. df.replace([], [])
  279. if using_copy_on_write:
  280. assert np.shares_memory(get_array(df, "a"), arr_a)
  281. assert not df._mgr._has_no_reference(0)
  282. assert not df2._mgr._has_no_reference(0)
  283. @pytest.mark.parametrize("value", ["d", None])
  284. def test_replace_object_list_inplace(using_copy_on_write, value):
  285. df = DataFrame({"a": ["a", "b", "c"]}, dtype=object)
  286. arr = get_array(df, "a")
  287. df.replace(["c"], value, inplace=True)
  288. if using_copy_on_write or value is None:
  289. assert tm.shares_memory(arr, get_array(df, "a"))
  290. else:
  291. # This could be inplace
  292. assert not np.shares_memory(arr, get_array(df, "a"))
  293. if using_copy_on_write:
  294. assert df._mgr._has_no_reference(0)
  295. def test_replace_list_multiple_elements_inplace(using_copy_on_write):
  296. df = DataFrame({"a": [1, 2, 3]})
  297. arr = get_array(df, "a")
  298. df.replace([1, 2], 4, inplace=True)
  299. if using_copy_on_write:
  300. assert np.shares_memory(arr, get_array(df, "a"))
  301. assert df._mgr._has_no_reference(0)
  302. else:
  303. assert np.shares_memory(arr, get_array(df, "a"))
  304. def test_replace_list_none(using_copy_on_write):
  305. df = DataFrame({"a": ["a", "b", "c"]})
  306. df_orig = df.copy()
  307. df2 = df.replace(["b"], value=None)
  308. tm.assert_frame_equal(df, df_orig)
  309. assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
  310. # replace multiple values that don't actually replace anything with None
  311. # https://github.com/pandas-dev/pandas/issues/59770
  312. df3 = df.replace(["d", "e", "f"], value=None)
  313. tm.assert_frame_equal(df3, df_orig)
  314. if using_copy_on_write:
  315. assert tm.shares_memory(get_array(df, "a"), get_array(df3, "a"))
  316. else:
  317. assert not tm.shares_memory(get_array(df, "a"), get_array(df3, "a"))
  318. def test_replace_list_none_inplace_refs(using_copy_on_write, warn_copy_on_write):
  319. df = DataFrame({"a": ["a", "b", "c"]})
  320. arr = get_array(df, "a")
  321. df_orig = df.copy()
  322. view = df[:]
  323. with tm.assert_cow_warning(warn_copy_on_write):
  324. df.replace(["a"], value=None, inplace=True)
  325. if using_copy_on_write:
  326. assert df._mgr._has_no_reference(0)
  327. assert not np.shares_memory(arr, get_array(df, "a"))
  328. tm.assert_frame_equal(df_orig, view)
  329. else:
  330. assert np.shares_memory(arr, get_array(df, "a"))
  331. def test_replace_columnwise_no_op_inplace(using_copy_on_write):
  332. df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
  333. view = df[:]
  334. df_orig = df.copy()
  335. df.replace({"a": 10}, 100, inplace=True)
  336. if using_copy_on_write:
  337. assert np.shares_memory(get_array(view, "a"), get_array(df, "a"))
  338. df.iloc[0, 0] = 100
  339. tm.assert_frame_equal(view, df_orig)
  340. def test_replace_columnwise_no_op(using_copy_on_write):
  341. df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
  342. df_orig = df.copy()
  343. df2 = df.replace({"a": 10}, 100)
  344. if using_copy_on_write:
  345. assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
  346. df2.iloc[0, 0] = 100
  347. tm.assert_frame_equal(df, df_orig)
  348. def test_replace_chained_assignment(using_copy_on_write):
  349. df = DataFrame({"a": [1, np.nan, 2], "b": 1})
  350. df_orig = df.copy()
  351. if using_copy_on_write:
  352. with tm.raises_chained_assignment_error():
  353. df["a"].replace(1, 100, inplace=True)
  354. tm.assert_frame_equal(df, df_orig)
  355. with tm.raises_chained_assignment_error():
  356. df[["a"]].replace(1, 100, inplace=True)
  357. tm.assert_frame_equal(df, df_orig)
  358. else:
  359. with tm.assert_produces_warning(None):
  360. with option_context("mode.chained_assignment", None):
  361. df[["a"]].replace(1, 100, inplace=True)
  362. with tm.assert_produces_warning(None):
  363. with option_context("mode.chained_assignment", None):
  364. df[df.a > 5].replace(1, 100, inplace=True)
  365. with tm.assert_produces_warning(
  366. FutureWarning if not WARNING_CHECK_DISABLED else None,
  367. match="inplace method",
  368. ):
  369. df["a"].replace(1, 100, inplace=True)
  370. def test_replace_listlike(using_copy_on_write):
  371. df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
  372. df_orig = df.copy()
  373. result = df.replace([200, 201], [11, 11])
  374. if using_copy_on_write:
  375. assert np.shares_memory(get_array(result, "a"), get_array(df, "a"))
  376. else:
  377. assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
  378. result.iloc[0, 0] = 100
  379. tm.assert_frame_equal(df, df)
  380. result = df.replace([200, 2], [10, 10])
  381. assert not np.shares_memory(get_array(df, "a"), get_array(result, "a"))
  382. tm.assert_frame_equal(df, df_orig)
  383. def test_replace_listlike_inplace(using_copy_on_write, warn_copy_on_write):
  384. df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
  385. arr = get_array(df, "a")
  386. df.replace([200, 2], [10, 11], inplace=True)
  387. assert np.shares_memory(get_array(df, "a"), arr)
  388. view = df[:]
  389. df_orig = df.copy()
  390. with tm.assert_cow_warning(warn_copy_on_write):
  391. df.replace([200, 3], [10, 11], inplace=True)
  392. if using_copy_on_write:
  393. assert not np.shares_memory(get_array(df, "a"), arr)
  394. tm.assert_frame_equal(view, df_orig)
  395. else:
  396. assert np.shares_memory(get_array(df, "a"), arr)
  397. tm.assert_frame_equal(df, view)