test_block_internals.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453
  1. from datetime import (
  2. datetime,
  3. timedelta,
  4. )
  5. import itertools
  6. import numpy as np
  7. import pytest
  8. from pandas.compat import WARNING_CHECK_DISABLED
  9. from pandas.errors import PerformanceWarning
  10. import pandas.util._test_decorators as td
  11. import pandas as pd
  12. from pandas import (
  13. Categorical,
  14. DataFrame,
  15. Series,
  16. Timestamp,
  17. date_range,
  18. option_context,
  19. )
  20. import pandas._testing as tm
  21. from pandas.core.internals.blocks import NumpyBlock
# Segregated collection of methods that require the BlockManager internal data
# structure.
# TODO(ArrayManager): check which of these tests need to be rewritten to test
# the equivalent for ArrayManager.

# Module-level pytest mark: pytest applies ``pytestmark`` to every test
# collected from this module.
pytestmark = td.skip_array_manager_invalid_test
  27. class TestDataFrameBlockInternals:
  28. def test_setitem_invalidates_datetime_index_freq(self):
  29. # GH#24096 altering a datetime64tz column inplace invalidates the
  30. # `freq` attribute on the underlying DatetimeIndex
  31. dti = date_range("20130101", periods=3, tz="US/Eastern")
  32. ts = dti[1]
  33. df = DataFrame({"B": dti})
  34. assert df["B"]._values.freq is None
  35. df.iloc[1, 0] = pd.NaT
  36. assert df["B"]._values.freq is None
  37. # check that the DatetimeIndex was not altered in place
  38. assert dti.freq == "D"
  39. assert dti[1] == ts
  40. def test_cast_internals(self, float_frame):
  41. msg = "Passing a BlockManager to DataFrame"
  42. with tm.assert_produces_warning(
  43. DeprecationWarning, match=msg, check_stacklevel=False
  44. ):
  45. casted = DataFrame(float_frame._mgr, dtype=int)
  46. expected = DataFrame(float_frame._series, dtype=int)
  47. tm.assert_frame_equal(casted, expected)
  48. with tm.assert_produces_warning(
  49. DeprecationWarning, match=msg, check_stacklevel=False
  50. ):
  51. casted = DataFrame(float_frame._mgr, dtype=np.int32)
  52. expected = DataFrame(float_frame._series, dtype=np.int32)
  53. tm.assert_frame_equal(casted, expected)
  54. def test_consolidate(self, float_frame):
  55. float_frame["E"] = 7.0
  56. consolidated = float_frame._consolidate()
  57. assert len(consolidated._mgr.blocks) == 1
  58. # Ensure copy, do I want this?
  59. recons = consolidated._consolidate()
  60. assert recons is not consolidated
  61. tm.assert_frame_equal(recons, consolidated)
  62. float_frame["F"] = 8.0
  63. assert len(float_frame._mgr.blocks) == 3
  64. return_value = float_frame._consolidate_inplace()
  65. assert return_value is None
  66. assert len(float_frame._mgr.blocks) == 1
  67. def test_consolidate_inplace(self, float_frame):
  68. # triggers in-place consolidation
  69. for letter in range(ord("A"), ord("Z")):
  70. float_frame[chr(letter)] = chr(letter)
  71. def test_modify_values(self, float_frame, using_copy_on_write):
  72. if using_copy_on_write:
  73. with pytest.raises(ValueError, match="read-only"):
  74. float_frame.values[5] = 5
  75. assert (float_frame.values[5] != 5).all()
  76. return
  77. float_frame.values[5] = 5
  78. assert (float_frame.values[5] == 5).all()
  79. # unconsolidated
  80. float_frame["E"] = 7.0
  81. col = float_frame["E"]
  82. float_frame.values[6] = 6
  83. # as of 2.0 .values does not consolidate, so subsequent calls to .values
  84. # does not share data
  85. assert not (float_frame.values[6] == 6).all()
  86. assert (col == 7).all()
  87. def test_boolean_set_uncons(self, float_frame):
  88. float_frame["E"] = 7.0
  89. expected = float_frame.values.copy()
  90. expected[expected > 1] = 2
  91. float_frame[float_frame > 1] = 2
  92. tm.assert_almost_equal(expected, float_frame.values)
  93. def test_constructor_with_convert(self):
  94. # this is actually mostly a test of lib.maybe_convert_objects
  95. # #2845
  96. df = DataFrame({"A": [2**63 - 1]})
  97. result = df["A"]
  98. expected = Series(np.asarray([2**63 - 1], np.int64), name="A")
  99. tm.assert_series_equal(result, expected)
  100. df = DataFrame({"A": [2**63]})
  101. result = df["A"]
  102. expected = Series(np.asarray([2**63], np.uint64), name="A")
  103. tm.assert_series_equal(result, expected)
  104. df = DataFrame({"A": [datetime(2005, 1, 1), True]})
  105. result = df["A"]
  106. expected = Series(
  107. np.asarray([datetime(2005, 1, 1), True], np.object_), name="A"
  108. )
  109. tm.assert_series_equal(result, expected)
  110. df = DataFrame({"A": [None, 1]})
  111. result = df["A"]
  112. expected = Series(np.asarray([np.nan, 1], np.float64), name="A")
  113. tm.assert_series_equal(result, expected)
  114. df = DataFrame({"A": [1.0, 2]})
  115. result = df["A"]
  116. expected = Series(np.asarray([1.0, 2], np.float64), name="A")
  117. tm.assert_series_equal(result, expected)
  118. df = DataFrame({"A": [1.0 + 2.0j, 3]})
  119. result = df["A"]
  120. expected = Series(np.asarray([1.0 + 2.0j, 3], np.complex128), name="A")
  121. tm.assert_series_equal(result, expected)
  122. df = DataFrame({"A": [1.0 + 2.0j, 3.0]})
  123. result = df["A"]
  124. expected = Series(np.asarray([1.0 + 2.0j, 3.0], np.complex128), name="A")
  125. tm.assert_series_equal(result, expected)
  126. df = DataFrame({"A": [1.0 + 2.0j, True]})
  127. result = df["A"]
  128. expected = Series(np.asarray([1.0 + 2.0j, True], np.object_), name="A")
  129. tm.assert_series_equal(result, expected)
  130. df = DataFrame({"A": [1.0, None]})
  131. result = df["A"]
  132. expected = Series(np.asarray([1.0, np.nan], np.float64), name="A")
  133. tm.assert_series_equal(result, expected)
  134. df = DataFrame({"A": [1.0 + 2.0j, None]})
  135. result = df["A"]
  136. expected = Series(np.asarray([1.0 + 2.0j, np.nan], np.complex128), name="A")
  137. tm.assert_series_equal(result, expected)
  138. df = DataFrame({"A": [2.0, 1, True, None]})
  139. result = df["A"]
  140. expected = Series(np.asarray([2.0, 1, True, None], np.object_), name="A")
  141. tm.assert_series_equal(result, expected)
  142. df = DataFrame({"A": [2.0, 1, datetime(2006, 1, 1), None]})
  143. result = df["A"]
  144. expected = Series(
  145. np.asarray([2.0, 1, datetime(2006, 1, 1), None], np.object_), name="A"
  146. )
  147. tm.assert_series_equal(result, expected)
  148. def test_construction_with_mixed(self, float_string_frame, using_infer_string):
  149. # mixed-type frames
  150. float_string_frame["datetime"] = datetime.now()
  151. float_string_frame["timedelta"] = timedelta(days=1, seconds=1)
  152. assert float_string_frame["datetime"].dtype == "M8[us]"
  153. assert float_string_frame["timedelta"].dtype == "m8[us]"
  154. result = float_string_frame.dtypes
  155. expected = Series(
  156. [np.dtype("float64")] * 4
  157. + [
  158. np.dtype("object")
  159. if not using_infer_string
  160. else pd.StringDtype(na_value=np.nan),
  161. np.dtype("datetime64[us]"),
  162. np.dtype("timedelta64[us]"),
  163. ],
  164. index=list("ABCD") + ["foo", "datetime", "timedelta"],
  165. )
  166. tm.assert_series_equal(result, expected)
  167. def test_construction_with_conversions(self):
  168. # convert from a numpy array of non-ns timedelta64; as of 2.0 this does
  169. # *not* convert
  170. arr = np.array([1, 2, 3], dtype="timedelta64[s]")
  171. df = DataFrame({"A": arr})
  172. expected = DataFrame(
  173. {"A": pd.timedelta_range("00:00:01", periods=3, freq="s")}, index=range(3)
  174. )
  175. tm.assert_numpy_array_equal(df["A"].to_numpy(), arr)
  176. expected = DataFrame(
  177. {
  178. "dt1": Timestamp("20130101"),
  179. "dt2": date_range("20130101", periods=3).astype("M8[s]"),
  180. # 'dt3' : date_range('20130101 00:00:01',periods=3,freq='s'),
  181. # FIXME: don't leave commented-out
  182. },
  183. index=range(3),
  184. )
  185. assert expected.dtypes["dt1"] == "M8[s]"
  186. assert expected.dtypes["dt2"] == "M8[s]"
  187. dt1 = np.datetime64("2013-01-01")
  188. dt2 = np.array(
  189. ["2013-01-01", "2013-01-02", "2013-01-03"], dtype="datetime64[D]"
  190. )
  191. df = DataFrame({"dt1": dt1, "dt2": dt2})
  192. # df['dt3'] = np.array(['2013-01-01 00:00:01','2013-01-01
  193. # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]')
  194. # FIXME: don't leave commented-out
  195. tm.assert_frame_equal(df, expected)
  196. def test_constructor_compound_dtypes(self):
  197. # GH 5191
  198. # compound dtypes should raise not-implementederror
  199. def f(dtype):
  200. data = list(itertools.repeat((datetime(2001, 1, 1), "aa", 20), 9))
  201. return DataFrame(data=data, columns=["A", "B", "C"], dtype=dtype)
  202. msg = "compound dtypes are not implemented in the DataFrame constructor"
  203. with pytest.raises(NotImplementedError, match=msg):
  204. f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")])
  205. # pre-2.0 these used to work (though results may be unexpected)
  206. with pytest.raises(TypeError, match="argument must be"):
  207. f("int64")
  208. with pytest.raises(TypeError, match="argument must be"):
  209. f("float64")
  210. # 10822
  211. msg = "^Unknown datetime string format, unable to parse: aa, at position 0$"
  212. with pytest.raises(ValueError, match=msg):
  213. f("M8[ns]")
  214. def test_pickle(self, float_string_frame, timezone_frame):
  215. empty_frame = DataFrame()
  216. unpickled = tm.round_trip_pickle(float_string_frame)
  217. tm.assert_frame_equal(float_string_frame, unpickled)
  218. # buglet
  219. float_string_frame._mgr.ndim
  220. # empty
  221. unpickled = tm.round_trip_pickle(empty_frame)
  222. repr(unpickled)
  223. # tz frame
  224. unpickled = tm.round_trip_pickle(timezone_frame)
  225. tm.assert_frame_equal(timezone_frame, unpickled)
  226. def test_consolidate_datetime64(self):
  227. # numpy vstack bug
  228. df = DataFrame(
  229. {
  230. "starting": pd.to_datetime(
  231. [
  232. "2012-06-21 00:00",
  233. "2012-06-23 07:00",
  234. "2012-06-23 16:30",
  235. "2012-06-25 08:00",
  236. "2012-06-26 12:00",
  237. ]
  238. ),
  239. "ending": pd.to_datetime(
  240. [
  241. "2012-06-23 07:00",
  242. "2012-06-23 16:30",
  243. "2012-06-25 08:00",
  244. "2012-06-26 12:00",
  245. "2012-06-27 08:00",
  246. ]
  247. ),
  248. "measure": [77, 65, 77, 0, 77],
  249. }
  250. )
  251. ser_starting = df.starting
  252. ser_starting.index = ser_starting.values
  253. ser_starting = ser_starting.tz_localize("US/Eastern")
  254. ser_starting = ser_starting.tz_convert("UTC")
  255. ser_starting.index.name = "starting"
  256. ser_ending = df.ending
  257. ser_ending.index = ser_ending.values
  258. ser_ending = ser_ending.tz_localize("US/Eastern")
  259. ser_ending = ser_ending.tz_convert("UTC")
  260. ser_ending.index.name = "ending"
  261. df.starting = ser_starting.index
  262. df.ending = ser_ending.index
  263. tm.assert_index_equal(pd.DatetimeIndex(df.starting), ser_starting.index)
  264. tm.assert_index_equal(pd.DatetimeIndex(df.ending), ser_ending.index)
  265. def test_is_mixed_type(self, float_frame, float_string_frame):
  266. assert not float_frame._is_mixed_type
  267. assert float_string_frame._is_mixed_type
  268. def test_stale_cached_series_bug_473(self, using_copy_on_write, warn_copy_on_write):
  269. # this is chained, but ok
  270. with option_context("chained_assignment", None):
  271. Y = DataFrame(
  272. np.random.default_rng(2).random((4, 4)),
  273. index=("a", "b", "c", "d"),
  274. columns=("e", "f", "g", "h"),
  275. )
  276. repr(Y)
  277. Y["e"] = Y["e"].astype("object")
  278. with tm.raises_chained_assignment_error():
  279. Y["g"]["c"] = np.nan
  280. repr(Y)
  281. Y.sum()
  282. Y["g"].sum()
  283. if using_copy_on_write:
  284. assert not pd.isna(Y["g"]["c"])
  285. else:
  286. assert pd.isna(Y["g"]["c"])
  287. @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning")
  288. def test_strange_column_corruption_issue(self, using_copy_on_write):
  289. # TODO(wesm): Unclear how exactly this is related to internal matters
  290. df = DataFrame(index=[0, 1])
  291. df[0] = np.nan
  292. wasCol = {}
  293. with tm.assert_produces_warning(
  294. PerformanceWarning, raise_on_extra_warnings=False
  295. ):
  296. for i, dt in enumerate(df.index):
  297. for col in range(100, 200):
  298. if col not in wasCol:
  299. wasCol[col] = 1
  300. df[col] = np.nan
  301. if using_copy_on_write:
  302. df.loc[dt, col] = i
  303. else:
  304. df[col][dt] = i
  305. myid = 100
  306. first = len(df.loc[pd.isna(df[myid]), [myid]])
  307. second = len(df.loc[pd.isna(df[myid]), [myid]])
  308. assert first == second == 0
  309. def test_constructor_no_pandas_array(self):
  310. # Ensure that NumpyExtensionArray isn't allowed inside Series
  311. # See https://github.com/pandas-dev/pandas/issues/23995 for more.
  312. arr = Series([1, 2, 3]).array
  313. result = DataFrame({"A": arr})
  314. expected = DataFrame({"A": [1, 2, 3]})
  315. tm.assert_frame_equal(result, expected)
  316. assert isinstance(result._mgr.blocks[0], NumpyBlock)
  317. assert result._mgr.blocks[0].is_numeric
  318. def test_add_column_with_pandas_array(self):
  319. # GH 26390
  320. df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})
  321. df["c"] = pd.arrays.NumpyExtensionArray(np.array([1, 2, None, 3], dtype=object))
  322. df2 = DataFrame(
  323. {
  324. "a": [1, 2, 3, 4],
  325. "b": ["a", "b", "c", "d"],
  326. "c": pd.arrays.NumpyExtensionArray(
  327. np.array([1, 2, None, 3], dtype=object)
  328. ),
  329. }
  330. )
  331. assert type(df["c"]._mgr.blocks[0]) == NumpyBlock
  332. assert df["c"]._mgr.blocks[0].is_object
  333. assert type(df2["c"]._mgr.blocks[0]) == NumpyBlock
  334. assert df2["c"]._mgr.blocks[0].is_object
  335. tm.assert_frame_equal(df, df2)
  336. def test_update_inplace_sets_valid_block_values(using_copy_on_write):
  337. # https://github.com/pandas-dev/pandas/issues/33457
  338. df = DataFrame({"a": Series([1, 2, None], dtype="category")})
  339. # inplace update of a single column
  340. if using_copy_on_write:
  341. with tm.raises_chained_assignment_error():
  342. df["a"].fillna(1, inplace=True)
  343. else:
  344. with tm.assert_produces_warning(
  345. FutureWarning if not WARNING_CHECK_DISABLED else None,
  346. match="inplace method",
  347. ):
  348. df["a"].fillna(1, inplace=True)
  349. # check we haven't put a Series into any block.values
  350. assert isinstance(df._mgr.blocks[0].values, Categorical)
  351. if not using_copy_on_write:
  352. # smoketest for OP bug from GH#35731
  353. assert df.isnull().sum().sum() == 0
  354. def test_nonconsolidated_item_cache_take():
  355. # https://github.com/pandas-dev/pandas/issues/35521
  356. # create non-consolidated dataframe with object dtype columns
  357. df = DataFrame(
  358. {
  359. "col1": Series(["a"], dtype=object),
  360. }
  361. )
  362. df["col2"] = Series([0], dtype=object)
  363. assert not df._mgr.is_consolidated()
  364. # access column (item cache)
  365. df["col1"] == "A"
  366. # take operation
  367. # (regression was that this consolidated but didn't reset item cache,
  368. # resulting in an invalid cache and the .at operation not working properly)
  369. df[df["col2"] == 0]
  370. # now setting value should update actual dataframe
  371. df.at[0, "col1"] = "A"
  372. expected = DataFrame({"col1": ["A"], "col2": [0]}, dtype=object)
  373. tm.assert_frame_equal(df, expected)
  374. assert df.at[0, "col1"] == "A"