# test_diff.py (9.9 KB)
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. from pandas import (
  5. DataFrame,
  6. Series,
  7. Timestamp,
  8. date_range,
  9. )
  10. import pandas._testing as tm
  11. class TestDataFrameDiff:
  12. def test_diff_requires_integer(self):
  13. df = DataFrame(np.random.default_rng(2).standard_normal((2, 2)))
  14. with pytest.raises(ValueError, match="periods must be an integer"):
  15. df.diff(1.5)
  16. # GH#44572 np.int64 is accepted
  17. @pytest.mark.parametrize("num", [1, np.int64(1)])
  18. def test_diff(self, datetime_frame, num):
  19. df = datetime_frame
  20. the_diff = df.diff(num)
  21. expected = df["A"] - df["A"].shift(num)
  22. tm.assert_series_equal(the_diff["A"], expected)
  23. def test_diff_int_dtype(self):
  24. # int dtype
  25. a = 10_000_000_000_000_000
  26. b = a + 1
  27. ser = Series([a, b])
  28. rs = DataFrame({"s": ser}).diff()
  29. assert rs.s[1] == 1
  30. def test_diff_mixed_numeric(self, datetime_frame):
  31. # mixed numeric
  32. tf = datetime_frame.astype("float32")
  33. the_diff = tf.diff(1)
  34. tm.assert_series_equal(the_diff["A"], tf["A"] - tf["A"].shift(1))
  35. def test_diff_axis1_nonconsolidated(self):
  36. # GH#10907
  37. df = DataFrame({"y": Series([2]), "z": Series([3])})
  38. df.insert(0, "x", 1)
  39. result = df.diff(axis=1)
  40. expected = DataFrame({"x": np.nan, "y": Series(1), "z": Series(1)})
  41. tm.assert_frame_equal(result, expected)
  42. def test_diff_timedelta64_with_nat(self):
  43. # GH#32441
  44. arr = np.arange(6).reshape(3, 2).astype("timedelta64[ns]")
  45. arr[:, 0] = np.timedelta64("NaT", "ns")
  46. df = DataFrame(arr)
  47. result = df.diff(1, axis=0)
  48. expected = DataFrame({0: df[0], 1: [pd.NaT, pd.Timedelta(2), pd.Timedelta(2)]})
  49. tm.assert_equal(result, expected)
  50. result = df.diff(0)
  51. expected = df - df
  52. assert expected[0].isna().all()
  53. tm.assert_equal(result, expected)
  54. result = df.diff(-1, axis=1)
  55. expected = df * np.nan
  56. tm.assert_equal(result, expected)
  57. @pytest.mark.parametrize("tz", [None, "UTC"])
  58. def test_diff_datetime_axis0_with_nat(self, tz, unit):
  59. # GH#32441
  60. dti = pd.DatetimeIndex(["NaT", "2019-01-01", "2019-01-02"], tz=tz).as_unit(unit)
  61. ser = Series(dti)
  62. df = ser.to_frame()
  63. result = df.diff()
  64. ex_index = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta(days=1)]).as_unit(
  65. unit
  66. )
  67. expected = Series(ex_index).to_frame()
  68. tm.assert_frame_equal(result, expected)
  69. @pytest.mark.parametrize("tz", [None, "UTC"])
  70. def test_diff_datetime_with_nat_zero_periods(self, tz):
  71. # diff on NaT values should give NaT, not timedelta64(0)
  72. dti = date_range("2016-01-01", periods=4, tz=tz)
  73. ser = Series(dti)
  74. df = ser.to_frame().copy()
  75. df[1] = ser.copy()
  76. df.iloc[:, 0] = pd.NaT
  77. expected = df - df
  78. assert expected[0].isna().all()
  79. result = df.diff(0, axis=0)
  80. tm.assert_frame_equal(result, expected)
  81. result = df.diff(0, axis=1)
  82. tm.assert_frame_equal(result, expected)
  83. @pytest.mark.parametrize("tz", [None, "UTC"])
  84. def test_diff_datetime_axis0(self, tz):
  85. # GH#18578
  86. df = DataFrame(
  87. {
  88. 0: date_range("2010", freq="D", periods=2, tz=tz),
  89. 1: date_range("2010", freq="D", periods=2, tz=tz),
  90. }
  91. )
  92. result = df.diff(axis=0)
  93. expected = DataFrame(
  94. {
  95. 0: pd.TimedeltaIndex(["NaT", "1 days"]),
  96. 1: pd.TimedeltaIndex(["NaT", "1 days"]),
  97. }
  98. )
  99. tm.assert_frame_equal(result, expected)
  100. @pytest.mark.parametrize("tz", [None, "UTC"])
  101. def test_diff_datetime_axis1(self, tz):
  102. # GH#18578
  103. df = DataFrame(
  104. {
  105. 0: date_range("2010", freq="D", periods=2, tz=tz),
  106. 1: date_range("2010", freq="D", periods=2, tz=tz),
  107. }
  108. )
  109. result = df.diff(axis=1)
  110. expected = DataFrame(
  111. {
  112. 0: pd.TimedeltaIndex(["NaT", "NaT"]),
  113. 1: pd.TimedeltaIndex(["0 days", "0 days"]),
  114. }
  115. )
  116. tm.assert_frame_equal(result, expected)
  117. def test_diff_timedelta(self, unit):
  118. # GH#4533
  119. df = DataFrame(
  120. {
  121. "time": [Timestamp("20130101 9:01"), Timestamp("20130101 9:02")],
  122. "value": [1.0, 2.0],
  123. }
  124. )
  125. df["time"] = df["time"].dt.as_unit(unit)
  126. res = df.diff()
  127. exp = DataFrame(
  128. [[pd.NaT, np.nan], [pd.Timedelta("00:01:00"), 1]], columns=["time", "value"]
  129. )
  130. exp["time"] = exp["time"].dt.as_unit(unit)
  131. tm.assert_frame_equal(res, exp)
  132. def test_diff_mixed_dtype(self):
  133. df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
  134. df["A"] = np.array([1, 2, 3, 4, 5], dtype=object)
  135. result = df.diff()
  136. assert result[0].dtype == np.float64
  137. def test_diff_neg_n(self, datetime_frame):
  138. rs = datetime_frame.diff(-1)
  139. xp = datetime_frame - datetime_frame.shift(-1)
  140. tm.assert_frame_equal(rs, xp)
  141. def test_diff_float_n(self, datetime_frame):
  142. rs = datetime_frame.diff(1.0)
  143. xp = datetime_frame.diff(1)
  144. tm.assert_frame_equal(rs, xp)
  145. def test_diff_axis(self):
  146. # GH#9727
  147. df = DataFrame([[1.0, 2.0], [3.0, 4.0]])
  148. tm.assert_frame_equal(
  149. df.diff(axis=1), DataFrame([[np.nan, 1.0], [np.nan, 1.0]])
  150. )
  151. tm.assert_frame_equal(
  152. df.diff(axis=0), DataFrame([[np.nan, np.nan], [2.0, 2.0]])
  153. )
  154. def test_diff_period(self):
  155. # GH#32995 Don't pass an incorrect axis
  156. pi = date_range("2016-01-01", periods=3).to_period("D")
  157. df = DataFrame({"A": pi})
  158. result = df.diff(1, axis=1)
  159. expected = (df - pd.NaT).astype(object)
  160. tm.assert_frame_equal(result, expected)
  161. def test_diff_axis1_mixed_dtypes(self):
  162. # GH#32995 operate column-wise when we have mixed dtypes and axis=1
  163. df = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)})
  164. expected = DataFrame({"A": [np.nan, np.nan, np.nan], "B": df["B"] / 2})
  165. result = df.diff(axis=1)
  166. tm.assert_frame_equal(result, expected)
  167. # GH#21437 mixed-float-dtypes
  168. df = DataFrame(
  169. {"a": np.arange(3, dtype="float32"), "b": np.arange(3, dtype="float64")}
  170. )
  171. result = df.diff(axis=1)
  172. expected = DataFrame({"a": df["a"] * np.nan, "b": df["b"] * 0})
  173. tm.assert_frame_equal(result, expected)
  174. def test_diff_axis1_mixed_dtypes_large_periods(self):
  175. # GH#32995 operate column-wise when we have mixed dtypes and axis=1
  176. df = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)})
  177. expected = df * np.nan
  178. result = df.diff(axis=1, periods=3)
  179. tm.assert_frame_equal(result, expected)
  180. def test_diff_axis1_mixed_dtypes_negative_periods(self):
  181. # GH#32995 operate column-wise when we have mixed dtypes and axis=1
  182. df = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)})
  183. expected = DataFrame({"A": -1.0 * df["A"], "B": df["B"] * np.nan})
  184. result = df.diff(axis=1, periods=-1)
  185. tm.assert_frame_equal(result, expected)
  186. def test_diff_sparse(self):
  187. # GH#28813 .diff() should work for sparse dataframes as well
  188. sparse_df = DataFrame([[0, 1], [1, 0]], dtype="Sparse[int]")
  189. result = sparse_df.diff()
  190. expected = DataFrame(
  191. [[np.nan, np.nan], [1.0, -1.0]], dtype=pd.SparseDtype("float", 0.0)
  192. )
  193. tm.assert_frame_equal(result, expected)
  194. @pytest.mark.parametrize(
  195. "axis,expected",
  196. [
  197. (
  198. 0,
  199. DataFrame(
  200. {
  201. "a": [np.nan, 0, 1, 0, np.nan, np.nan, np.nan, 0],
  202. "b": [np.nan, 1, np.nan, np.nan, -2, 1, np.nan, np.nan],
  203. "c": np.repeat(np.nan, 8),
  204. "d": [np.nan, 3, 5, 7, 9, 11, 13, 15],
  205. },
  206. dtype="Int64",
  207. ),
  208. ),
  209. (
  210. 1,
  211. DataFrame(
  212. {
  213. "a": np.repeat(np.nan, 8),
  214. "b": [0, 1, np.nan, 1, np.nan, np.nan, np.nan, 0],
  215. "c": np.repeat(np.nan, 8),
  216. "d": np.repeat(np.nan, 8),
  217. },
  218. dtype="Int64",
  219. ),
  220. ),
  221. ],
  222. )
  223. def test_diff_integer_na(self, axis, expected):
  224. # GH#24171 IntegerNA Support for DataFrame.diff()
  225. df = DataFrame(
  226. {
  227. "a": np.repeat([0, 1, np.nan, 2], 2),
  228. "b": np.tile([0, 1, np.nan, 2], 2),
  229. "c": np.repeat(np.nan, 8),
  230. "d": np.arange(1, 9) ** 2,
  231. },
  232. dtype="Int64",
  233. )
  234. # Test case for default behaviour of diff
  235. result = df.diff(axis=axis)
  236. tm.assert_frame_equal(result, expected)
  237. def test_diff_readonly(self):
  238. # https://github.com/pandas-dev/pandas/issues/35559
  239. arr = np.random.default_rng(2).standard_normal((5, 2))
  240. arr.flags.writeable = False
  241. df = DataFrame(arr)
  242. result = df.diff()
  243. expected = DataFrame(np.array(df)).diff()
  244. tm.assert_frame_equal(result, expected)
  245. def test_diff_all_int_dtype(self, any_int_numpy_dtype):
  246. # GH 14773
  247. df = DataFrame(range(5))
  248. df = df.astype(any_int_numpy_dtype)
  249. result = df.diff()
  250. expected_dtype = (
  251. "float32" if any_int_numpy_dtype in ("int8", "int16") else "float64"
  252. )
  253. expected = DataFrame([np.nan, 1.0, 1.0, 1.0, 1.0], dtype=expected_dtype)
  254. tm.assert_frame_equal(result, expected)