test_rolling_functions.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532
  1. from datetime import datetime
  2. import numpy as np
  3. import pytest
  4. import pandas.util._test_decorators as td
  5. from pandas import (
  6. DataFrame,
  7. DatetimeIndex,
  8. Series,
  9. concat,
  10. isna,
  11. notna,
  12. )
  13. import pandas._testing as tm
  14. from pandas.tseries import offsets
  15. @pytest.mark.parametrize(
  16. "compare_func, roll_func, kwargs",
  17. [
  18. [np.mean, "mean", {}],
  19. [np.nansum, "sum", {}],
  20. [
  21. lambda x: np.isfinite(x).astype(float).sum(),
  22. "count",
  23. {},
  24. ],
  25. [np.median, "median", {}],
  26. [np.min, "min", {}],
  27. [np.max, "max", {}],
  28. [lambda x: np.std(x, ddof=1), "std", {}],
  29. [lambda x: np.std(x, ddof=0), "std", {"ddof": 0}],
  30. [lambda x: np.var(x, ddof=1), "var", {}],
  31. [lambda x: np.var(x, ddof=0), "var", {"ddof": 0}],
  32. ],
  33. )
  34. def test_series(series, compare_func, roll_func, kwargs, step):
  35. result = getattr(series.rolling(50, step=step), roll_func)(**kwargs)
  36. assert isinstance(result, Series)
  37. end = range(0, len(series), step or 1)[-1] + 1
  38. tm.assert_almost_equal(result.iloc[-1], compare_func(series[end - 50 : end]))
  39. @pytest.mark.parametrize(
  40. "compare_func, roll_func, kwargs",
  41. [
  42. [np.mean, "mean", {}],
  43. [np.nansum, "sum", {}],
  44. [
  45. lambda x: np.isfinite(x).astype(float).sum(),
  46. "count",
  47. {},
  48. ],
  49. [np.median, "median", {}],
  50. [np.min, "min", {}],
  51. [np.max, "max", {}],
  52. [lambda x: np.std(x, ddof=1), "std", {}],
  53. [lambda x: np.std(x, ddof=0), "std", {"ddof": 0}],
  54. [lambda x: np.var(x, ddof=1), "var", {}],
  55. [lambda x: np.var(x, ddof=0), "var", {"ddof": 0}],
  56. ],
  57. )
  58. def test_frame(raw, frame, compare_func, roll_func, kwargs, step):
  59. result = getattr(frame.rolling(50, step=step), roll_func)(**kwargs)
  60. assert isinstance(result, DataFrame)
  61. end = range(0, len(frame), step or 1)[-1] + 1
  62. tm.assert_series_equal(
  63. result.iloc[-1, :],
  64. frame.iloc[end - 50 : end, :].apply(compare_func, axis=0, raw=raw),
  65. check_names=False,
  66. )
  67. @pytest.mark.parametrize(
  68. "compare_func, roll_func, kwargs, minp",
  69. [
  70. [np.mean, "mean", {}, 10],
  71. [np.nansum, "sum", {}, 10],
  72. [lambda x: np.isfinite(x).astype(float).sum(), "count", {}, 0],
  73. [np.median, "median", {}, 10],
  74. [np.min, "min", {}, 10],
  75. [np.max, "max", {}, 10],
  76. [lambda x: np.std(x, ddof=1), "std", {}, 10],
  77. [lambda x: np.std(x, ddof=0), "std", {"ddof": 0}, 10],
  78. [lambda x: np.var(x, ddof=1), "var", {}, 10],
  79. [lambda x: np.var(x, ddof=0), "var", {"ddof": 0}, 10],
  80. ],
  81. )
  82. def test_time_rule_series(series, compare_func, roll_func, kwargs, minp):
  83. win = 25
  84. ser = series[::2].resample("B").mean()
  85. series_result = getattr(ser.rolling(window=win, min_periods=minp), roll_func)(
  86. **kwargs
  87. )
  88. last_date = series_result.index[-1]
  89. prev_date = last_date - 24 * offsets.BDay()
  90. trunc_series = series[::2].truncate(prev_date, last_date)
  91. tm.assert_almost_equal(series_result.iloc[-1], compare_func(trunc_series))
  92. @pytest.mark.parametrize(
  93. "compare_func, roll_func, kwargs, minp",
  94. [
  95. [np.mean, "mean", {}, 10],
  96. [np.nansum, "sum", {}, 10],
  97. [lambda x: np.isfinite(x).astype(float).sum(), "count", {}, 0],
  98. [np.median, "median", {}, 10],
  99. [np.min, "min", {}, 10],
  100. [np.max, "max", {}, 10],
  101. [lambda x: np.std(x, ddof=1), "std", {}, 10],
  102. [lambda x: np.std(x, ddof=0), "std", {"ddof": 0}, 10],
  103. [lambda x: np.var(x, ddof=1), "var", {}, 10],
  104. [lambda x: np.var(x, ddof=0), "var", {"ddof": 0}, 10],
  105. ],
  106. )
  107. def test_time_rule_frame(raw, frame, compare_func, roll_func, kwargs, minp):
  108. win = 25
  109. frm = frame[::2].resample("B").mean()
  110. frame_result = getattr(frm.rolling(window=win, min_periods=minp), roll_func)(
  111. **kwargs
  112. )
  113. last_date = frame_result.index[-1]
  114. prev_date = last_date - 24 * offsets.BDay()
  115. trunc_frame = frame[::2].truncate(prev_date, last_date)
  116. tm.assert_series_equal(
  117. frame_result.xs(last_date),
  118. trunc_frame.apply(compare_func, raw=raw),
  119. check_names=False,
  120. )
  121. @pytest.mark.parametrize(
  122. "compare_func, roll_func, kwargs",
  123. [
  124. [np.mean, "mean", {}],
  125. [np.nansum, "sum", {}],
  126. [np.median, "median", {}],
  127. [np.min, "min", {}],
  128. [np.max, "max", {}],
  129. [lambda x: np.std(x, ddof=1), "std", {}],
  130. [lambda x: np.std(x, ddof=0), "std", {"ddof": 0}],
  131. [lambda x: np.var(x, ddof=1), "var", {}],
  132. [lambda x: np.var(x, ddof=0), "var", {"ddof": 0}],
  133. ],
  134. )
  135. def test_nans(compare_func, roll_func, kwargs):
  136. obj = Series(np.random.default_rng(2).standard_normal(50))
  137. obj[:10] = np.nan
  138. obj[-10:] = np.nan
  139. result = getattr(obj.rolling(50, min_periods=30), roll_func)(**kwargs)
  140. tm.assert_almost_equal(result.iloc[-1], compare_func(obj[10:-10]))
  141. # min_periods is working correctly
  142. result = getattr(obj.rolling(20, min_periods=15), roll_func)(**kwargs)
  143. assert isna(result.iloc[23])
  144. assert not isna(result.iloc[24])
  145. assert not isna(result.iloc[-6])
  146. assert isna(result.iloc[-5])
  147. obj2 = Series(np.random.default_rng(2).standard_normal(20))
  148. result = getattr(obj2.rolling(10, min_periods=5), roll_func)(**kwargs)
  149. assert isna(result.iloc[3])
  150. assert notna(result.iloc[4])
  151. if roll_func != "sum":
  152. result0 = getattr(obj.rolling(20, min_periods=0), roll_func)(**kwargs)
  153. result1 = getattr(obj.rolling(20, min_periods=1), roll_func)(**kwargs)
  154. tm.assert_almost_equal(result0, result1)
  155. def test_nans_count():
  156. obj = Series(np.random.default_rng(2).standard_normal(50))
  157. obj[:10] = np.nan
  158. obj[-10:] = np.nan
  159. result = obj.rolling(50, min_periods=30).count()
  160. tm.assert_almost_equal(
  161. result.iloc[-1], np.isfinite(obj[10:-10]).astype(float).sum()
  162. )
  163. @pytest.mark.parametrize(
  164. "roll_func, kwargs",
  165. [
  166. ["mean", {}],
  167. ["sum", {}],
  168. ["median", {}],
  169. ["min", {}],
  170. ["max", {}],
  171. ["std", {}],
  172. ["std", {"ddof": 0}],
  173. ["var", {}],
  174. ["var", {"ddof": 0}],
  175. ],
  176. )
  177. @pytest.mark.parametrize("minp", [0, 99, 100])
  178. def test_min_periods(series, minp, roll_func, kwargs, step):
  179. result = getattr(
  180. series.rolling(len(series) + 1, min_periods=minp, step=step), roll_func
  181. )(**kwargs)
  182. expected = getattr(
  183. series.rolling(len(series), min_periods=minp, step=step), roll_func
  184. )(**kwargs)
  185. nan_mask = isna(result)
  186. tm.assert_series_equal(nan_mask, isna(expected))
  187. nan_mask = ~nan_mask
  188. tm.assert_almost_equal(result[nan_mask], expected[nan_mask])
  189. def test_min_periods_count(series, step):
  190. result = series.rolling(len(series) + 1, min_periods=0, step=step).count()
  191. expected = series.rolling(len(series), min_periods=0, step=step).count()
  192. nan_mask = isna(result)
  193. tm.assert_series_equal(nan_mask, isna(expected))
  194. nan_mask = ~nan_mask
  195. tm.assert_almost_equal(result[nan_mask], expected[nan_mask])
  196. @pytest.mark.parametrize(
  197. "roll_func, kwargs, minp",
  198. [
  199. ["mean", {}, 15],
  200. ["sum", {}, 15],
  201. ["count", {}, 0],
  202. ["median", {}, 15],
  203. ["min", {}, 15],
  204. ["max", {}, 15],
  205. ["std", {}, 15],
  206. ["std", {"ddof": 0}, 15],
  207. ["var", {}, 15],
  208. ["var", {"ddof": 0}, 15],
  209. ],
  210. )
  211. def test_center(roll_func, kwargs, minp):
  212. obj = Series(np.random.default_rng(2).standard_normal(50))
  213. obj[:10] = np.nan
  214. obj[-10:] = np.nan
  215. result = getattr(obj.rolling(20, min_periods=minp, center=True), roll_func)(
  216. **kwargs
  217. )
  218. expected = (
  219. getattr(
  220. concat([obj, Series([np.nan] * 9)]).rolling(20, min_periods=minp), roll_func
  221. )(**kwargs)
  222. .iloc[9:]
  223. .reset_index(drop=True)
  224. )
  225. tm.assert_series_equal(result, expected)
  226. @pytest.mark.parametrize(
  227. "roll_func, kwargs, minp, fill_value",
  228. [
  229. ["mean", {}, 10, None],
  230. ["sum", {}, 10, None],
  231. ["count", {}, 0, 0],
  232. ["median", {}, 10, None],
  233. ["min", {}, 10, None],
  234. ["max", {}, 10, None],
  235. ["std", {}, 10, None],
  236. ["std", {"ddof": 0}, 10, None],
  237. ["var", {}, 10, None],
  238. ["var", {"ddof": 0}, 10, None],
  239. ],
  240. )
  241. def test_center_reindex_series(series, roll_func, kwargs, minp, fill_value):
  242. # shifter index
  243. s = [f"x{x:d}" for x in range(12)]
  244. series_xp = (
  245. getattr(
  246. series.reindex(list(series.index) + s).rolling(window=25, min_periods=minp),
  247. roll_func,
  248. )(**kwargs)
  249. .shift(-12)
  250. .reindex(series.index)
  251. )
  252. series_rs = getattr(
  253. series.rolling(window=25, min_periods=minp, center=True), roll_func
  254. )(**kwargs)
  255. if fill_value is not None:
  256. series_xp = series_xp.fillna(fill_value)
  257. tm.assert_series_equal(series_xp, series_rs)
  258. @pytest.mark.parametrize(
  259. "roll_func, kwargs, minp, fill_value",
  260. [
  261. ["mean", {}, 10, None],
  262. ["sum", {}, 10, None],
  263. ["count", {}, 0, 0],
  264. ["median", {}, 10, None],
  265. ["min", {}, 10, None],
  266. ["max", {}, 10, None],
  267. ["std", {}, 10, None],
  268. ["std", {"ddof": 0}, 10, None],
  269. ["var", {}, 10, None],
  270. ["var", {"ddof": 0}, 10, None],
  271. ],
  272. )
  273. def test_center_reindex_frame(frame, roll_func, kwargs, minp, fill_value):
  274. # shifter index
  275. s = [f"x{x:d}" for x in range(12)]
  276. frame_xp = (
  277. getattr(
  278. frame.reindex(list(frame.index) + s).rolling(window=25, min_periods=minp),
  279. roll_func,
  280. )(**kwargs)
  281. .shift(-12)
  282. .reindex(frame.index)
  283. )
  284. frame_rs = getattr(
  285. frame.rolling(window=25, min_periods=minp, center=True), roll_func
  286. )(**kwargs)
  287. if fill_value is not None:
  288. frame_xp = frame_xp.fillna(fill_value)
  289. tm.assert_frame_equal(frame_xp, frame_rs)
  290. @pytest.mark.parametrize(
  291. "f",
  292. [
  293. lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False),
  294. lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False),
  295. lambda x: x.rolling(window=10, min_periods=5).max(),
  296. lambda x: x.rolling(window=10, min_periods=5).min(),
  297. lambda x: x.rolling(window=10, min_periods=5).sum(),
  298. lambda x: x.rolling(window=10, min_periods=5).mean(),
  299. lambda x: x.rolling(window=10, min_periods=5).std(),
  300. lambda x: x.rolling(window=10, min_periods=5).var(),
  301. lambda x: x.rolling(window=10, min_periods=5).skew(),
  302. lambda x: x.rolling(window=10, min_periods=5).kurt(),
  303. lambda x: x.rolling(window=10, min_periods=5).quantile(q=0.5),
  304. lambda x: x.rolling(window=10, min_periods=5).median(),
  305. lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False),
  306. lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True),
  307. pytest.param(
  308. lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(),
  309. marks=td.skip_if_no("scipy"),
  310. ),
  311. ],
  312. )
  313. def test_rolling_functions_window_non_shrinkage(f):
  314. # GH 7764
  315. s = Series(range(4))
  316. s_expected = Series(np.nan, index=s.index)
  317. df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], columns=["A", "B"])
  318. df_expected = DataFrame(np.nan, index=df.index, columns=df.columns)
  319. s_result = f(s)
  320. tm.assert_series_equal(s_result, s_expected)
  321. df_result = f(df)
  322. tm.assert_frame_equal(df_result, df_expected)
  323. def test_rolling_max_gh6297(step):
  324. """Replicate result expected in GH #6297"""
  325. indices = [datetime(1975, 1, i) for i in range(1, 6)]
  326. # So that we can have 2 datapoints on one of the days
  327. indices.append(datetime(1975, 1, 3, 6, 0))
  328. series = Series(range(1, 7), index=indices)
  329. # Use floats instead of ints as values
  330. series = series.map(lambda x: float(x))
  331. # Sort chronologically
  332. series = series.sort_index()
  333. expected = Series(
  334. [1.0, 2.0, 6.0, 4.0, 5.0],
  335. index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"),
  336. )[::step]
  337. x = series.resample("D").max().rolling(window=1, step=step).max()
  338. tm.assert_series_equal(expected, x)
  339. def test_rolling_max_resample(step):
  340. indices = [datetime(1975, 1, i) for i in range(1, 6)]
  341. # So that we can have 3 datapoints on last day (4, 10, and 20)
  342. indices.append(datetime(1975, 1, 5, 1))
  343. indices.append(datetime(1975, 1, 5, 2))
  344. series = Series(list(range(5)) + [10, 20], index=indices)
  345. # Use floats instead of ints as values
  346. series = series.map(lambda x: float(x))
  347. # Sort chronologically
  348. series = series.sort_index()
  349. # Default how should be max
  350. expected = Series(
  351. [0.0, 1.0, 2.0, 3.0, 20.0],
  352. index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"),
  353. )[::step]
  354. x = series.resample("D").max().rolling(window=1, step=step).max()
  355. tm.assert_series_equal(expected, x)
  356. # Now specify median (10.0)
  357. expected = Series(
  358. [0.0, 1.0, 2.0, 3.0, 10.0],
  359. index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"),
  360. )[::step]
  361. x = series.resample("D").median().rolling(window=1, step=step).max()
  362. tm.assert_series_equal(expected, x)
  363. # Now specify mean (4+10+20)/3
  364. v = (4.0 + 10.0 + 20.0) / 3.0
  365. expected = Series(
  366. [0.0, 1.0, 2.0, 3.0, v],
  367. index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"),
  368. )[::step]
  369. x = series.resample("D").mean().rolling(window=1, step=step).max()
  370. tm.assert_series_equal(expected, x)
  371. def test_rolling_min_resample(step):
  372. indices = [datetime(1975, 1, i) for i in range(1, 6)]
  373. # So that we can have 3 datapoints on last day (4, 10, and 20)
  374. indices.append(datetime(1975, 1, 5, 1))
  375. indices.append(datetime(1975, 1, 5, 2))
  376. series = Series(list(range(5)) + [10, 20], index=indices)
  377. # Use floats instead of ints as values
  378. series = series.map(lambda x: float(x))
  379. # Sort chronologically
  380. series = series.sort_index()
  381. # Default how should be min
  382. expected = Series(
  383. [0.0, 1.0, 2.0, 3.0, 4.0],
  384. index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"),
  385. )[::step]
  386. r = series.resample("D").min().rolling(window=1, step=step)
  387. tm.assert_series_equal(expected, r.min())
  388. def test_rolling_median_resample():
  389. indices = [datetime(1975, 1, i) for i in range(1, 6)]
  390. # So that we can have 3 datapoints on last day (4, 10, and 20)
  391. indices.append(datetime(1975, 1, 5, 1))
  392. indices.append(datetime(1975, 1, 5, 2))
  393. series = Series(list(range(5)) + [10, 20], index=indices)
  394. # Use floats instead of ints as values
  395. series = series.map(lambda x: float(x))
  396. # Sort chronologically
  397. series = series.sort_index()
  398. # Default how should be median
  399. expected = Series(
  400. [0.0, 1.0, 2.0, 3.0, 10],
  401. index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"),
  402. )
  403. x = series.resample("D").median().rolling(window=1).median()
  404. tm.assert_series_equal(expected, x)
  405. def test_rolling_median_memory_error():
  406. # GH11722
  407. n = 20000
  408. Series(np.random.default_rng(2).standard_normal(n)).rolling(
  409. window=2, center=False
  410. ).median()
  411. Series(np.random.default_rng(2).standard_normal(n)).rolling(
  412. window=2, center=False
  413. ).median()
  414. @pytest.mark.parametrize(
  415. "data_type",
  416. [np.dtype(f"f{width}") for width in [4, 8]]
  417. + [np.dtype(f"{sign}{width}") for width in [1, 2, 4, 8] for sign in "ui"],
  418. )
  419. def test_rolling_min_max_numeric_types(data_type):
  420. # GH12373
  421. # Just testing that these don't throw exceptions and that
  422. # the return type is float64. Other tests will cover quantitative
  423. # correctness
  424. result = DataFrame(np.arange(20, dtype=data_type)).rolling(window=5).max()
  425. assert result.dtypes[0] == np.dtype("f8")
  426. result = DataFrame(np.arange(20, dtype=data_type)).rolling(window=5).min()
  427. assert result.dtypes[0] == np.dtype("f8")
  428. @pytest.mark.parametrize(
  429. "f",
  430. [
  431. lambda x: x.rolling(window=10, min_periods=0).count(),
  432. lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False),
  433. lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False),
  434. lambda x: x.rolling(window=10, min_periods=5).max(),
  435. lambda x: x.rolling(window=10, min_periods=5).min(),
  436. lambda x: x.rolling(window=10, min_periods=5).sum(),
  437. lambda x: x.rolling(window=10, min_periods=5).mean(),
  438. lambda x: x.rolling(window=10, min_periods=5).std(),
  439. lambda x: x.rolling(window=10, min_periods=5).var(),
  440. lambda x: x.rolling(window=10, min_periods=5).skew(),
  441. lambda x: x.rolling(window=10, min_periods=5).kurt(),
  442. lambda x: x.rolling(window=10, min_periods=5).quantile(0.5),
  443. lambda x: x.rolling(window=10, min_periods=5).median(),
  444. lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False),
  445. lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True),
  446. pytest.param(
  447. lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(),
  448. marks=td.skip_if_no("scipy"),
  449. ),
  450. ],
  451. )
  452. def test_moment_functions_zero_length(f):
  453. # GH 8056
  454. s = Series(dtype=np.float64)
  455. s_expected = s
  456. df1 = DataFrame()
  457. df1_expected = df1
  458. df2 = DataFrame(columns=["a"])
  459. df2["a"] = df2["a"].astype("float64")
  460. df2_expected = df2
  461. s_result = f(s)
  462. tm.assert_series_equal(s_result, s_expected)
  463. df1_result = f(df1)
  464. tm.assert_frame_equal(df1_result, df1_expected)
  465. df2_result = f(df2)
  466. tm.assert_frame_equal(df2_result, df2_expected)