test_base.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414
  1. from datetime import datetime
  2. import numpy as np
  3. import pytest
  4. from pandas.core.dtypes.common import is_extension_array_dtype
  5. import pandas as pd
  6. from pandas import (
  7. DataFrame,
  8. DatetimeIndex,
  9. MultiIndex,
  10. NaT,
  11. PeriodIndex,
  12. Series,
  13. TimedeltaIndex,
  14. )
  15. import pandas._testing as tm
  16. from pandas.core.groupby.groupby import DataError
  17. from pandas.core.groupby.grouper import Grouper
  18. from pandas.core.indexes.datetimes import date_range
  19. from pandas.core.indexes.period import period_range
  20. from pandas.core.indexes.timedeltas import timedelta_range
  21. from pandas.core.resample import _asfreq_compat
  22. # a fixture value can be overridden by the test parameter value. Note that the
  23. # value of the fixture can be overridden this way even if the test doesn't use
  24. # it directly (doesn't mention it in the function prototype).
  25. # see https://docs.pytest.org/en/latest/fixture.html#override-a-fixture-with-direct-test-parametrization # noqa: E501
  26. # in this module we override the fixture values defined in conftest.py
  27. # tuples of '_index_factory,_series_name,_index_start,_index_end'
  28. DATE_RANGE = (date_range, "dti", datetime(2005, 1, 1), datetime(2005, 1, 10))
  29. PERIOD_RANGE = (period_range, "pi", datetime(2005, 1, 1), datetime(2005, 1, 10))
  30. TIMEDELTA_RANGE = (timedelta_range, "tdi", "1 day", "10 day")
  31. all_ts = pytest.mark.parametrize(
  32. "_index_factory,_series_name,_index_start,_index_end",
  33. [DATE_RANGE, PERIOD_RANGE, TIMEDELTA_RANGE],
  34. )
  35. @pytest.fixture
  36. def create_index(_index_factory):
  37. def _create_index(*args, **kwargs):
  38. """return the _index_factory created using the args, kwargs"""
  39. return _index_factory(*args, **kwargs)
  40. return _create_index
  41. @pytest.mark.parametrize("freq", ["2D", "1h"])
  42. @pytest.mark.parametrize(
  43. "_index_factory,_series_name,_index_start,_index_end", [DATE_RANGE, TIMEDELTA_RANGE]
  44. )
  45. def test_asfreq(series_and_frame, freq, create_index):
  46. obj = series_and_frame
  47. result = obj.resample(freq).asfreq()
  48. new_index = create_index(obj.index[0], obj.index[-1], freq=freq)
  49. expected = obj.reindex(new_index)
  50. tm.assert_almost_equal(result, expected)
  51. @pytest.mark.parametrize(
  52. "_index_factory,_series_name,_index_start,_index_end", [DATE_RANGE, TIMEDELTA_RANGE]
  53. )
  54. def test_asfreq_fill_value(series, create_index):
  55. # test for fill value during resampling, issue 3715
  56. ser = series
  57. result = ser.resample("1h").asfreq()
  58. new_index = create_index(ser.index[0], ser.index[-1], freq="1h")
  59. expected = ser.reindex(new_index)
  60. tm.assert_series_equal(result, expected)
  61. # Explicit cast to float to avoid implicit cast when setting None
  62. frame = ser.astype("float").to_frame("value")
  63. frame.iloc[1] = None
  64. result = frame.resample("1h").asfreq(fill_value=4.0)
  65. new_index = create_index(frame.index[0], frame.index[-1], freq="1h")
  66. expected = frame.reindex(new_index, fill_value=4.0)
  67. tm.assert_frame_equal(result, expected)
  68. @all_ts
  69. def test_resample_interpolate(frame):
  70. # GH#12925
  71. df = frame
  72. result = df.resample("1min").asfreq().interpolate()
  73. expected = df.resample("1min").interpolate()
  74. tm.assert_frame_equal(result, expected)
  75. def test_raises_on_non_datetimelike_index():
  76. # this is a non datetimelike index
  77. xp = DataFrame()
  78. msg = (
  79. "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, "
  80. "but got an instance of 'RangeIndex'"
  81. )
  82. with pytest.raises(TypeError, match=msg):
  83. xp.resample("YE")
  84. @all_ts
  85. @pytest.mark.parametrize("freq", ["ME", "D", "h"])
  86. def test_resample_empty_series(freq, empty_series_dti, resample_method):
  87. # GH12771 & GH12868
  88. ser = empty_series_dti
  89. if freq == "ME" and isinstance(ser.index, TimedeltaIndex):
  90. msg = (
  91. "Resampling on a TimedeltaIndex requires fixed-duration `freq`, "
  92. "e.g. '24h' or '3D', not <MonthEnd>"
  93. )
  94. with pytest.raises(ValueError, match=msg):
  95. ser.resample(freq)
  96. return
  97. elif freq == "ME" and isinstance(ser.index, PeriodIndex):
  98. # index is PeriodIndex, so convert to corresponding Period freq
  99. freq = "M"
  100. rs = ser.resample(freq)
  101. result = getattr(rs, resample_method)()
  102. if resample_method == "ohlc":
  103. expected = DataFrame(
  104. [], index=ser.index[:0].copy(), columns=["open", "high", "low", "close"]
  105. )
  106. expected.index = _asfreq_compat(ser.index, freq)
  107. tm.assert_frame_equal(result, expected, check_dtype=False)
  108. else:
  109. expected = ser.copy()
  110. expected.index = _asfreq_compat(ser.index, freq)
  111. tm.assert_series_equal(result, expected, check_dtype=False)
  112. tm.assert_index_equal(result.index, expected.index)
  113. assert result.index.freq == expected.index.freq
  114. @pytest.mark.parametrize("min_count", [0, 1])
  115. def test_resample_empty_sum_string(string_dtype_no_object, min_count):
  116. # https://github.com/pandas-dev/pandas/issues/60229
  117. dtype = string_dtype_no_object
  118. ser = Series(
  119. pd.NA,
  120. index=DatetimeIndex(
  121. [
  122. "2000-01-01 00:00:00",
  123. "2000-01-01 00:00:10",
  124. "2000-01-01 00:00:20",
  125. "2000-01-01 00:00:30",
  126. ]
  127. ),
  128. dtype=dtype,
  129. )
  130. rs = ser.resample("20s")
  131. result = rs.sum(min_count=min_count)
  132. value = "" if min_count == 0 else pd.NA
  133. index = date_range(start="2000-01-01", freq="20s", periods=2)
  134. expected = Series(value, index=index, dtype=dtype)
  135. tm.assert_series_equal(result, expected)
  136. @all_ts
  137. @pytest.mark.parametrize(
  138. "freq",
  139. [
  140. pytest.param("ME", marks=pytest.mark.xfail(reason="Don't know why this fails")),
  141. "D",
  142. "h",
  143. ],
  144. )
  145. def test_resample_nat_index_series(freq, series, resample_method):
  146. # GH39227
  147. ser = series.copy()
  148. ser.index = PeriodIndex([NaT] * len(ser), freq=freq)
  149. rs = ser.resample(freq)
  150. result = getattr(rs, resample_method)()
  151. if resample_method == "ohlc":
  152. expected = DataFrame(
  153. [], index=ser.index[:0].copy(), columns=["open", "high", "low", "close"]
  154. )
  155. tm.assert_frame_equal(result, expected, check_dtype=False)
  156. else:
  157. expected = ser[:0].copy()
  158. tm.assert_series_equal(result, expected, check_dtype=False)
  159. tm.assert_index_equal(result.index, expected.index)
  160. assert result.index.freq == expected.index.freq
  161. @all_ts
  162. @pytest.mark.parametrize("freq", ["ME", "D", "h"])
  163. @pytest.mark.parametrize("resample_method", ["count", "size"])
  164. def test_resample_count_empty_series(freq, empty_series_dti, resample_method):
  165. # GH28427
  166. ser = empty_series_dti
  167. if freq == "ME" and isinstance(ser.index, TimedeltaIndex):
  168. msg = (
  169. "Resampling on a TimedeltaIndex requires fixed-duration `freq`, "
  170. "e.g. '24h' or '3D', not <MonthEnd>"
  171. )
  172. with pytest.raises(ValueError, match=msg):
  173. ser.resample(freq)
  174. return
  175. elif freq == "ME" and isinstance(ser.index, PeriodIndex):
  176. # index is PeriodIndex, so convert to corresponding Period freq
  177. freq = "M"
  178. rs = ser.resample(freq)
  179. result = getattr(rs, resample_method)()
  180. index = _asfreq_compat(ser.index, freq)
  181. expected = Series([], dtype="int64", index=index, name=ser.name)
  182. tm.assert_series_equal(result, expected)
  183. @all_ts
  184. @pytest.mark.parametrize("freq", ["ME", "D", "h"])
  185. def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method):
  186. # GH13212
  187. df = empty_frame_dti
  188. # count retains dimensions too
  189. if freq == "ME" and isinstance(df.index, TimedeltaIndex):
  190. msg = (
  191. "Resampling on a TimedeltaIndex requires fixed-duration `freq`, "
  192. "e.g. '24h' or '3D', not <MonthEnd>"
  193. )
  194. with pytest.raises(ValueError, match=msg):
  195. df.resample(freq, group_keys=False)
  196. return
  197. elif freq == "ME" and isinstance(df.index, PeriodIndex):
  198. # index is PeriodIndex, so convert to corresponding Period freq
  199. freq = "M"
  200. rs = df.resample(freq, group_keys=False)
  201. result = getattr(rs, resample_method)()
  202. if resample_method == "ohlc":
  203. # TODO: no tests with len(df.columns) > 0
  204. mi = MultiIndex.from_product([df.columns, ["open", "high", "low", "close"]])
  205. expected = DataFrame(
  206. [], index=df.index[:0].copy(), columns=mi, dtype=np.float64
  207. )
  208. expected.index = _asfreq_compat(df.index, freq)
  209. elif resample_method != "size":
  210. expected = df.copy()
  211. else:
  212. # GH14962
  213. expected = Series([], dtype=np.int64)
  214. expected.index = _asfreq_compat(df.index, freq)
  215. tm.assert_index_equal(result.index, expected.index)
  216. assert result.index.freq == expected.index.freq
  217. tm.assert_almost_equal(result, expected)
  218. # test size for GH13212 (currently stays as df)
  219. @all_ts
  220. @pytest.mark.parametrize("freq", ["ME", "D", "h"])
  221. def test_resample_count_empty_dataframe(freq, empty_frame_dti):
  222. # GH28427
  223. empty_frame_dti["a"] = []
  224. if freq == "ME" and isinstance(empty_frame_dti.index, TimedeltaIndex):
  225. msg = (
  226. "Resampling on a TimedeltaIndex requires fixed-duration `freq`, "
  227. "e.g. '24h' or '3D', not <MonthEnd>"
  228. )
  229. with pytest.raises(ValueError, match=msg):
  230. empty_frame_dti.resample(freq)
  231. return
  232. elif freq == "ME" and isinstance(empty_frame_dti.index, PeriodIndex):
  233. # index is PeriodIndex, so convert to corresponding Period freq
  234. freq = "M"
  235. result = empty_frame_dti.resample(freq).count()
  236. index = _asfreq_compat(empty_frame_dti.index, freq)
  237. expected = DataFrame(dtype="int64", index=index, columns=["a"])
  238. tm.assert_frame_equal(result, expected)
  239. @all_ts
  240. @pytest.mark.parametrize("freq", ["ME", "D", "h"])
  241. def test_resample_size_empty_dataframe(freq, empty_frame_dti):
  242. # GH28427
  243. empty_frame_dti["a"] = []
  244. if freq == "ME" and isinstance(empty_frame_dti.index, TimedeltaIndex):
  245. msg = (
  246. "Resampling on a TimedeltaIndex requires fixed-duration `freq`, "
  247. "e.g. '24h' or '3D', not <MonthEnd>"
  248. )
  249. with pytest.raises(ValueError, match=msg):
  250. empty_frame_dti.resample(freq)
  251. return
  252. elif freq == "ME" and isinstance(empty_frame_dti.index, PeriodIndex):
  253. # index is PeriodIndex, so convert to corresponding Period freq
  254. freq = "M"
  255. result = empty_frame_dti.resample(freq).size()
  256. index = _asfreq_compat(empty_frame_dti.index, freq)
  257. expected = Series([], dtype="int64", index=index)
  258. tm.assert_series_equal(result, expected)
  259. @pytest.mark.parametrize(
  260. "index",
  261. [
  262. PeriodIndex([], freq="M", name="a"),
  263. DatetimeIndex([], name="a"),
  264. TimedeltaIndex([], name="a"),
  265. ],
  266. )
  267. @pytest.mark.parametrize("dtype", [float, int, object, "datetime64[ns]"])
  268. def test_resample_empty_dtypes(index, dtype, resample_method):
  269. # Empty series were sometimes causing a segfault (for the functions
  270. # with Cython bounds-checking disabled) or an IndexError. We just run
  271. # them to ensure they no longer do. (GH #10228)
  272. empty_series_dti = Series([], index, dtype)
  273. rs = empty_series_dti.resample("d", group_keys=False)
  274. try:
  275. getattr(rs, resample_method)()
  276. except DataError:
  277. # Ignore these since some combinations are invalid
  278. # (ex: doing mean with dtype of np.object_)
  279. pass
  280. @all_ts
  281. @pytest.mark.parametrize("freq", ["ME", "D", "h"])
  282. def test_apply_to_empty_series(empty_series_dti, freq):
  283. # GH 14313
  284. ser = empty_series_dti
  285. if freq == "ME" and isinstance(empty_series_dti.index, TimedeltaIndex):
  286. msg = (
  287. "Resampling on a TimedeltaIndex requires fixed-duration `freq`, "
  288. "e.g. '24h' or '3D', not <MonthEnd>"
  289. )
  290. with pytest.raises(ValueError, match=msg):
  291. empty_series_dti.resample(freq)
  292. return
  293. elif freq == "ME" and isinstance(empty_series_dti.index, PeriodIndex):
  294. # index is PeriodIndex, so convert to corresponding Period freq
  295. freq = "M"
  296. result = ser.resample(freq, group_keys=False).apply(lambda x: 1)
  297. expected = ser.resample(freq).apply("sum")
  298. tm.assert_series_equal(result, expected, check_dtype=False)
  299. @all_ts
  300. def test_resampler_is_iterable(series):
  301. # GH 15314
  302. freq = "h"
  303. tg = Grouper(freq=freq, convention="start")
  304. grouped = series.groupby(tg)
  305. resampled = series.resample(freq)
  306. for (rk, rv), (gk, gv) in zip(resampled, grouped):
  307. assert rk == gk
  308. tm.assert_series_equal(rv, gv)
  309. @all_ts
  310. def test_resample_quantile(series):
  311. # GH 15023
  312. ser = series
  313. q = 0.75
  314. freq = "h"
  315. result = ser.resample(freq).quantile(q)
  316. expected = ser.resample(freq).agg(lambda x: x.quantile(q)).rename(ser.name)
  317. tm.assert_series_equal(result, expected)
  318. @pytest.mark.parametrize("how", ["first", "last"])
  319. def test_first_last_skipna(any_real_nullable_dtype, skipna, how):
  320. # GH#57019
  321. if is_extension_array_dtype(any_real_nullable_dtype):
  322. na_value = Series(dtype=any_real_nullable_dtype).dtype.na_value
  323. else:
  324. na_value = np.nan
  325. df = DataFrame(
  326. {
  327. "a": [2, 1, 1, 2],
  328. "b": [na_value, 3.0, na_value, 4.0],
  329. "c": [na_value, 3.0, na_value, 4.0],
  330. },
  331. index=date_range("2020-01-01", periods=4, freq="D"),
  332. dtype=any_real_nullable_dtype,
  333. )
  334. rs = df.resample("ME")
  335. method = getattr(rs, how)
  336. result = method(skipna=skipna)
  337. gb = df.groupby(df.shape[0] * [pd.to_datetime("2020-01-31")])
  338. expected = getattr(gb, how)(skipna=skipna)
  339. expected.index.freq = "ME"
  340. tm.assert_frame_equal(result, expected)