# frequencies.py
  1. from __future__ import annotations
  2. from typing import TYPE_CHECKING
  3. import numpy as np
  4. from pandas._libs import lib
  5. from pandas._libs.algos import unique_deltas
  6. from pandas._libs.tslibs import (
  7. Timestamp,
  8. get_unit_from_dtype,
  9. periods_per_day,
  10. tz_convert_from_utc,
  11. )
  12. from pandas._libs.tslibs.ccalendar import (
  13. DAYS,
  14. MONTH_ALIASES,
  15. MONTH_NUMBERS,
  16. MONTHS,
  17. int_to_weekday,
  18. )
  19. from pandas._libs.tslibs.dtypes import OFFSET_TO_PERIOD_FREQSTR
  20. from pandas._libs.tslibs.fields import (
  21. build_field_sarray,
  22. month_position_check,
  23. )
  24. from pandas._libs.tslibs.offsets import (
  25. DateOffset,
  26. Day,
  27. to_offset,
  28. )
  29. from pandas._libs.tslibs.parsing import get_rule_month
  30. from pandas.util._decorators import (
  31. cache_readonly,
  32. set_module,
  33. )
  34. from pandas.core.dtypes.common import is_numeric_dtype
  35. from pandas.core.dtypes.dtypes import (
  36. ArrowDtype,
  37. DatetimeTZDtype,
  38. PeriodDtype,
  39. )
  40. from pandas.core.dtypes.generic import (
  41. ABCIndex,
  42. ABCSeries,
  43. )
  44. from pandas.core.algorithms import unique
  45. if TYPE_CHECKING:
  46. from pandas._typing import npt
  47. from pandas import (
  48. DatetimeIndex,
  49. Series,
  50. TimedeltaIndex,
  51. )
  52. from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin
  53. # --------------------------------------------------------------------
  54. # Offset related functions
  55. _need_suffix = ["QS", "BQE", "BQS", "YS", "BYE", "BYS"]
  56. for _prefix in _need_suffix:
  57. for _m in MONTHS:
  58. key = f"{_prefix}-{_m}"
  59. OFFSET_TO_PERIOD_FREQSTR[key] = OFFSET_TO_PERIOD_FREQSTR[_prefix]
  60. for _prefix in ["Y", "Q"]:
  61. for _m in MONTHS:
  62. _alias = f"{_prefix}-{_m}"
  63. OFFSET_TO_PERIOD_FREQSTR[_alias] = _alias
  64. for _d in DAYS:
  65. OFFSET_TO_PERIOD_FREQSTR[f"W-{_d}"] = f"W-{_d}"
  66. def get_period_alias(offset_str: str) -> str | None:
  67. """
  68. Alias to closest period strings BQ->Q etc.
  69. """
  70. return OFFSET_TO_PERIOD_FREQSTR.get(offset_str, None)
  71. # ---------------------------------------------------------------------
  72. # Period codes
  73. @set_module("pandas")
  74. def infer_freq(
  75. index: DatetimeIndex | TimedeltaIndex | Series | DatetimeLikeArrayMixin,
  76. ) -> str | None:
  77. """
  78. Infer the most likely frequency given the input index.
  79. This method attempts to deduce the most probable frequency (e.g., 'D' for daily,
  80. 'H' for hourly) from a sequence of datetime-like objects. It is particularly useful
  81. when the frequency of a time series is not explicitly set or known but can be
  82. inferred from its values.
  83. Parameters
  84. ----------
  85. index : DatetimeIndex, TimedeltaIndex, Series or array-like
  86. If passed a Series will use the values of the series (NOT THE INDEX).
  87. Returns
  88. -------
  89. str or None
  90. None if no discernible frequency.
  91. Raises
  92. ------
  93. TypeError
  94. If the index is not datetime-like.
  95. ValueError
  96. If there are fewer than three values.
  97. See Also
  98. --------
  99. date_range : Return a fixed frequency DatetimeIndex.
  100. timedelta_range : Return a fixed frequency TimedeltaIndex with day as the default.
  101. period_range : Return a fixed frequency PeriodIndex.
  102. DatetimeIndex.freq : Return the frequency object if it is set, otherwise None.
  103. Examples
  104. --------
  105. >>> idx = pd.date_range(start="2020/12/01", end="2020/12/30", periods=30)
  106. >>> pd.infer_freq(idx)
  107. 'D'
  108. """
  109. from pandas.core.api import DatetimeIndex
  110. if isinstance(index, ABCSeries):
  111. values = index._values
  112. if isinstance(index.dtype, ArrowDtype):
  113. import pyarrow as pa
  114. if pa.types.is_timestamp(values.dtype.pyarrow_dtype):
  115. # GH#58403
  116. values = values._to_datetimearray()
  117. if not (
  118. lib.is_np_dtype(values.dtype, "mM")
  119. or isinstance(values.dtype, DatetimeTZDtype)
  120. or values.dtype == object
  121. ):
  122. raise TypeError(
  123. "cannot infer freq from a non-convertible dtype "
  124. f"on a Series of {index.dtype}"
  125. )
  126. index = values
  127. inferer: _FrequencyInferer
  128. if not hasattr(index, "dtype"):
  129. pass
  130. elif isinstance(index.dtype, PeriodDtype):
  131. raise TypeError(
  132. "PeriodIndex given. Check the `freq` attribute instead of using infer_freq."
  133. )
  134. elif lib.is_np_dtype(index.dtype, "m"):
  135. # Allow TimedeltaIndex and TimedeltaArray
  136. inferer = _TimedeltaFrequencyInferer(index)
  137. return inferer.get_freq()
  138. elif is_numeric_dtype(index.dtype):
  139. raise TypeError(
  140. f"cannot infer freq from a non-convertible index of dtype {index.dtype}"
  141. )
  142. if not isinstance(index, DatetimeIndex):
  143. index = DatetimeIndex(index, copy=False)
  144. inferer = _FrequencyInferer(index)
  145. return inferer.get_freq()
  146. class _FrequencyInferer:
  147. """
  148. Not sure if I can avoid the state machine here
  149. """
  150. def __init__(self, index) -> None:
  151. self.index = index
  152. self.i8values = index.asi8
  153. # For get_unit_from_dtype we need the dtype to the underlying ndarray,
  154. # which for tz-aware is not the same as index.dtype
  155. if isinstance(index, ABCIndex):
  156. # error: Item "ndarray[Any, Any]" of "Union[ExtensionArray,
  157. # ndarray[Any, Any]]" has no attribute "_ndarray"
  158. self._creso = get_unit_from_dtype(
  159. index._data._ndarray.dtype # type: ignore[union-attr]
  160. )
  161. else:
  162. # otherwise we have DTA/TDA
  163. self._creso = get_unit_from_dtype(index._ndarray.dtype)
  164. # This moves the values, which are implicitly in UTC, to the
  165. # the timezone so they are in local time
  166. if hasattr(index, "tz"):
  167. if index.tz is not None:
  168. self.i8values = tz_convert_from_utc(
  169. self.i8values, index.tz, reso=self._creso
  170. )
  171. if len(index) < 3:
  172. raise ValueError("Need at least 3 dates to infer frequency")
  173. self.is_monotonic = (
  174. self.index._is_monotonic_increasing or self.index._is_monotonic_decreasing
  175. )
  176. @cache_readonly
  177. def deltas(self) -> npt.NDArray[np.int64]:
  178. return unique_deltas(self.i8values)
  179. @cache_readonly
  180. def deltas_asi8(self) -> npt.NDArray[np.int64]:
  181. # NB: we cannot use self.i8values here because we may have converted
  182. # the tz in __init__
  183. return unique_deltas(self.index.asi8)
  184. @cache_readonly
  185. def is_unique(self) -> bool:
  186. return len(self.deltas) == 1
  187. @cache_readonly
  188. def is_unique_asi8(self) -> bool:
  189. return len(self.deltas_asi8) == 1
  190. def get_freq(self) -> str | None:
  191. """
  192. Find the appropriate frequency string to describe the inferred
  193. frequency of self.i8values
  194. Returns
  195. -------
  196. str or None
  197. """
  198. if not self.is_monotonic or not self.index._is_unique:
  199. return None
  200. delta = self.deltas[0]
  201. ppd = periods_per_day(self._creso)
  202. if delta and _is_multiple(delta, ppd):
  203. return self._infer_daily_rule()
  204. # Business hourly, maybe. 17: one day / 65: one weekend
  205. if self.hour_deltas in ([1, 17], [1, 65], [1, 17, 65]):
  206. return "bh"
  207. # Possibly intraday frequency. Here we use the
  208. # original .asi8 values as the modified values
  209. # will not work around DST transitions. See #8772
  210. if not self.is_unique_asi8:
  211. return None
  212. delta = self.deltas_asi8[0]
  213. pph = ppd // 24
  214. ppm = pph // 60
  215. pps = ppm // 60
  216. if _is_multiple(delta, pph):
  217. # Hours
  218. return _maybe_add_count("h", delta / pph)
  219. elif _is_multiple(delta, ppm):
  220. # Minutes
  221. return _maybe_add_count("min", delta / ppm)
  222. elif _is_multiple(delta, pps):
  223. # Seconds
  224. return _maybe_add_count("s", delta / pps)
  225. elif _is_multiple(delta, (pps // 1000)):
  226. # Milliseconds
  227. return _maybe_add_count("ms", delta / (pps // 1000))
  228. elif _is_multiple(delta, (pps // 1_000_000)):
  229. # Microseconds
  230. return _maybe_add_count("us", delta / (pps // 1_000_000))
  231. else:
  232. # Nanoseconds
  233. return _maybe_add_count("ns", delta)
  234. @cache_readonly
  235. def day_deltas(self) -> list[int]:
  236. ppd = periods_per_day(self._creso)
  237. return [x / ppd for x in self.deltas]
  238. @cache_readonly
  239. def hour_deltas(self) -> list[int]:
  240. pph = periods_per_day(self._creso) // 24
  241. return [x / pph for x in self.deltas]
  242. @cache_readonly
  243. def fields(self) -> np.ndarray: # structured array of fields
  244. return build_field_sarray(self.i8values, reso=self._creso)
  245. @cache_readonly
  246. def rep_stamp(self) -> Timestamp:
  247. return Timestamp(self.i8values[0], unit=self.index.unit)
  248. def month_position_check(self) -> str | None:
  249. return month_position_check(self.fields, self.index.dayofweek)
  250. @cache_readonly
  251. def mdiffs(self) -> npt.NDArray[np.int64]:
  252. nmonths = self.fields["Y"] * 12 + self.fields["M"]
  253. return unique_deltas(nmonths.astype("i8"))
  254. @cache_readonly
  255. def ydiffs(self) -> npt.NDArray[np.int64]:
  256. return unique_deltas(self.fields["Y"].astype("i8"))
  257. def _infer_daily_rule(self) -> str | None:
  258. annual_rule = self._get_annual_rule()
  259. if annual_rule:
  260. nyears = self.ydiffs[0]
  261. month = MONTH_ALIASES[self.rep_stamp.month]
  262. alias = f"{annual_rule}-{month}"
  263. return _maybe_add_count(alias, nyears)
  264. quarterly_rule = self._get_quarterly_rule()
  265. if quarterly_rule:
  266. nquarters = self.mdiffs[0] / 3
  267. mod_dict = {0: 12, 2: 11, 1: 10}
  268. month = MONTH_ALIASES[mod_dict[self.rep_stamp.month % 3]]
  269. alias = f"{quarterly_rule}-{month}"
  270. return _maybe_add_count(alias, nquarters)
  271. monthly_rule = self._get_monthly_rule()
  272. if monthly_rule:
  273. return _maybe_add_count(monthly_rule, self.mdiffs[0])
  274. if self.is_unique:
  275. return self._get_daily_rule()
  276. if self._is_business_daily():
  277. return "B"
  278. wom_rule = self._get_wom_rule()
  279. if wom_rule:
  280. return wom_rule
  281. return None
  282. def _get_daily_rule(self) -> str | None:
  283. ppd = periods_per_day(self._creso)
  284. days = self.deltas[0] / ppd
  285. if days % 7 == 0:
  286. # Weekly
  287. wd = int_to_weekday[self.rep_stamp.weekday()]
  288. alias = f"W-{wd}"
  289. return _maybe_add_count(alias, days / 7)
  290. else:
  291. return _maybe_add_count("D", days)
  292. def _get_annual_rule(self) -> str | None:
  293. if len(self.ydiffs) > 1:
  294. return None
  295. if len(unique(self.fields["M"])) > 1:
  296. return None
  297. pos_check = self.month_position_check()
  298. if pos_check is None:
  299. return None
  300. else:
  301. return {"cs": "YS", "bs": "BYS", "ce": "YE", "be": "BYE"}.get(pos_check)
  302. def _get_quarterly_rule(self) -> str | None:
  303. if len(self.mdiffs) > 1:
  304. return None
  305. if not self.mdiffs[0] % 3 == 0:
  306. return None
  307. pos_check = self.month_position_check()
  308. if pos_check is None:
  309. return None
  310. else:
  311. return {"cs": "QS", "bs": "BQS", "ce": "QE", "be": "BQE"}.get(pos_check)
  312. def _get_monthly_rule(self) -> str | None:
  313. if len(self.mdiffs) > 1:
  314. return None
  315. pos_check = self.month_position_check()
  316. if pos_check is None:
  317. return None
  318. else:
  319. return {"cs": "MS", "bs": "BMS", "ce": "ME", "be": "BME"}.get(pos_check)
  320. def _is_business_daily(self) -> bool:
  321. # quick check: cannot be business daily
  322. if self.day_deltas != [1, 3]:
  323. return False
  324. # probably business daily, but need to confirm
  325. first_weekday = self.index[0].weekday()
  326. shifts = np.diff(self.i8values)
  327. ppd = periods_per_day(self._creso)
  328. shifts = np.floor_divide(shifts, ppd)
  329. weekdays = np.mod(first_weekday + np.cumsum(shifts), 7)
  330. return bool(
  331. np.all(
  332. ((weekdays == 0) & (shifts == 3))
  333. | ((weekdays > 0) & (weekdays <= 4) & (shifts == 1))
  334. )
  335. )
  336. def _get_wom_rule(self) -> str | None:
  337. weekdays = unique(self.index.weekday)
  338. if len(weekdays) > 1:
  339. return None
  340. week_of_months = unique((self.index.day - 1) // 7)
  341. # Only attempt to infer up to WOM-4. See #9425
  342. week_of_months = week_of_months[week_of_months < 4]
  343. if len(week_of_months) == 0 or len(week_of_months) > 1:
  344. return None
  345. # get which week
  346. week = week_of_months[0] + 1
  347. wd = int_to_weekday[weekdays[0]]
  348. return f"WOM-{week}{wd}"
  349. class _TimedeltaFrequencyInferer(_FrequencyInferer):
  350. def _infer_daily_rule(self):
  351. if self.is_unique:
  352. return self._get_daily_rule()
  353. def _is_multiple(us, mult: int) -> bool:
  354. return us % mult == 0
  355. def _maybe_add_count(base: str, count: float) -> str:
  356. if count != 1:
  357. assert count == int(count)
  358. count = int(count)
  359. return f"{count}{base}"
  360. else:
  361. return base
  362. # ----------------------------------------------------------------------
  363. # Frequency comparison
  364. def is_subperiod(source, target) -> bool:
  365. """
  366. Returns True if downsampling is possible between source and target
  367. frequencies
  368. Parameters
  369. ----------
  370. source : str or DateOffset
  371. Frequency converting from
  372. target : str or DateOffset
  373. Frequency converting to
  374. Returns
  375. -------
  376. bool
  377. """
  378. if target is None or source is None:
  379. return False
  380. source = _maybe_coerce_freq(source)
  381. target = _maybe_coerce_freq(target)
  382. if _is_annual(target):
  383. if _is_quarterly(source):
  384. return _quarter_months_conform(
  385. get_rule_month(source), get_rule_month(target)
  386. )
  387. return source in {"D", "C", "B", "M", "h", "min", "s", "ms", "us", "ns"}
  388. elif _is_quarterly(target):
  389. return source in {"D", "C", "B", "M", "h", "min", "s", "ms", "us", "ns"}
  390. elif _is_monthly(target):
  391. return source in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"}
  392. elif _is_weekly(target):
  393. return source in {target, "D", "C", "B", "h", "min", "s", "ms", "us", "ns"}
  394. elif target == "B":
  395. return source in {"B", "h", "min", "s", "ms", "us", "ns"}
  396. elif target == "C":
  397. return source in {"C", "h", "min", "s", "ms", "us", "ns"}
  398. elif target == "D":
  399. return source in {"D", "h", "min", "s", "ms", "us", "ns"}
  400. elif target == "h":
  401. return source in {"h", "min", "s", "ms", "us", "ns"}
  402. elif target == "min":
  403. return source in {"min", "s", "ms", "us", "ns"}
  404. elif target == "s":
  405. return source in {"s", "ms", "us", "ns"}
  406. elif target == "ms":
  407. return source in {"ms", "us", "ns"}
  408. elif target == "us":
  409. return source in {"us", "ns"}
  410. elif target == "ns":
  411. return source in {"ns"}
  412. else:
  413. return False
  414. def is_superperiod(source, target) -> bool:
  415. """
  416. Returns True if upsampling is possible between source and target
  417. frequencies
  418. Parameters
  419. ----------
  420. source : str or DateOffset
  421. Frequency converting from
  422. target : str or DateOffset
  423. Frequency converting to
  424. Returns
  425. -------
  426. bool
  427. """
  428. if target is None or source is None:
  429. return False
  430. source = _maybe_coerce_freq(source)
  431. target = _maybe_coerce_freq(target)
  432. if _is_annual(source):
  433. if _is_annual(target):
  434. return get_rule_month(source) == get_rule_month(target)
  435. if _is_quarterly(target):
  436. smonth = get_rule_month(source)
  437. tmonth = get_rule_month(target)
  438. return _quarter_months_conform(smonth, tmonth)
  439. return target in {"D", "C", "B", "M", "h", "min", "s", "ms", "us", "ns"}
  440. elif _is_quarterly(source):
  441. return target in {"D", "C", "B", "M", "h", "min", "s", "ms", "us", "ns"}
  442. elif _is_monthly(source):
  443. return target in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"}
  444. elif _is_weekly(source):
  445. return target in {source, "D", "C", "B", "h", "min", "s", "ms", "us", "ns"}
  446. elif source == "B":
  447. return target in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"}
  448. elif source == "C":
  449. return target in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"}
  450. elif source == "D":
  451. return target in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"}
  452. elif source == "h":
  453. return target in {"h", "min", "s", "ms", "us", "ns"}
  454. elif source == "min":
  455. return target in {"min", "s", "ms", "us", "ns"}
  456. elif source == "s":
  457. return target in {"s", "ms", "us", "ns"}
  458. elif source == "ms":
  459. return target in {"ms", "us", "ns"}
  460. elif source == "us":
  461. return target in {"us", "ns"}
  462. elif source == "ns":
  463. return target in {"ns"}
  464. else:
  465. return False
  466. def _maybe_coerce_freq(code) -> str:
  467. """we might need to coerce a code to a rule_code
  468. and uppercase it
  469. Parameters
  470. ----------
  471. source : str or DateOffset
  472. Frequency converting from
  473. Returns
  474. -------
  475. str
  476. """
  477. assert code is not None
  478. if isinstance(code, DateOffset):
  479. code = PeriodDtype(to_offset(code.name))._freqstr
  480. if code in {"h", "min", "s", "ms", "us", "ns"}:
  481. return code
  482. else:
  483. return code.upper()
  484. def _quarter_months_conform(source: str, target: str) -> bool:
  485. snum = MONTH_NUMBERS[source]
  486. tnum = MONTH_NUMBERS[target]
  487. return snum % 3 == tnum % 3
  488. def _is_annual(rule: str) -> bool:
  489. rule = rule.upper()
  490. return rule == "Y" or rule.startswith("Y-")
  491. def _is_quarterly(rule: str) -> bool:
  492. rule = rule.upper()
  493. return rule == "Q" or rule.startswith(("Q-", "BQ"))
  494. def _is_monthly(rule: str) -> bool:
  495. rule = rule.upper()
  496. return rule in ("M", "BM")
  497. def _is_weekly(rule: str) -> bool:
  498. rule = rule.upper()
  499. return rule == "W" or rule.startswith("W-")
  500. __all__ = [
  501. "Day",
  502. "get_period_alias",
  503. "infer_freq",
  504. "is_subperiod",
  505. "is_superperiod",
  506. "to_offset",
  507. ]