test_inference.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558
  1. from datetime import (
  2. datetime,
  3. timedelta,
  4. )
  5. import numpy as np
  6. import pytest
  7. from pandas._libs.tslibs.ccalendar import (
  8. DAYS,
  9. MONTHS,
  10. )
  11. from pandas._libs.tslibs.offsets import _get_offset
  12. from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG
  13. from pandas.compat import is_platform_windows
  14. from pandas import (
  15. DatetimeIndex,
  16. Index,
  17. RangeIndex,
  18. Series,
  19. Timestamp,
  20. date_range,
  21. period_range,
  22. )
  23. import pandas._testing as tm
  24. from pandas.core.arrays import (
  25. DatetimeArray,
  26. TimedeltaArray,
  27. )
  28. from pandas.core.tools.datetimes import to_datetime
  29. from pandas.tseries import (
  30. frequencies,
  31. offsets,
  32. )
  33. @pytest.fixture(
  34. params=[
  35. (timedelta(1), "D"),
  36. (timedelta(hours=1), "h"),
  37. (timedelta(minutes=1), "min"),
  38. (timedelta(seconds=1), "s"),
  39. (np.timedelta64(1, "ns"), "ns"),
  40. (timedelta(microseconds=1), "us"),
  41. (timedelta(microseconds=1000), "ms"),
  42. ]
  43. )
  44. def base_delta_code_pair(request):
  45. return request.param
  46. freqs = (
  47. [f"QE-{month}" for month in MONTHS]
  48. + [f"{annual}-{month}" for annual in ["YE", "BYE"] for month in MONTHS]
  49. + ["ME", "BME", "BMS"]
  50. + [f"WOM-{count}{day}" for count in range(1, 5) for day in DAYS]
  51. + [f"W-{day}" for day in DAYS]
  52. )
  53. @pytest.mark.parametrize("freq", freqs)
  54. @pytest.mark.parametrize("periods", [5, 7])
  55. def test_infer_freq_range(periods, freq):
  56. freq = freq.upper()
  57. gen = date_range("1/1/2000", periods=periods, freq=freq)
  58. index = DatetimeIndex(gen.values)
  59. if not freq.startswith("QE-"):
  60. assert frequencies.infer_freq(index) == gen.freqstr
  61. else:
  62. inf_freq = frequencies.infer_freq(index)
  63. is_dec_range = inf_freq == "QE-DEC" and gen.freqstr in (
  64. "QE",
  65. "QE-DEC",
  66. "QE-SEP",
  67. "QE-JUN",
  68. "QE-MAR",
  69. )
  70. is_nov_range = inf_freq == "QE-NOV" and gen.freqstr in (
  71. "QE-NOV",
  72. "QE-AUG",
  73. "QE-MAY",
  74. "QE-FEB",
  75. )
  76. is_oct_range = inf_freq == "QE-OCT" and gen.freqstr in (
  77. "QE-OCT",
  78. "QE-JUL",
  79. "QE-APR",
  80. "QE-JAN",
  81. )
  82. assert is_dec_range or is_nov_range or is_oct_range
  83. def test_raise_if_period_index():
  84. index = period_range(start="1/1/1990", periods=20, freq="M")
  85. msg = "Check the `freq` attribute instead of using infer_freq"
  86. with pytest.raises(TypeError, match=msg):
  87. frequencies.infer_freq(index)
  88. def test_raise_if_too_few():
  89. index = DatetimeIndex(["12/31/1998", "1/3/1999"])
  90. msg = "Need at least 3 dates to infer frequency"
  91. with pytest.raises(ValueError, match=msg):
  92. frequencies.infer_freq(index)
  93. def test_business_daily():
  94. index = DatetimeIndex(["01/01/1999", "1/4/1999", "1/5/1999"])
  95. assert frequencies.infer_freq(index) == "B"
  96. def test_business_daily_look_alike():
  97. # see gh-16624
  98. #
  99. # Do not infer "B when "weekend" (2-day gap) in wrong place.
  100. index = DatetimeIndex(["12/31/1998", "1/3/1999", "1/4/1999"])
  101. assert frequencies.infer_freq(index) is None
  102. def test_day_corner():
  103. index = DatetimeIndex(["1/1/2000", "1/2/2000", "1/3/2000"])
  104. assert frequencies.infer_freq(index) == "D"
  105. def test_non_datetime_index():
  106. dates = to_datetime(["1/1/2000", "1/2/2000", "1/3/2000"])
  107. assert frequencies.infer_freq(dates) == "D"
  108. def test_fifth_week_of_month_infer():
  109. # see gh-9425
  110. #
  111. # Only attempt to infer up to WOM-4.
  112. index = DatetimeIndex(["2014-03-31", "2014-06-30", "2015-03-30"])
  113. assert frequencies.infer_freq(index) is None
  114. def test_week_of_month_fake():
  115. # All of these dates are on same day
  116. # of week and are 4 or 5 weeks apart.
  117. index = DatetimeIndex(["2013-08-27", "2013-10-01", "2013-10-29", "2013-11-26"])
  118. assert frequencies.infer_freq(index) != "WOM-4TUE"
  119. def test_fifth_week_of_month():
  120. # see gh-9425
  121. #
  122. # Only supports freq up to WOM-4.
  123. msg = (
  124. "Of the four parameters: start, end, periods, "
  125. "and freq, exactly three must be specified"
  126. )
  127. with pytest.raises(ValueError, match=msg):
  128. date_range("2014-01-01", freq="WOM-5MON")
  129. def test_monthly_ambiguous():
  130. rng = DatetimeIndex(["1/31/2000", "2/29/2000", "3/31/2000"])
  131. assert rng.inferred_freq == "ME"
  132. def test_annual_ambiguous():
  133. rng = DatetimeIndex(["1/31/2000", "1/31/2001", "1/31/2002"])
  134. assert rng.inferred_freq == "YE-JAN"
  135. @pytest.mark.parametrize("count", range(1, 5))
  136. def test_infer_freq_delta(base_delta_code_pair, count):
  137. b = Timestamp(datetime.now())
  138. base_delta, code = base_delta_code_pair
  139. inc = base_delta * count
  140. index = DatetimeIndex([b + inc * j for j in range(3)])
  141. exp_freq = f"{count:d}{code}" if count > 1 else code
  142. assert frequencies.infer_freq(index) == exp_freq
  143. @pytest.mark.parametrize(
  144. "constructor",
  145. [
  146. lambda now, delta: DatetimeIndex(
  147. [now + delta * 7] + [now + delta * j for j in range(3)]
  148. ),
  149. lambda now, delta: DatetimeIndex(
  150. [now + delta * j for j in range(3)] + [now + delta * 7]
  151. ),
  152. ],
  153. )
  154. def test_infer_freq_custom(base_delta_code_pair, constructor):
  155. b = Timestamp(datetime.now())
  156. base_delta, _ = base_delta_code_pair
  157. index = constructor(b, base_delta)
  158. assert frequencies.infer_freq(index) is None
  159. @pytest.mark.parametrize(
  160. "freq,expected", [("Q", "QE-DEC"), ("Q-NOV", "QE-NOV"), ("Q-OCT", "QE-OCT")]
  161. )
  162. def test_infer_freq_index(freq, expected):
  163. rng = period_range("1959Q2", "2009Q3", freq=freq)
  164. with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
  165. rng = Index(rng.to_timestamp("D", how="e").astype(object))
  166. assert rng.inferred_freq == expected
  167. @pytest.mark.parametrize(
  168. "expected,dates",
  169. list(
  170. {
  171. "YS-JAN": ["2009-01-01", "2010-01-01", "2011-01-01", "2012-01-01"],
  172. "QE-OCT": ["2009-01-31", "2009-04-30", "2009-07-31", "2009-10-31"],
  173. "ME": ["2010-11-30", "2010-12-31", "2011-01-31", "2011-02-28"],
  174. "W-SAT": ["2010-12-25", "2011-01-01", "2011-01-08", "2011-01-15"],
  175. "D": ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"],
  176. "h": [
  177. "2011-12-31 22:00",
  178. "2011-12-31 23:00",
  179. "2012-01-01 00:00",
  180. "2012-01-01 01:00",
  181. ],
  182. }.items()
  183. ),
  184. )
  185. @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"])
  186. def test_infer_freq_tz(tz_naive_fixture, expected, dates, unit):
  187. # see gh-7310, GH#55609
  188. tz = tz_naive_fixture
  189. idx = DatetimeIndex(dates, tz=tz).as_unit(unit)
  190. assert idx.inferred_freq == expected
  191. def test_infer_freq_tz_series(tz_naive_fixture):
  192. # infer_freq should work with both tz-naive and tz-aware series. See gh-52456
  193. tz = tz_naive_fixture
  194. idx = date_range("2021-01-01", "2021-01-04", tz=tz)
  195. series = idx.to_series().reset_index(drop=True)
  196. inferred_freq = frequencies.infer_freq(series)
  197. assert inferred_freq == "D"
  198. @pytest.mark.parametrize(
  199. "date_pair",
  200. [
  201. ["2013-11-02", "2013-11-5"], # Fall DST
  202. ["2014-03-08", "2014-03-11"], # Spring DST
  203. ["2014-01-01", "2014-01-03"], # Regular Time
  204. ],
  205. )
  206. @pytest.mark.parametrize(
  207. "freq",
  208. ["h", "3h", "10min", "3601s", "3600001ms", "3600000001us", "3600000000001ns"],
  209. )
  210. def test_infer_freq_tz_transition(tz_naive_fixture, date_pair, freq):
  211. # see gh-8772
  212. tz = tz_naive_fixture
  213. idx = date_range(date_pair[0], date_pair[1], freq=freq, tz=tz)
  214. assert idx.inferred_freq == freq
  215. def test_infer_freq_tz_transition_custom():
  216. index = date_range("2013-11-03", periods=5, freq="3h").tz_localize(
  217. "America/Chicago"
  218. )
  219. assert index.inferred_freq is None
  220. @pytest.mark.parametrize(
  221. "data,expected",
  222. [
  223. # Hourly freq in a day must result in "h"
  224. (
  225. [
  226. "2014-07-01 09:00",
  227. "2014-07-01 10:00",
  228. "2014-07-01 11:00",
  229. "2014-07-01 12:00",
  230. "2014-07-01 13:00",
  231. "2014-07-01 14:00",
  232. ],
  233. "h",
  234. ),
  235. (
  236. [
  237. "2014-07-01 09:00",
  238. "2014-07-01 10:00",
  239. "2014-07-01 11:00",
  240. "2014-07-01 12:00",
  241. "2014-07-01 13:00",
  242. "2014-07-01 14:00",
  243. "2014-07-01 15:00",
  244. "2014-07-01 16:00",
  245. "2014-07-02 09:00",
  246. "2014-07-02 10:00",
  247. "2014-07-02 11:00",
  248. ],
  249. "bh",
  250. ),
  251. (
  252. [
  253. "2014-07-04 09:00",
  254. "2014-07-04 10:00",
  255. "2014-07-04 11:00",
  256. "2014-07-04 12:00",
  257. "2014-07-04 13:00",
  258. "2014-07-04 14:00",
  259. "2014-07-04 15:00",
  260. "2014-07-04 16:00",
  261. "2014-07-07 09:00",
  262. "2014-07-07 10:00",
  263. "2014-07-07 11:00",
  264. ],
  265. "bh",
  266. ),
  267. (
  268. [
  269. "2014-07-04 09:00",
  270. "2014-07-04 10:00",
  271. "2014-07-04 11:00",
  272. "2014-07-04 12:00",
  273. "2014-07-04 13:00",
  274. "2014-07-04 14:00",
  275. "2014-07-04 15:00",
  276. "2014-07-04 16:00",
  277. "2014-07-07 09:00",
  278. "2014-07-07 10:00",
  279. "2014-07-07 11:00",
  280. "2014-07-07 12:00",
  281. "2014-07-07 13:00",
  282. "2014-07-07 14:00",
  283. "2014-07-07 15:00",
  284. "2014-07-07 16:00",
  285. "2014-07-08 09:00",
  286. "2014-07-08 10:00",
  287. "2014-07-08 11:00",
  288. "2014-07-08 12:00",
  289. "2014-07-08 13:00",
  290. "2014-07-08 14:00",
  291. "2014-07-08 15:00",
  292. "2014-07-08 16:00",
  293. ],
  294. "bh",
  295. ),
  296. ],
  297. )
  298. def test_infer_freq_business_hour(data, expected):
  299. # see gh-7905
  300. idx = DatetimeIndex(data)
  301. assert idx.inferred_freq == expected
  302. def test_not_monotonic():
  303. rng = DatetimeIndex(["1/31/2000", "1/31/2001", "1/31/2002"])
  304. rng = rng[::-1]
  305. assert rng.inferred_freq == "-1YE-JAN"
  306. def test_non_datetime_index2():
  307. rng = DatetimeIndex(["1/31/2000", "1/31/2001", "1/31/2002"])
  308. vals = rng.to_pydatetime()
  309. result = frequencies.infer_freq(vals)
  310. assert result == rng.inferred_freq
  311. @pytest.mark.parametrize(
  312. "idx",
  313. [
  314. Index(np.arange(5), dtype=np.int64),
  315. Index(np.arange(5), dtype=np.float64),
  316. period_range("2020-01-01", periods=5),
  317. RangeIndex(5),
  318. ],
  319. )
  320. def test_invalid_index_types(idx):
  321. # see gh-48439
  322. msg = "|".join(
  323. [
  324. "cannot infer freq from a non-convertible",
  325. "Check the `freq` attribute instead of using infer_freq",
  326. ]
  327. )
  328. with pytest.raises(TypeError, match=msg):
  329. frequencies.infer_freq(idx)
  330. @pytest.mark.skipif(is_platform_windows(), reason="see gh-10822: Windows issue")
  331. def test_invalid_index_types_unicode():
  332. # see gh-10822
  333. #
  334. # Odd error message on conversions to datetime for unicode.
  335. msg = "Unknown datetime string format"
  336. with pytest.raises(ValueError, match=msg):
  337. frequencies.infer_freq(Index(["ZqgszYBfuL"]))
  338. def test_string_datetime_like_compat():
  339. # see gh-6463
  340. data = ["2004-01", "2004-02", "2004-03", "2004-04"]
  341. expected = frequencies.infer_freq(data)
  342. result = frequencies.infer_freq(Index(data))
  343. assert result == expected
  344. def test_series():
  345. # see gh-6407
  346. s = Series(date_range("20130101", "20130110"))
  347. inferred = frequencies.infer_freq(s)
  348. assert inferred == "D"
  349. @pytest.mark.parametrize("end", [10, 10.0])
  350. def test_series_invalid_type(end):
  351. # see gh-6407
  352. msg = "cannot infer freq from a non-convertible dtype on a Series"
  353. s = Series(np.arange(end))
  354. with pytest.raises(TypeError, match=msg):
  355. frequencies.infer_freq(s)
  356. def test_series_inconvertible_string(using_infer_string):
  357. # see gh-6407
  358. if using_infer_string:
  359. msg = "cannot infer freq from"
  360. with pytest.raises(TypeError, match=msg):
  361. frequencies.infer_freq(Series(["foo", "bar"]))
  362. else:
  363. msg = "Unknown datetime string format"
  364. with pytest.raises(ValueError, match=msg):
  365. frequencies.infer_freq(Series(["foo", "bar"]))
  366. @pytest.mark.parametrize("freq", [None, "ms"])
  367. def test_series_period_index(freq):
  368. # see gh-6407
  369. #
  370. # Cannot infer on PeriodIndex
  371. msg = "cannot infer freq from a non-convertible dtype on a Series"
  372. s = Series(period_range("2013", periods=10, freq=freq))
  373. with pytest.raises(TypeError, match=msg):
  374. frequencies.infer_freq(s)
  375. @pytest.mark.parametrize("freq", ["ME", "ms", "s"])
  376. def test_series_datetime_index(freq):
  377. s = Series(date_range("20130101", periods=10, freq=freq))
  378. inferred = frequencies.infer_freq(s)
  379. assert inferred == freq
  380. @pytest.mark.parametrize(
  381. "offset_func",
  382. [
  383. _get_offset,
  384. lambda freq: date_range("2011-01-01", periods=5, freq=freq),
  385. ],
  386. )
  387. @pytest.mark.parametrize(
  388. "freq",
  389. [
  390. "WEEKDAY",
  391. "EOM",
  392. "W@MON",
  393. "W@TUE",
  394. "W@WED",
  395. "W@THU",
  396. "W@FRI",
  397. "W@SAT",
  398. "W@SUN",
  399. "QE@JAN",
  400. "QE@FEB",
  401. "QE@MAR",
  402. "YE@JAN",
  403. "YE@FEB",
  404. "YE@MAR",
  405. "YE@APR",
  406. "YE@MAY",
  407. "YE@JUN",
  408. "YE@JUL",
  409. "YE@AUG",
  410. "YE@SEP",
  411. "YE@OCT",
  412. "YE@NOV",
  413. "YE@DEC",
  414. "YE@JAN",
  415. "WOM@1MON",
  416. "WOM@2MON",
  417. "WOM@3MON",
  418. "WOM@4MON",
  419. "WOM@1TUE",
  420. "WOM@2TUE",
  421. "WOM@3TUE",
  422. "WOM@4TUE",
  423. "WOM@1WED",
  424. "WOM@2WED",
  425. "WOM@3WED",
  426. "WOM@4WED",
  427. "WOM@1THU",
  428. "WOM@2THU",
  429. "WOM@3THU",
  430. "WOM@4THU",
  431. "WOM@1FRI",
  432. "WOM@2FRI",
  433. "WOM@3FRI",
  434. "WOM@4FRI",
  435. ],
  436. )
  437. def test_legacy_offset_warnings(offset_func, freq):
  438. with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG):
  439. offset_func(freq)
  440. def test_ms_vs_capital_ms():
  441. left = _get_offset("ms")
  442. right = _get_offset("MS")
  443. assert left == offsets.Milli()
  444. assert right == offsets.MonthBegin()
  445. def test_infer_freq_non_nano():
  446. arr = np.arange(10).astype(np.int64).view("M8[s]")
  447. dta = DatetimeArray._simple_new(arr, dtype=arr.dtype)
  448. res = frequencies.infer_freq(dta)
  449. assert res == "s"
  450. arr2 = arr.view("m8[ms]")
  451. tda = TimedeltaArray._simple_new(arr2, dtype=arr2.dtype)
  452. res2 = frequencies.infer_freq(tda)
  453. assert res2 == "ms"
  454. def test_infer_freq_non_nano_tzaware(tz_aware_fixture):
  455. tz = tz_aware_fixture
  456. dti = date_range("2016-01-01", periods=365, freq="B", tz=tz)
  457. dta = dti._data.as_unit("s")
  458. res = frequencies.infer_freq(dta)
  459. assert res == "B"