test_parsing.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
  1. """
  2. Tests for Timestamp parsing, aimed at pandas/_libs/tslibs/parsing.pyx
  3. """
  4. from datetime import datetime
  5. import re
  6. from dateutil.parser import parse as du_parse
  7. from dateutil.tz import tzlocal
  8. from hypothesis import given
  9. import numpy as np
  10. import pytest
  11. from pandas._libs.tslibs import (
  12. parsing,
  13. strptime,
  14. )
  15. from pandas._libs.tslibs.parsing import parse_datetime_string_with_reso
  16. from pandas.compat import (
  17. ISMUSL,
  18. is_platform_arm,
  19. is_platform_windows,
  20. )
  21. import pandas.util._test_decorators as td
  22. import pandas._testing as tm
  23. from pandas._testing._hypothesis import DATETIME_NO_TZ
  24. @pytest.mark.skipif(
  25. is_platform_windows() or ISMUSL or is_platform_arm(),
  26. reason="TZ setting incorrect on Windows and MUSL Linux",
  27. )
  28. def test_parsing_tzlocal_deprecated():
  29. # GH#50791
  30. msg = (
  31. "Parsing 'EST' as tzlocal.*"
  32. "Pass the 'tz' keyword or call tz_localize after construction instead"
  33. )
  34. dtstr = "Jan 15 2004 03:00 EST"
  35. with tm.set_timezone("US/Eastern"):
  36. with tm.assert_produces_warning(FutureWarning, match=msg):
  37. res, _ = parse_datetime_string_with_reso(dtstr)
  38. assert isinstance(res.tzinfo, tzlocal)
  39. with tm.assert_produces_warning(FutureWarning, match=msg):
  40. res = parsing.py_parse_datetime_string(dtstr)
  41. assert isinstance(res.tzinfo, tzlocal)
  42. def test_parse_datetime_string_with_reso():
  43. (parsed, reso) = parse_datetime_string_with_reso("4Q1984")
  44. (parsed_lower, reso_lower) = parse_datetime_string_with_reso("4q1984")
  45. assert reso == reso_lower
  46. assert parsed == parsed_lower
  47. def test_parse_datetime_string_with_reso_nanosecond_reso():
  48. # GH#46811
  49. parsed, reso = parse_datetime_string_with_reso("2022-04-20 09:19:19.123456789")
  50. assert reso == "nanosecond"
  51. def test_parse_datetime_string_with_reso_invalid_type():
  52. # Raise on invalid input, don't just return it
  53. msg = "Argument 'date_string' has incorrect type (expected str, got tuple)"
  54. with pytest.raises(TypeError, match=re.escape(msg)):
  55. parse_datetime_string_with_reso((4, 5))
  56. @pytest.mark.parametrize(
  57. "dashed,normal", [("1988-Q2", "1988Q2"), ("2Q-1988", "2Q1988")]
  58. )
  59. def test_parse_time_quarter_with_dash(dashed, normal):
  60. # see gh-9688
  61. (parsed_dash, reso_dash) = parse_datetime_string_with_reso(dashed)
  62. (parsed, reso) = parse_datetime_string_with_reso(normal)
  63. assert parsed_dash == parsed
  64. assert reso_dash == reso
  65. @pytest.mark.parametrize("dashed", ["-2Q1992", "2-Q1992", "4-4Q1992"])
  66. def test_parse_time_quarter_with_dash_error(dashed):
  67. msg = f"Unknown datetime string format, unable to parse: {dashed}"
  68. with pytest.raises(parsing.DateParseError, match=msg):
  69. parse_datetime_string_with_reso(dashed)
  70. @pytest.mark.parametrize(
  71. "date_string,expected",
  72. [
  73. ("123.1234", False),
  74. ("-50000", False),
  75. ("999", False),
  76. ("m", False),
  77. ("T", False),
  78. ("Mon Sep 16, 2013", True),
  79. ("2012-01-01", True),
  80. ("01/01/2012", True),
  81. ("01012012", True),
  82. ("0101", True),
  83. ("1-1", True),
  84. ],
  85. )
  86. def test_does_not_convert_mixed_integer(date_string, expected):
  87. assert parsing._does_string_look_like_datetime(date_string) is expected
  88. @pytest.mark.parametrize(
  89. "date_str,kwargs,msg",
  90. [
  91. (
  92. "2013Q5",
  93. {},
  94. (
  95. "Incorrect quarterly string is given, "
  96. "quarter must be between 1 and 4: 2013Q5"
  97. ),
  98. ),
  99. # see gh-5418
  100. (
  101. "2013Q1",
  102. {"freq": "INVLD-L-DEC-SAT"},
  103. (
  104. "Unable to retrieve month information "
  105. "from given freq: INVLD-L-DEC-SAT"
  106. ),
  107. ),
  108. ],
  109. )
  110. def test_parsers_quarterly_with_freq_error(date_str, kwargs, msg):
  111. with pytest.raises(parsing.DateParseError, match=msg):
  112. parsing.parse_datetime_string_with_reso(date_str, **kwargs)
  113. @pytest.mark.parametrize(
  114. "date_str,freq,expected",
  115. [
  116. ("2013Q2", None, datetime(2013, 4, 1)),
  117. ("2013Q2", "Y-APR", datetime(2012, 8, 1)),
  118. ("2013-Q2", "Y-DEC", datetime(2013, 4, 1)),
  119. ],
  120. )
  121. def test_parsers_quarterly_with_freq(date_str, freq, expected):
  122. result, _ = parsing.parse_datetime_string_with_reso(date_str, freq=freq)
  123. assert result == expected
  124. @pytest.mark.parametrize(
  125. "date_str", ["2Q 2005", "2Q-200Y", "2Q-200", "22Q2005", "2Q200.", "6Q-20"]
  126. )
  127. def test_parsers_quarter_invalid(date_str):
  128. if date_str == "6Q-20":
  129. msg = (
  130. "Incorrect quarterly string is given, quarter "
  131. f"must be between 1 and 4: {date_str}"
  132. )
  133. else:
  134. msg = f"Unknown datetime string format, unable to parse: {date_str}"
  135. with pytest.raises(ValueError, match=msg):
  136. parsing.parse_datetime_string_with_reso(date_str)
  137. @pytest.mark.parametrize(
  138. "date_str,expected",
  139. [("201101", datetime(2011, 1, 1, 0, 0)), ("200005", datetime(2000, 5, 1, 0, 0))],
  140. )
  141. def test_parsers_month_freq(date_str, expected):
  142. result, _ = parsing.parse_datetime_string_with_reso(date_str, freq="ME")
  143. assert result == expected
  144. @td.skip_if_not_us_locale
  145. @pytest.mark.parametrize(
  146. "string,fmt",
  147. [
  148. ("20111230", "%Y%m%d"),
  149. ("201112300000", "%Y%m%d%H%M"),
  150. ("20111230000000", "%Y%m%d%H%M%S"),
  151. ("20111230T00", "%Y%m%dT%H"),
  152. ("20111230T0000", "%Y%m%dT%H%M"),
  153. ("20111230T000000", "%Y%m%dT%H%M%S"),
  154. ("2011-12-30", "%Y-%m-%d"),
  155. ("2011", "%Y"),
  156. ("2011-01", "%Y-%m"),
  157. ("30-12-2011", "%d-%m-%Y"),
  158. ("2011-12-30 00:00:00", "%Y-%m-%d %H:%M:%S"),
  159. ("2011-12-30T00:00:00", "%Y-%m-%dT%H:%M:%S"),
  160. ("2011-12-30T00:00:00UTC", "%Y-%m-%dT%H:%M:%S%Z"),
  161. ("2011-12-30T00:00:00Z", "%Y-%m-%dT%H:%M:%S%z"),
  162. ("2011-12-30T00:00:00+9", "%Y-%m-%dT%H:%M:%S%z"),
  163. ("2011-12-30T00:00:00+09", "%Y-%m-%dT%H:%M:%S%z"),
  164. ("2011-12-30T00:00:00+090", None),
  165. ("2011-12-30T00:00:00+0900", "%Y-%m-%dT%H:%M:%S%z"),
  166. ("2011-12-30T00:00:00-0900", "%Y-%m-%dT%H:%M:%S%z"),
  167. ("2011-12-30T00:00:00+09:00", "%Y-%m-%dT%H:%M:%S%z"),
  168. ("2011-12-30T00:00:00+09:000", None),
  169. ("2011-12-30T00:00:00+9:0", "%Y-%m-%dT%H:%M:%S%z"),
  170. ("2011-12-30T00:00:00+09:", None),
  171. ("2011-12-30T00:00:00.000000UTC", "%Y-%m-%dT%H:%M:%S.%f%Z"),
  172. ("2011-12-30T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%f%z"),
  173. ("2011-12-30T00:00:00.000000+9", "%Y-%m-%dT%H:%M:%S.%f%z"),
  174. ("2011-12-30T00:00:00.000000+09", "%Y-%m-%dT%H:%M:%S.%f%z"),
  175. ("2011-12-30T00:00:00.000000+090", None),
  176. ("2011-12-30T00:00:00.000000+0900", "%Y-%m-%dT%H:%M:%S.%f%z"),
  177. ("2011-12-30T00:00:00.000000-0900", "%Y-%m-%dT%H:%M:%S.%f%z"),
  178. ("2011-12-30T00:00:00.000000+09:00", "%Y-%m-%dT%H:%M:%S.%f%z"),
  179. ("2011-12-30T00:00:00.000000+09:000", None),
  180. ("2011-12-30T00:00:00.000000+9:0", "%Y-%m-%dT%H:%M:%S.%f%z"),
  181. ("2011-12-30T00:00:00.000000+09:", None),
  182. ("2011-12-30 00:00:00.000000", "%Y-%m-%d %H:%M:%S.%f"),
  183. ("Tue 24 Aug 2021 01:30:48", "%a %d %b %Y %H:%M:%S"),
  184. ("Tuesday 24 Aug 2021 01:30:48", "%A %d %b %Y %H:%M:%S"),
  185. ("Tue 24 Aug 2021 01:30:48 AM", "%a %d %b %Y %I:%M:%S %p"),
  186. ("Tuesday 24 Aug 2021 01:30:48 AM", "%A %d %b %Y %I:%M:%S %p"),
  187. ("27.03.2003 14:55:00.000", "%d.%m.%Y %H:%M:%S.%f"), # GH50317
  188. ],
  189. )
  190. def test_guess_datetime_format_with_parseable_formats(string, fmt):
  191. with tm.maybe_produces_warning(
  192. UserWarning, fmt is not None and re.search(r"%d.*%m", fmt)
  193. ):
  194. result = parsing.guess_datetime_format(string)
  195. assert result == fmt
  196. @pytest.mark.parametrize("dayfirst,expected", [(True, "%d/%m/%Y"), (False, "%m/%d/%Y")])
  197. def test_guess_datetime_format_with_dayfirst(dayfirst, expected):
  198. ambiguous_string = "01/01/2011"
  199. result = parsing.guess_datetime_format(ambiguous_string, dayfirst=dayfirst)
  200. assert result == expected
  201. @td.skip_if_not_us_locale
  202. @pytest.mark.parametrize(
  203. "string,fmt",
  204. [
  205. ("30/Dec/2011", "%d/%b/%Y"),
  206. ("30/December/2011", "%d/%B/%Y"),
  207. ("30/Dec/2011 00:00:00", "%d/%b/%Y %H:%M:%S"),
  208. ],
  209. )
  210. def test_guess_datetime_format_with_locale_specific_formats(string, fmt):
  211. result = parsing.guess_datetime_format(string)
  212. assert result == fmt
  213. @pytest.mark.parametrize(
  214. "invalid_dt",
  215. [
  216. "01/2013",
  217. "12:00:00",
  218. "1/1/1/1",
  219. "this_is_not_a_datetime",
  220. "51a",
  221. "13/2019",
  222. "202001", # YYYYMM isn't ISO8601
  223. "2020/01", # YYYY/MM isn't ISO8601 either
  224. "87156549591102612381000001219H5",
  225. ],
  226. )
  227. def test_guess_datetime_format_invalid_inputs(invalid_dt):
  228. # A datetime string must include a year, month and a day for it to be
  229. # guessable, in addition to being a string that looks like a datetime.
  230. assert parsing.guess_datetime_format(invalid_dt) is None
  231. @pytest.mark.parametrize("invalid_type_dt", [9, datetime(2011, 1, 1)])
  232. def test_guess_datetime_format_wrong_type_inputs(invalid_type_dt):
  233. # A datetime string must include a year, month and a day for it to be
  234. # guessable, in addition to being a string that looks like a datetime.
  235. with pytest.raises(
  236. TypeError,
  237. match=r"^Argument 'dt_str' has incorrect type \(expected str, got .*\)$",
  238. ):
  239. parsing.guess_datetime_format(invalid_type_dt)
  240. @pytest.mark.parametrize(
  241. "string,fmt,dayfirst,warning",
  242. [
  243. ("2011-1-1", "%Y-%m-%d", False, None),
  244. ("2011-1-1", "%Y-%d-%m", True, None),
  245. ("1/1/2011", "%m/%d/%Y", False, None),
  246. ("1/1/2011", "%d/%m/%Y", True, None),
  247. ("30-1-2011", "%d-%m-%Y", False, UserWarning),
  248. ("30-1-2011", "%d-%m-%Y", True, None),
  249. ("2011-1-1 0:0:0", "%Y-%m-%d %H:%M:%S", False, None),
  250. ("2011-1-1 0:0:0", "%Y-%d-%m %H:%M:%S", True, None),
  251. ("2011-1-3T00:00:0", "%Y-%m-%dT%H:%M:%S", False, None),
  252. ("2011-1-3T00:00:0", "%Y-%d-%mT%H:%M:%S", True, None),
  253. ("2011-1-1 00:00:00", "%Y-%m-%d %H:%M:%S", False, None),
  254. ("2011-1-1 00:00:00", "%Y-%d-%m %H:%M:%S", True, None),
  255. ],
  256. )
  257. def test_guess_datetime_format_no_padding(string, fmt, dayfirst, warning):
  258. # see gh-11142
  259. msg = (
  260. rf"Parsing dates in {fmt} format when dayfirst=False \(the default\) "
  261. "was specified. "
  262. "Pass `dayfirst=True` or specify a format to silence this warning."
  263. )
  264. with tm.assert_produces_warning(warning, match=msg):
  265. result = parsing.guess_datetime_format(string, dayfirst=dayfirst)
  266. assert result == fmt
  267. def test_try_parse_dates():
  268. arr = np.array(["5/1/2000", "6/1/2000", "7/1/2000"], dtype=object)
  269. result = parsing.try_parse_dates(arr, parser=lambda x: du_parse(x, dayfirst=True))
  270. expected = np.array([du_parse(d, dayfirst=True) for d in arr])
  271. tm.assert_numpy_array_equal(result, expected)
  272. def test_parse_datetime_string_with_reso_check_instance_type_raise_exception():
  273. # issue 20684
  274. msg = "Argument 'date_string' has incorrect type (expected str, got tuple)"
  275. with pytest.raises(TypeError, match=re.escape(msg)):
  276. parse_datetime_string_with_reso((1, 2, 3))
  277. result = parse_datetime_string_with_reso("2019")
  278. expected = (datetime(2019, 1, 1), "year")
  279. assert result == expected
  280. @pytest.mark.parametrize(
  281. "fmt,expected",
  282. [
  283. ("%Y %m %d %H:%M:%S", True),
  284. ("%Y/%m/%d %H:%M:%S", True),
  285. (r"%Y\%m\%d %H:%M:%S", True),
  286. ("%Y-%m-%d %H:%M:%S", True),
  287. ("%Y.%m.%d %H:%M:%S", True),
  288. ("%Y%m%d %H:%M:%S", True),
  289. ("%Y-%m-%dT%H:%M:%S", True),
  290. ("%Y-%m-%dT%H:%M:%S%z", True),
  291. ("%Y-%m-%dT%H:%M:%S%Z", False),
  292. ("%Y-%m-%dT%H:%M:%S.%f", True),
  293. ("%Y-%m-%dT%H:%M:%S.%f%z", True),
  294. ("%Y-%m-%dT%H:%M:%S.%f%Z", False),
  295. ("%Y%m%d", True),
  296. ("%Y%m", False),
  297. ("%Y", True),
  298. ("%Y-%m-%d", True),
  299. ("%Y-%m", True),
  300. ],
  301. )
  302. def test_is_iso_format(fmt, expected):
  303. # see gh-41047
  304. result = strptime._test_format_is_iso(fmt)
  305. assert result == expected
  306. @pytest.mark.parametrize(
  307. "input",
  308. [
  309. "2018-01-01T00:00:00.123456789",
  310. "2018-01-01T00:00:00.123456",
  311. "2018-01-01T00:00:00.123",
  312. ],
  313. )
  314. def test_guess_datetime_format_f(input):
  315. # https://github.com/pandas-dev/pandas/issues/49043
  316. result = parsing.guess_datetime_format(input)
  317. expected = "%Y-%m-%dT%H:%M:%S.%f"
  318. assert result == expected
  319. def _helper_hypothesis_delimited_date(call, date_string, **kwargs):
  320. msg, result = None, None
  321. try:
  322. result = call(date_string, **kwargs)
  323. except ValueError as err:
  324. msg = str(err)
  325. return msg, result
  326. @given(DATETIME_NO_TZ)
  327. @pytest.mark.parametrize("delimiter", list(" -./"))
  328. @pytest.mark.parametrize("dayfirst", [True, False])
  329. @pytest.mark.parametrize(
  330. "date_format",
  331. ["%d %m %Y", "%m %d %Y", "%m %Y", "%Y %m %d", "%y %m %d", "%Y%m%d", "%y%m%d"],
  332. )
  333. def test_hypothesis_delimited_date(
  334. request, date_format, dayfirst, delimiter, test_datetime
  335. ):
  336. if date_format == "%m %Y" and delimiter == ".":
  337. request.applymarker(
  338. pytest.mark.xfail(
  339. reason="parse_datetime_string cannot reliably tell whether "
  340. "e.g. %m.%Y is a float or a date",
  341. strict=False,
  342. )
  343. )
  344. date_string = test_datetime.strftime(date_format.replace(" ", delimiter))
  345. except_out_dateutil, result = _helper_hypothesis_delimited_date(
  346. parsing.py_parse_datetime_string, date_string, dayfirst=dayfirst
  347. )
  348. except_in_dateutil, expected = _helper_hypothesis_delimited_date(
  349. du_parse,
  350. date_string,
  351. default=datetime(1, 1, 1),
  352. dayfirst=dayfirst,
  353. yearfirst=False,
  354. )
  355. assert except_out_dateutil == except_in_dateutil
  356. assert result == expected