common.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532
  1. # common.py
  2. from .core import *
  3. from .helpers import DelimitedList, any_open_tag, any_close_tag
  4. from datetime import datetime
  5. import sys
  6. PY_310 = sys.version_info >= (3, 10)
  7. # some other useful expressions - using lower-case class name since we are really using this as a namespace
  8. class pyparsing_common:
  9. """Here are some common low-level expressions that may be useful in
  10. jump-starting parser development:
  11. - numeric forms (:class:`integers<integer>`, :class:`reals<real>`,
  12. :class:`scientific notation<sci_real>`)
  13. - common :class:`programming identifiers<identifier>`
  14. - network addresses (:class:`MAC<mac_address>`,
  15. :class:`IPv4<ipv4_address>`, :class:`IPv6<ipv6_address>`)
  16. - ISO8601 :class:`dates<iso8601_date>` and
  17. :class:`datetime<iso8601_datetime>`
  18. - :class:`UUID<uuid>`
  19. - :class:`comma-separated list<comma_separated_list>`
  20. - :class:`url`
  21. Parse actions:
  22. - :class:`convert_to_integer`
  23. - :class:`convert_to_float`
  24. - :class:`convert_to_date`
  25. - :class:`convert_to_datetime`
  26. - :class:`strip_html_tags`
  27. - :class:`upcase_tokens`
  28. - :class:`downcase_tokens`
  29. Examples:
  30. .. testcode::
  31. pyparsing_common.number.run_tests('''
  32. # any int or real number, returned as the appropriate type
  33. 100
  34. -100
  35. +100
  36. 3.14159
  37. 6.02e23
  38. 1e-12
  39. ''')
  40. .. testoutput::
  41. :options: +NORMALIZE_WHITESPACE
  42. # any int or real number, returned as the appropriate type
  43. 100
  44. [100]
  45. -100
  46. [-100]
  47. +100
  48. [100]
  49. 3.14159
  50. [3.14159]
  51. 6.02e23
  52. [6.02e+23]
  53. 1e-12
  54. [1e-12]
  55. .. testcode::
  56. pyparsing_common.fnumber.run_tests('''
  57. # any int or real number, returned as float
  58. 100
  59. -100
  60. +100
  61. 3.14159
  62. 6.02e23
  63. 1e-12
  64. ''')
  65. .. testoutput::
  66. :options: +NORMALIZE_WHITESPACE
  67. # any int or real number, returned as float
  68. 100
  69. [100.0]
  70. -100
  71. [-100.0]
  72. +100
  73. [100.0]
  74. 3.14159
  75. [3.14159]
  76. 6.02e23
  77. [6.02e+23]
  78. 1e-12
  79. [1e-12]
  80. .. testcode::
  81. pyparsing_common.hex_integer.run_tests('''
  82. # hex numbers
  83. 100
  84. FF
  85. ''')
  86. .. testoutput::
  87. :options: +NORMALIZE_WHITESPACE
  88. # hex numbers
  89. 100
  90. [256]
  91. FF
  92. [255]
  93. .. testcode::
  94. pyparsing_common.fraction.run_tests('''
  95. # fractions
  96. 1/2
  97. -3/4
  98. ''')
  99. .. testoutput::
  100. :options: +NORMALIZE_WHITESPACE
  101. # fractions
  102. 1/2
  103. [0.5]
  104. -3/4
  105. [-0.75]
  106. .. testcode::
  107. pyparsing_common.mixed_integer.run_tests('''
  108. # mixed fractions
  109. 1
  110. 1/2
  111. -3/4
  112. 1-3/4
  113. ''')
  114. .. testoutput::
  115. :options: +NORMALIZE_WHITESPACE
  116. # mixed fractions
  117. 1
  118. [1]
  119. 1/2
  120. [0.5]
  121. -3/4
  122. [-0.75]
  123. 1-3/4
  124. [1.75]
  125. .. testcode::
  126. import uuid
  127. pyparsing_common.uuid.set_parse_action(token_map(uuid.UUID))
  128. pyparsing_common.uuid.run_tests('''
  129. # uuid
  130. 12345678-1234-5678-1234-567812345678
  131. ''')
  132. .. testoutput::
  133. :options: +NORMALIZE_WHITESPACE
  134. # uuid
  135. 12345678-1234-5678-1234-567812345678
  136. [UUID('12345678-1234-5678-1234-567812345678')]
  137. """
  138. @staticmethod
  139. def convert_to_integer(_, __, t):
  140. """
  141. Parse action for converting parsed integers to Python int
  142. """
  143. return [int(tt) for tt in t]
  144. @staticmethod
  145. def convert_to_float(_, __, t):
  146. """
  147. Parse action for converting parsed numbers to Python float
  148. """
  149. return [float(tt) for tt in t]
  150. integer = (
  151. Word(nums)
  152. .set_name("integer")
  153. .set_parse_action(
  154. convert_to_integer
  155. if PY_310
  156. else lambda t: [int(tt) for tt in t] # type: ignore[misc]
  157. )
  158. )
  159. """expression that parses an unsigned integer, converts to an int"""
  160. hex_integer = (
  161. Word(hexnums).set_name("hex integer").set_parse_action(token_map(int, 16))
  162. )
  163. """expression that parses a hexadecimal integer, converts to an int"""
  164. signed_integer = (
  165. Regex(r"[+-]?\d+")
  166. .set_name("signed integer")
  167. .set_parse_action(
  168. convert_to_integer
  169. if PY_310
  170. else lambda t: [int(tt) for tt in t] # type: ignore[misc]
  171. )
  172. )
  173. """expression that parses an integer with optional leading sign, converts to an int"""
  174. fraction = (
  175. signed_integer().set_parse_action(
  176. convert_to_float
  177. if PY_310
  178. else lambda t: [float(tt) for tt in t] # type: ignore[misc]
  179. )
  180. + "/"
  181. + signed_integer().set_parse_action(
  182. convert_to_float
  183. if PY_310
  184. else lambda t: [float(tt) for tt in t] # type: ignore[misc]
  185. )
  186. ).set_name("fraction")
  187. """fractional expression of an integer divided by an integer, converts to a float"""
  188. fraction.add_parse_action(lambda tt: tt[0] / tt[-1])
  189. mixed_integer = (
  190. fraction | signed_integer + Opt(Opt("-").suppress() + fraction)
  191. ).set_name("fraction or mixed integer-fraction")
  192. """mixed integer of the form 'integer - fraction', with optional leading integer, converts to a float"""
  193. mixed_integer.add_parse_action(sum)
  194. real = (
  195. Regex(r"[+-]?(?:\d+\.\d*|\.\d+)")
  196. .set_name("real number")
  197. .set_parse_action(
  198. convert_to_float
  199. if PY_310
  200. else lambda t: [float(tt) for tt in t] # type: ignore[misc]
  201. )
  202. )
  203. """expression that parses a floating point number, converts to a float"""
  204. sci_real = (
  205. Regex(r"[+-]?(?:\d+(?:[eE][+-]?\d+)|(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)?)")
  206. .set_name("real number with scientific notation")
  207. .set_parse_action(
  208. convert_to_float
  209. if PY_310
  210. else lambda t: [float(tt) for tt in t] # type: ignore[misc]
  211. )
  212. )
  213. """expression that parses a floating point number with optional
  214. scientific notation, converts to a float"""
  215. # streamlining this expression makes the docs nicer-looking
  216. number = (sci_real | real | signed_integer).set_name("number").streamline()
  217. """any numeric expression, converts to the corresponding Python type"""
  218. fnumber = (
  219. Regex(r"[+-]?\d+\.?\d*(?:[eE][+-]?\d+)?")
  220. .set_name("fnumber")
  221. .set_parse_action(
  222. convert_to_float
  223. if PY_310
  224. else lambda t: [float(tt) for tt in t] # type: ignore[misc]
  225. )
  226. )
  227. """any int or real number, always converts to a float"""
  228. ieee_float = (
  229. Regex(r"(?i:[+-]?(?:(?:\d+\.?\d*(?:e[+-]?\d+)?)|nan|inf(?:inity)?))")
  230. .set_name("ieee_float")
  231. .set_parse_action(
  232. convert_to_float
  233. if PY_310
  234. else lambda t: [float(tt) for tt in t] # type: ignore[misc]
  235. )
  236. )
  237. """any floating-point literal (int, real number, infinity, or NaN), converts to a float"""
  238. identifier = Word(identchars, identbodychars).set_name("identifier")
  239. """typical code identifier (leading alpha or '_', followed by 0 or more alphas, nums, or '_')"""
  240. ipv4_address = Regex(
  241. r"(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})(\.(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})){3}"
  242. ).set_name("IPv4 address")
  243. "IPv4 address (``0.0.0.0 - 255.255.255.255``)"
  244. _ipv6_part = Regex(r"[0-9a-fA-F]{1,4}").set_name("hex_integer")
  245. _full_ipv6_address = (_ipv6_part + (":" + _ipv6_part) * 7).set_name(
  246. "full IPv6 address"
  247. )
  248. _short_ipv6_address = (
  249. Opt(_ipv6_part + (":" + _ipv6_part) * (0, 6))
  250. + "::"
  251. + Opt(_ipv6_part + (":" + _ipv6_part) * (0, 6))
  252. ).set_name("short IPv6 address")
  253. _short_ipv6_address.add_condition(
  254. lambda t: sum(1 for tt in t if pyparsing_common._ipv6_part.matches(tt)) < 8
  255. )
  256. _mixed_ipv6_address = ("::ffff:" + ipv4_address).set_name("mixed IPv6 address")
  257. ipv6_address = Combine(
  258. (_full_ipv6_address | _mixed_ipv6_address | _short_ipv6_address).set_name(
  259. "IPv6 address"
  260. )
  261. ).set_name("IPv6 address")
  262. "IPv6 address (long, short, or mixed form)"
  263. mac_address = Regex(
  264. r"[0-9a-fA-F]{2}([:.-])[0-9a-fA-F]{2}(?:\1[0-9a-fA-F]{2}){4}"
  265. ).set_name("MAC address")
  266. "MAC address xx:xx:xx:xx:xx (may also have '-' or '.' delimiters)"
  267. @staticmethod
  268. def convert_to_date(fmt: str = "%Y-%m-%d"):
  269. """
  270. Helper to create a parse action for converting parsed date string to Python datetime.date
  271. Params -
  272. - fmt - format to be passed to datetime.strptime (default= ``"%Y-%m-%d"``)
  273. Example:
  274. .. testcode::
  275. date_expr = pyparsing_common.iso8601_date.copy()
  276. date_expr.set_parse_action(pyparsing_common.convert_to_date())
  277. print(date_expr.parse_string("1999-12-31"))
  278. prints:
  279. .. testoutput::
  280. [datetime.date(1999, 12, 31)]
  281. """
  282. def cvt_fn(ss, ll, tt):
  283. try:
  284. return datetime.strptime(tt[0], fmt).date()
  285. except ValueError as ve:
  286. raise ParseException(ss, ll, str(ve))
  287. return cvt_fn
  288. @staticmethod
  289. def convert_to_datetime(fmt: str = "%Y-%m-%dT%H:%M:%S.%f"):
  290. """Helper to create a parse action for converting parsed
  291. datetime string to Python datetime.datetime
  292. Params -
  293. - fmt - format to be passed to datetime.strptime (default= ``"%Y-%m-%dT%H:%M:%S.%f"``)
  294. Example:
  295. .. testcode::
  296. dt_expr = pyparsing_common.iso8601_datetime.copy()
  297. dt_expr.set_parse_action(pyparsing_common.convert_to_datetime())
  298. print(dt_expr.parse_string("1999-12-31T23:59:59.999"))
  299. prints:
  300. .. testoutput::
  301. [datetime.datetime(1999, 12, 31, 23, 59, 59, 999000)]
  302. """
  303. def cvt_fn(s, l, t):
  304. try:
  305. return datetime.strptime(t[0], fmt)
  306. except ValueError as ve:
  307. raise ParseException(s, l, str(ve))
  308. return cvt_fn
  309. iso8601_date = Regex(
  310. r"(?P<year>\d{4})(?:-(?P<month>\d\d)(?:-(?P<day>\d\d))?)?"
  311. ).set_name("ISO8601 date")
  312. "ISO8601 date (``yyyy-mm-dd``)"
  313. iso8601_datetime = Regex(
  314. r"(?P<year>\d{4})-(?P<month>\d\d)-(?P<day>\d\d)[T ](?P<hour>\d\d):(?P<minute>\d\d)(:(?P<second>\d\d(\.\d*)?)?)?(?P<tz>Z|[+-]\d\d:?\d\d)?"
  315. ).set_name("ISO8601 datetime")
  316. "ISO8601 datetime (``yyyy-mm-ddThh:mm:ss.s(Z|+-00:00)``) - trailing seconds, milliseconds, and timezone optional; accepts separating ``'T'`` or ``' '``"
  317. uuid = Regex(r"[0-9a-fA-F]{8}(-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}").set_name("UUID")
  318. "UUID (``xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx``)"
  319. _html_stripper = any_open_tag.suppress() | any_close_tag.suppress()
  320. @staticmethod
  321. def strip_html_tags(s: str, l: int, tokens: ParseResults):
  322. """Parse action to remove HTML tags from web page HTML source
  323. Example:
  324. .. testcode::
  325. # strip HTML links from normal text
  326. text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
  327. td, td_end = make_html_tags("TD")
  328. table_text = td + SkipTo(td_end).set_parse_action(
  329. pyparsing_common.strip_html_tags)("body") + td_end
  330. print(table_text.parse_string(text).body)
  331. Prints:
  332. .. testoutput::
  333. More info at the pyparsing wiki page
  334. """
  335. return pyparsing_common._html_stripper.transform_string(tokens[0])
  336. _commasepitem = (
  337. Combine(
  338. OneOrMore(
  339. ~Literal(",")
  340. + ~LineEnd()
  341. + Word(printables, exclude_chars=",")
  342. + Opt(White(" \t") + ~FollowedBy(LineEnd() | ","))
  343. )
  344. )
  345. .streamline()
  346. .set_name("commaItem")
  347. )
  348. comma_separated_list = DelimitedList(
  349. Opt(quoted_string.copy() | _commasepitem, default="")
  350. ).set_name("comma separated list")
  351. """Predefined expression of 1 or more printable words or quoted strings, separated by commas."""
  352. @staticmethod
  353. def upcase_tokens(s, l, t):
  354. """Parse action to convert tokens to upper case."""
  355. return [tt.upper() for tt in t]
  356. @staticmethod
  357. def downcase_tokens(s, l, t):
  358. """Parse action to convert tokens to lower case."""
  359. return [tt.lower() for tt in t]
  360. # fmt: off
  361. url = Regex(
  362. # https://mathiasbynens.be/demo/url-regex
  363. # https://gist.github.com/dperini/729294
  364. r"(?P<url>" +
  365. # protocol identifier (optional)
  366. # short syntax // still required
  367. r"(?:(?:(?P<scheme>https?|ftp):)?\/\/)" +
  368. # user:pass BasicAuth (optional)
  369. r"(?:(?P<auth>\S+(?::\S*)?)@)?" +
  370. r"(?P<host>" +
  371. # IP address exclusion
  372. # private & local networks
  373. r"(?!(?:10|127)(?:\.\d{1,3}){3})" +
  374. r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})" +
  375. r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})" +
  376. # IP address dotted notation octets
  377. # excludes loopback network 0.0.0.0
  378. # excludes reserved space >= 224.0.0.0
  379. # excludes network & broadcast addresses
  380. # (first & last IP address of each class)
  381. r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])" +
  382. r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}" +
  383. r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" +
  384. r"|" +
  385. # host & domain names, may end with dot
  386. # can be replaced by a shortest alternative
  387. # (?![-_])(?:[-\w\u00a1-\uffff]{0,63}[^-_]\.)+
  388. r"(?:" +
  389. r"(?:" +
  390. r"[a-z0-9\u00a1-\uffff]" +
  391. r"[a-z0-9\u00a1-\uffff_-]{0,62}" +
  392. r")?" +
  393. r"[a-z0-9\u00a1-\uffff]\." +
  394. r")+" +
  395. # TLD identifier name, may end with dot
  396. r"(?:[a-z\u00a1-\uffff]{2,}\.?)" +
  397. r")" +
  398. # port number (optional)
  399. r"(:(?P<port>\d{2,5}))?" +
  400. # resource path (optional)
  401. r"(?P<path>\/[^?# ]*)?" +
  402. # query string (optional)
  403. r"(\?(?P<query>[^#]*))?" +
  404. # fragment (optional)
  405. r"(#(?P<fragment>\S*))?" +
  406. r")"
  407. ).set_name("url")
  408. """
  409. URL (http/https/ftp scheme)
  410. .. versionchanged:: 3.1.0
  411. ``url`` named group added
  412. """
  413. # fmt: on
  414. # pre-PEP8 compatibility names
  415. # fmt: off
  416. convertToInteger = staticmethod(replaced_by_pep8("convertToInteger", convert_to_integer))
  417. convertToFloat = staticmethod(replaced_by_pep8("convertToFloat", convert_to_float))
  418. convertToDate = staticmethod(replaced_by_pep8("convertToDate", convert_to_date))
  419. convertToDatetime = staticmethod(replaced_by_pep8("convertToDatetime", convert_to_datetime))
  420. stripHTMLTags = staticmethod(replaced_by_pep8("stripHTMLTags", strip_html_tags))
  421. upcaseTokens = staticmethod(replaced_by_pep8("upcaseTokens", upcase_tokens))
  422. downcaseTokens = staticmethod(replaced_by_pep8("downcaseTokens", downcase_tokens))
  423. # fmt: on
  424. _builtin_exprs = [
  425. v for v in vars(pyparsing_common).values() if isinstance(v, ParserElement)
  426. ]