helpers.py 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217
  1. # helpers.py
  2. import html.entities
  3. import operator
  4. import re
  5. import sys
  6. import typing
  7. from . import __diag__
  8. from .core import *
  9. from .util import (
  10. _bslash,
  11. _flatten,
  12. _escape_regex_range_chars,
  13. make_compressed_re,
  14. replaced_by_pep8,
  15. )
  16. def _suppression(expr: Union[ParserElement, str]) -> ParserElement:
  17. # internal helper to avoid wrapping Suppress inside another Suppress
  18. if isinstance(expr, Suppress):
  19. return expr
  20. return Suppress(expr)
  21. #
  22. # global helpers
  23. #
  24. def counted_array(
  25. expr: ParserElement, int_expr: typing.Optional[ParserElement] = None, **kwargs
  26. ) -> ParserElement:
  27. """Helper to define a counted list of expressions.
  28. This helper defines a pattern of the form::
  29. integer expr expr expr...
  30. where the leading integer tells how many expr expressions follow.
  31. The matched tokens returns the array of expr tokens as a list - the
  32. leading count token is suppressed.
  33. If ``int_expr`` is specified, it should be a pyparsing expression
  34. that produces an integer value.
  35. Examples:
  36. .. doctest::
  37. >>> counted_array(Word(alphas)).parse_string('2 ab cd ef')
  38. ParseResults(['ab', 'cd'], {})
  39. - In this parser, the leading integer value is given in binary,
  40. '10' indicating that 2 values are in the array:
  41. .. doctest::
  42. >>> binary_constant = Word('01').set_parse_action(lambda t: int(t[0], 2))
  43. >>> counted_array(Word(alphas), int_expr=binary_constant
  44. ... ).parse_string('10 ab cd ef')
  45. ParseResults(['ab', 'cd'], {})
  46. - If other fields must be parsed after the count but before the
  47. list items, give the fields results names and they will
  48. be preserved in the returned ParseResults:
  49. .. doctest::
  50. >>> ppc = pyparsing.common
  51. >>> count_with_metadata = ppc.integer + Word(alphas)("type")
  52. >>> typed_array = counted_array(Word(alphanums),
  53. ... int_expr=count_with_metadata)("items")
  54. >>> result = typed_array.parse_string("3 bool True True False")
  55. >>> print(result.dump())
  56. ['True', 'True', 'False']
  57. - items: ['True', 'True', 'False']
  58. - type: 'bool'
  59. """
  60. intExpr: typing.Optional[ParserElement] = deprecate_argument(
  61. kwargs, "intExpr", None
  62. )
  63. intExpr = intExpr or int_expr
  64. array_expr = Forward()
  65. def count_field_parse_action(s, l, t):
  66. nonlocal array_expr
  67. n = t[0]
  68. array_expr <<= (expr * n) if n else Empty()
  69. # clear list contents, but keep any named results
  70. del t[:]
  71. if intExpr is None:
  72. intExpr = Word(nums).set_parse_action(lambda t: int(t[0]))
  73. else:
  74. intExpr = intExpr.copy()
  75. intExpr.set_name("arrayLen")
  76. intExpr.add_parse_action(count_field_parse_action, call_during_try=True)
  77. return (intExpr + array_expr).set_name(f"(len) {expr}...")
  78. def match_previous_literal(expr: ParserElement) -> ParserElement:
  79. """Helper to define an expression that is indirectly defined from
  80. the tokens matched in a previous expression, that is, it looks for
  81. a 'repeat' of a previous expression. For example::
  82. .. testcode::
  83. first = Word(nums)
  84. second = match_previous_literal(first)
  85. match_expr = first + ":" + second
  86. will match ``"1:1"``, but not ``"1:2"``. Because this
  87. matches a previous literal, will also match the leading
  88. ``"1:1"`` in ``"1:10"``. If this is not desired, use
  89. :class:`match_previous_expr`. Do *not* use with packrat parsing
  90. enabled.
  91. """
  92. rep = Forward()
  93. def copy_token_to_repeater(s, l, t):
  94. if not t:
  95. rep << Empty()
  96. return
  97. if len(t) == 1:
  98. rep << t[0]
  99. return
  100. # flatten t tokens
  101. tflat = _flatten(t.as_list())
  102. rep << And(Literal(tt) for tt in tflat)
  103. expr.add_parse_action(copy_token_to_repeater, call_during_try=True)
  104. rep.set_name("(prev) " + str(expr))
  105. return rep
  106. def match_previous_expr(expr: ParserElement) -> ParserElement:
  107. """Helper to define an expression that is indirectly defined from
  108. the tokens matched in a previous expression, that is, it looks for
  109. a 'repeat' of a previous expression. For example:
  110. .. testcode::
  111. first = Word(nums)
  112. second = match_previous_expr(first)
  113. match_expr = first + ":" + second
  114. will match ``"1:1"``, but not ``"1:2"``. Because this
  115. matches by expressions, will *not* match the leading ``"1:1"``
  116. in ``"1:10"``; the expressions are evaluated first, and then
  117. compared, so ``"1"`` is compared with ``"10"``. Do *not* use
  118. with packrat parsing enabled.
  119. """
  120. rep = Forward()
  121. e2 = expr.copy()
  122. rep <<= e2
  123. def copy_token_to_repeater(s, l, t):
  124. matchTokens = _flatten(t.as_list())
  125. def must_match_these_tokens(s, l, t):
  126. theseTokens = _flatten(t.as_list())
  127. if theseTokens != matchTokens:
  128. raise ParseException(
  129. s, l, f"Expected {matchTokens}, found{theseTokens}"
  130. )
  131. rep.set_parse_action(must_match_these_tokens, call_during_try=True)
  132. expr.add_parse_action(copy_token_to_repeater, call_during_try=True)
  133. rep.set_name("(prev) " + str(expr))
  134. return rep
  135. def one_of(
  136. strs: Union[typing.Iterable[str], str],
  137. caseless: bool = False,
  138. use_regex: bool = True,
  139. as_keyword: bool = False,
  140. **kwargs,
  141. ) -> ParserElement:
  142. """Helper to quickly define a set of alternative :class:`Literal` s,
  143. and makes sure to do longest-first testing when there is a conflict,
  144. regardless of the input order, but returns
  145. a :class:`MatchFirst` for best performance.
  146. :param strs: a string of space-delimited literals, or a collection of
  147. string literals
  148. :param caseless: treat all literals as caseless
  149. :param use_regex: bool - as an optimization, will
  150. generate a :class:`Regex` object; otherwise, will generate
  151. a :class:`MatchFirst` object (if ``caseless=True`` or
  152. ``as_keyword=True``, or if creating a :class:`Regex` raises an exception)
  153. :param as_keyword: bool - enforce :class:`Keyword`-style matching on the
  154. generated expressions
  155. Parameters ``asKeyword`` and ``useRegex`` are retained for pre-PEP8
  156. compatibility, but will be removed in a future release.
  157. Example:
  158. .. testcode::
  159. comp_oper = one_of("< = > <= >= !=")
  160. var = Word(alphas)
  161. number = Word(nums)
  162. term = var | number
  163. comparison_expr = term + comp_oper + term
  164. print(comparison_expr.search_string("B = 12 AA=23 B<=AA AA>12"))
  165. prints:
  166. .. testoutput::
  167. [['B', '=', '12'], ['AA', '=', '23'], ['B', '<=', 'AA'], ['AA', '>', '12']]
  168. """
  169. useRegex: bool = deprecate_argument(kwargs, "useRegex", True)
  170. asKeyword: bool = deprecate_argument(kwargs, "asKeyword", False)
  171. asKeyword = asKeyword or as_keyword
  172. useRegex = useRegex and use_regex
  173. if (
  174. isinstance(caseless, str_type)
  175. and __diag__.warn_on_multiple_string_args_to_oneof
  176. ):
  177. warnings.warn(
  178. "warn_on_multiple_string_args_to_oneof:"
  179. " More than one string argument passed to one_of, pass"
  180. " choices as a list or space-delimited string",
  181. stacklevel=2,
  182. )
  183. if caseless:
  184. is_equal = lambda a, b: a.upper() == b.upper()
  185. masks = lambda a, b: b.upper().startswith(a.upper())
  186. else:
  187. is_equal = operator.eq
  188. masks = lambda a, b: b.startswith(a)
  189. symbols: list[str]
  190. if isinstance(strs, str_type):
  191. strs = typing.cast(str, strs)
  192. symbols = strs.split()
  193. elif isinstance(strs, Iterable):
  194. symbols = list(strs)
  195. else:
  196. raise TypeError("Invalid argument to one_of, expected string or iterable")
  197. if not symbols:
  198. return NoMatch()
  199. # reorder given symbols to take care to avoid masking longer choices with shorter ones
  200. # (but only if the given symbols are not just single characters)
  201. i = 0
  202. while i < len(symbols) - 1:
  203. cur = symbols[i]
  204. for j, other in enumerate(symbols[i + 1 :]):
  205. if is_equal(other, cur):
  206. del symbols[i + j + 1]
  207. break
  208. if len(other) > len(cur) and masks(cur, other):
  209. del symbols[i + j + 1]
  210. symbols.insert(i, other)
  211. break
  212. else:
  213. i += 1
  214. if useRegex:
  215. re_flags: int = re.IGNORECASE if caseless else 0
  216. try:
  217. if all(len(sym) == 1 for sym in symbols):
  218. # symbols are just single characters, create range regex pattern
  219. patt = f"[{''.join(_escape_regex_range_chars(sym) for sym in symbols)}]"
  220. else:
  221. patt = "|".join(re.escape(sym) for sym in symbols)
  222. # wrap with \b word break markers if defining as keywords
  223. if asKeyword:
  224. patt = rf"\b(?:{patt})\b"
  225. ret = Regex(patt, flags=re_flags)
  226. ret.set_name(" | ".join(repr(s) for s in symbols))
  227. if caseless:
  228. # add parse action to return symbols as specified, not in random
  229. # casing as found in input string
  230. symbol_map = {sym.lower(): sym for sym in symbols}
  231. ret.add_parse_action(lambda s, l, t: symbol_map[t[0].lower()])
  232. return ret
  233. except re.error:
  234. warnings.warn(
  235. "Exception creating Regex for one_of, building MatchFirst", stacklevel=2
  236. )
  237. # last resort, just use MatchFirst of Token class corresponding to caseless
  238. # and asKeyword settings
  239. CASELESS = KEYWORD = True
  240. parse_element_class = {
  241. (CASELESS, KEYWORD): CaselessKeyword,
  242. (CASELESS, not KEYWORD): CaselessLiteral,
  243. (not CASELESS, KEYWORD): Keyword,
  244. (not CASELESS, not KEYWORD): Literal,
  245. }[(caseless, asKeyword)]
  246. return MatchFirst(parse_element_class(sym) for sym in symbols).set_name(
  247. " | ".join(symbols)
  248. )
  249. def dict_of(key: ParserElement, value: ParserElement) -> Dict:
  250. """Helper to easily and clearly define a dictionary by specifying
  251. the respective patterns for the key and value. Takes care of
  252. defining the :class:`Dict`, :class:`ZeroOrMore`, and
  253. :class:`Group` tokens in the proper order. The key pattern
  254. can include delimiting markers or punctuation, as long as they are
  255. suppressed, thereby leaving the significant key text. The value
  256. pattern can include named results, so that the :class:`Dict` results
  257. can include named token fields.
  258. Example:
  259. .. doctest::
  260. >>> text = "shape: SQUARE posn: upper left color: light blue texture: burlap"
  261. >>> data_word = Word(alphas)
  262. >>> label = data_word + FollowedBy(':')
  263. >>> attr_expr = (
  264. ... label
  265. ... + Suppress(':')
  266. ... + OneOrMore(data_word, stop_on=label)
  267. ... .set_parse_action(' '.join))
  268. >>> print(attr_expr[1, ...].parse_string(text).dump())
  269. ['shape', 'SQUARE', 'posn', 'upper left', 'color', 'light blue', 'texture', 'burlap']
  270. >>> attr_label = label
  271. >>> attr_value = Suppress(':') + OneOrMore(data_word, stop_on=label
  272. ... ).set_parse_action(' '.join)
  273. # similar to Dict, but simpler call format
  274. >>> result = dict_of(attr_label, attr_value).parse_string(text)
  275. >>> print(result.dump())
  276. [['shape', 'SQUARE'], ['posn', 'upper left'], ['color', 'light blue'], ['texture', 'burlap']]
  277. - color: 'light blue'
  278. - posn: 'upper left'
  279. - shape: 'SQUARE'
  280. - texture: 'burlap'
  281. [0]:
  282. ['shape', 'SQUARE']
  283. [1]:
  284. ['posn', 'upper left']
  285. [2]:
  286. ['color', 'light blue']
  287. [3]:
  288. ['texture', 'burlap']
  289. >>> print(result['shape'])
  290. SQUARE
  291. >>> print(result.shape) # object attribute access works too
  292. SQUARE
  293. >>> print(result.as_dict())
  294. {'shape': 'SQUARE', 'posn': 'upper left', 'color': 'light blue', 'texture': 'burlap'}
  295. """
  296. return Dict(OneOrMore(Group(key + value)))
  297. def original_text_for(
  298. expr: ParserElement, as_string: bool = True, **kwargs
  299. ) -> ParserElement:
  300. """Helper to return the original, untokenized text for a given
  301. expression. Useful to restore the parsed fields of an HTML start
  302. tag into the raw tag text itself, or to revert separate tokens with
  303. intervening whitespace back to the original matching input text. By
  304. default, returns a string containing the original parsed text.
  305. If the optional ``as_string`` argument is passed as
  306. ``False``, then the return value is
  307. a :class:`ParseResults` containing any results names that
  308. were originally matched, and a single token containing the original
  309. matched text from the input string. So if the expression passed to
  310. :class:`original_text_for` contains expressions with defined
  311. results names, you must set ``as_string`` to ``False`` if you
  312. want to preserve those results name values.
  313. The ``asString`` pre-PEP8 argument is retained for compatibility,
  314. but will be removed in a future release.
  315. Example:
  316. .. testcode::
  317. src = "this is test <b> bold <i>text</i> </b> normal text "
  318. for tag in ("b", "i"):
  319. opener, closer = make_html_tags(tag)
  320. patt = original_text_for(opener + ... + closer)
  321. print(patt.search_string(src)[0])
  322. prints:
  323. .. testoutput::
  324. ['<b> bold <i>text</i> </b>']
  325. ['<i>text</i>']
  326. """
  327. asString: bool = deprecate_argument(kwargs, "asString", True)
  328. asString = asString and as_string
  329. locMarker = Empty().set_parse_action(lambda s, loc, t: loc)
  330. endlocMarker = locMarker.copy()
  331. endlocMarker.callPreparse = False
  332. matchExpr = locMarker("_original_start") + expr + endlocMarker("_original_end")
  333. if asString:
  334. extractText = lambda s, l, t: s[t._original_start : t._original_end]
  335. else:
  336. def extractText(s, l, t):
  337. t[:] = [s[t.pop("_original_start") : t.pop("_original_end")]]
  338. matchExpr.set_parse_action(extractText)
  339. matchExpr.ignoreExprs = expr.ignoreExprs
  340. matchExpr.suppress_warning(Diagnostics.warn_ungrouped_named_tokens_in_collection)
  341. return matchExpr
  342. def ungroup(expr: ParserElement) -> ParserElement:
  343. """Helper to undo pyparsing's default grouping of And expressions,
  344. even if all but one are non-empty.
  345. """
  346. return TokenConverter(expr).add_parse_action(lambda t: t[0])
  347. def locatedExpr(expr: ParserElement) -> ParserElement:
  348. """
  349. .. deprecated:: 3.0.0
  350. Use the :class:`Located` class instead. Note that `Located`
  351. returns results with one less grouping level.
  352. Helper to decorate a returned token with its starting and ending
  353. locations in the input string.
  354. This helper adds the following results names:
  355. - ``locn_start`` - location where matched expression begins
  356. - ``locn_end`` - location where matched expression ends
  357. - ``value`` - the actual parsed results
  358. Be careful if the input text contains ``<TAB>`` characters, you
  359. may want to call :meth:`ParserElement.parse_with_tabs`
  360. """
  361. warnings.warn(
  362. f"{'locatedExpr'!r} deprecated - use {'Located'!r}",
  363. DeprecationWarning,
  364. stacklevel=2,
  365. )
  366. locator = Empty().set_parse_action(lambda ss, ll, tt: ll)
  367. return Group(
  368. locator("locn_start")
  369. + expr("value")
  370. + locator.copy().leave_whitespace()("locn_end")
  371. )
  372. # define special default value to permit None as a significant value for
  373. # ignore_expr
  374. _NO_IGNORE_EXPR_GIVEN = NoMatch()
  375. def nested_expr(
  376. opener: Union[str, ParserElement] = "(",
  377. closer: Union[str, ParserElement] = ")",
  378. content: typing.Optional[ParserElement] = None,
  379. ignore_expr: typing.Optional[ParserElement] = _NO_IGNORE_EXPR_GIVEN,
  380. **kwargs,
  381. ) -> ParserElement:
  382. """Helper method for defining nested lists enclosed in opening and
  383. closing delimiters (``"("`` and ``")"`` are the default).
  384. :param opener: str - opening character for a nested list
  385. (default= ``"("``); can also be a pyparsing expression
  386. :param closer: str - closing character for a nested list
  387. (default= ``")"``); can also be a pyparsing expression
  388. :param content: expression for items within the nested lists
  389. :param ignore_expr: expression for ignoring opening and closing delimiters
  390. (default = :class:`quoted_string`)
  391. Parameter ``ignoreExpr`` is retained for compatibility
  392. but will be removed in a future release.
  393. If an expression is not provided for the content argument, the
  394. nested expression will capture all whitespace-delimited content
  395. between delimiters as a list of separate values.
  396. Use the ``ignore_expr`` argument to define expressions that may
  397. contain opening or closing characters that should not be treated as
  398. opening or closing characters for nesting, such as quoted_string or
  399. a comment expression. Specify multiple expressions using an
  400. :class:`Or` or :class:`MatchFirst`. The default is
  401. :class:`quoted_string`, but if no expressions are to be ignored, then
  402. pass ``None`` for this argument.
  403. Example:
  404. .. testcode::
  405. data_type = one_of("void int short long char float double")
  406. decl_data_type = Combine(data_type + Opt(Word('*')))
  407. ident = Word(alphas+'_', alphanums+'_')
  408. number = pyparsing_common.number
  409. arg = Group(decl_data_type + ident)
  410. LPAR, RPAR = map(Suppress, "()")
  411. code_body = nested_expr('{', '}', ignore_expr=(quoted_string | c_style_comment))
  412. c_function = (decl_data_type("type")
  413. + ident("name")
  414. + LPAR + Opt(DelimitedList(arg), [])("args") + RPAR
  415. + code_body("body"))
  416. c_function.ignore(c_style_comment)
  417. source_code = '''
  418. int is_odd(int x) {
  419. return (x%2);
  420. }
  421. int dec_to_hex(char hchar) {
  422. if (hchar >= '0' && hchar <= '9') {
  423. return (ord(hchar)-ord('0'));
  424. } else {
  425. return (10+ord(hchar)-ord('A'));
  426. }
  427. }
  428. '''
  429. for func in c_function.search_string(source_code):
  430. print(f"{func.name} ({func.type}) args: {func.args}")
  431. prints:
  432. .. testoutput::
  433. is_odd (int) args: [['int', 'x']]
  434. dec_to_hex (int) args: [['char', 'hchar']]
  435. """
  436. ignoreExpr: ParserElement = deprecate_argument(
  437. kwargs, "ignoreExpr", _NO_IGNORE_EXPR_GIVEN
  438. )
  439. if ignoreExpr != ignore_expr:
  440. ignoreExpr = ignore_expr if ignoreExpr is _NO_IGNORE_EXPR_GIVEN else ignoreExpr # type: ignore [assignment]
  441. if ignoreExpr is _NO_IGNORE_EXPR_GIVEN:
  442. ignoreExpr = quoted_string()
  443. if opener == closer:
  444. raise ValueError("opening and closing strings cannot be the same")
  445. if content is None:
  446. if isinstance(opener, str_type) and isinstance(closer, str_type):
  447. opener = typing.cast(str, opener)
  448. closer = typing.cast(str, closer)
  449. if len(opener) == 1 and len(closer) == 1:
  450. if ignoreExpr is not None:
  451. content = Combine(
  452. OneOrMore(
  453. ~ignoreExpr
  454. + CharsNotIn(
  455. opener + closer + ParserElement.DEFAULT_WHITE_CHARS,
  456. exact=1,
  457. )
  458. )
  459. )
  460. else:
  461. content = Combine(
  462. Empty()
  463. + CharsNotIn(
  464. opener + closer + ParserElement.DEFAULT_WHITE_CHARS
  465. )
  466. )
  467. else:
  468. if ignoreExpr is not None:
  469. content = Combine(
  470. OneOrMore(
  471. ~ignoreExpr
  472. + ~Literal(opener)
  473. + ~Literal(closer)
  474. + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1)
  475. )
  476. )
  477. else:
  478. content = Combine(
  479. OneOrMore(
  480. ~Literal(opener)
  481. + ~Literal(closer)
  482. + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1)
  483. )
  484. )
  485. else:
  486. raise ValueError(
  487. "opening and closing arguments must be strings if no content expression is given"
  488. )
  489. # for these internally-created context expressions, simulate whitespace-skipping
  490. if ParserElement.DEFAULT_WHITE_CHARS:
  491. content.set_parse_action(
  492. lambda t: t[0].strip(ParserElement.DEFAULT_WHITE_CHARS)
  493. )
  494. ret = Forward()
  495. if ignoreExpr is not None:
  496. ret <<= Group(
  497. _suppression(opener)
  498. + ZeroOrMore(ignoreExpr | ret | content)
  499. + _suppression(closer)
  500. )
  501. else:
  502. ret <<= Group(
  503. _suppression(opener) + ZeroOrMore(ret | content) + _suppression(closer)
  504. )
  505. ret.set_name(f"nested {opener}{closer} expression")
  506. # don't override error message from content expressions
  507. ret.errmsg = None
  508. return ret
  509. def _makeTags(tagStr, xml, suppress_LT=Suppress("<"), suppress_GT=Suppress(">")):
  510. """Internal helper to construct opening and closing tag expressions,
  511. given a tag name"""
  512. if isinstance(tagStr, str_type):
  513. resname = tagStr
  514. tagStr = Keyword(tagStr, caseless=not xml)
  515. else:
  516. resname = tagStr.name
  517. tagAttrName = Word(alphas, alphanums + "_-:")
  518. if xml:
  519. tagAttrValue = dbl_quoted_string.copy().set_parse_action(remove_quotes)
  520. openTag = (
  521. suppress_LT
  522. + tagStr("tag")
  523. + Dict(ZeroOrMore(Group(tagAttrName + Suppress("=") + tagAttrValue)))
  524. + Opt("/", default=[False])("empty").set_parse_action(
  525. lambda s, l, t: t[0] == "/"
  526. )
  527. + suppress_GT
  528. )
  529. else:
  530. tagAttrValue = quoted_string.copy().set_parse_action(remove_quotes) | Word(
  531. printables, exclude_chars=">"
  532. )
  533. openTag = (
  534. suppress_LT
  535. + tagStr("tag")
  536. + Dict(
  537. ZeroOrMore(
  538. Group(
  539. tagAttrName.set_parse_action(lambda t: t[0].lower())
  540. + Opt(Suppress("=") + tagAttrValue)
  541. )
  542. )
  543. )
  544. + Opt("/", default=[False])("empty").set_parse_action(
  545. lambda s, l, t: t[0] == "/"
  546. )
  547. + suppress_GT
  548. )
  549. closeTag = Combine(Literal("</") + tagStr + ">", adjacent=False)
  550. openTag.set_name(f"<{resname}>")
  551. # add start<tagname> results name in parse action now that ungrouped names are not reported at two levels
  552. openTag.add_parse_action(
  553. lambda t: t.__setitem__(
  554. "start" + "".join(resname.replace(":", " ").title().split()), t.copy()
  555. )
  556. )
  557. closeTag = closeTag(
  558. "end" + "".join(resname.replace(":", " ").title().split())
  559. ).set_name(f"</{resname}>")
  560. openTag.tag = resname
  561. closeTag.tag = resname
  562. openTag.tag_body = SkipTo(closeTag())
  563. return openTag, closeTag
  564. def make_html_tags(
  565. tag_str: Union[str, ParserElement],
  566. ) -> tuple[ParserElement, ParserElement]:
  567. """Helper to construct opening and closing tag expressions for HTML,
  568. given a tag name. Matches tags in either upper or lower case,
  569. attributes with namespaces and with quoted or unquoted values.
  570. Example:
  571. .. testcode::
  572. text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
  573. # make_html_tags returns pyparsing expressions for the opening and
  574. # closing tags as a 2-tuple
  575. a, a_end = make_html_tags("A")
  576. link_expr = a + SkipTo(a_end)("link_text") + a_end
  577. for link in link_expr.search_string(text):
  578. # attributes in the <A> tag (like "href" shown here) are
  579. # also accessible as named results
  580. print(link.link_text, '->', link.href)
  581. prints:
  582. .. testoutput::
  583. pyparsing -> https://github.com/pyparsing/pyparsing/wiki
  584. """
  585. return _makeTags(tag_str, False)
  586. def make_xml_tags(
  587. tag_str: Union[str, ParserElement],
  588. ) -> tuple[ParserElement, ParserElement]:
  589. """Helper to construct opening and closing tag expressions for XML,
  590. given a tag name. Matches tags only in the given upper/lower case.
  591. Example: similar to :class:`make_html_tags`
  592. """
  593. return _makeTags(tag_str, True)
  594. any_open_tag: ParserElement
  595. any_close_tag: ParserElement
  596. any_open_tag, any_close_tag = make_html_tags(
  597. Word(alphas, alphanums + "_:").set_name("any tag")
  598. )
  599. _htmlEntityMap = {k.rstrip(";"): v for k, v in html.entities.html5.items()}
  600. _most_common_entities = "nbsp lt gt amp quot apos cent pound euro copy".replace(
  601. " ", "|"
  602. )
  603. common_html_entity = Regex(
  604. lambda: f"&(?P<entity>{_most_common_entities}|{make_compressed_re(_htmlEntityMap)});"
  605. ).set_name("common HTML entity")
  606. def replace_html_entity(s, l, t):
  607. """Helper parser action to replace common HTML entities with their special characters"""
  608. return _htmlEntityMap.get(t.entity)
  609. class OpAssoc(Enum):
  610. """Enumeration of operator associativity
  611. - used in constructing InfixNotationOperatorSpec for :class:`infix_notation`"""
  612. LEFT = 1
  613. RIGHT = 2
# Type of the operator member of an infix_notation operator spec: a single
# operator given as an expression or a string, or a 2-tuple of operators
# (each an expression or a string).
InfixNotationOperatorArgType = Union[
    ParserElement, str, tuple[Union[ParserElement, str], Union[ParserElement, str]]
]
# Type of one entry in infix_notation's op_list: the operator, an int, an
# OpAssoc value, and an optional parse action (the parse action member may
# be left out of the tuple altogether).
InfixNotationOperatorSpec = Union[
    tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
        typing.Optional[ParseAction],
    ],
    tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
    ],
]
  630. def infix_notation(
  631. base_expr: ParserElement,
  632. op_list: list[InfixNotationOperatorSpec],
  633. lpar: Union[str, ParserElement] = Suppress("("),
  634. rpar: Union[str, ParserElement] = Suppress(")"),
  635. ) -> Forward:
  636. """Helper method for constructing grammars of expressions made up of
  637. operators working in a precedence hierarchy. Operators may be unary
  638. or binary, left- or right-associative. Parse actions can also be
  639. attached to operator expressions. The generated parser will also
  640. recognize the use of parentheses to override operator precedences
  641. (see example below).
  642. Note: if you define a deep operator list, you may see performance
  643. issues when using infix_notation. See
  644. :class:`ParserElement.enable_packrat` for a mechanism to potentially
  645. improve your parser performance.
  646. Parameters:
  647. :param base_expr: expression representing the most basic operand to
  648. be used in the expression
  649. :param op_list: list of tuples, one for each operator precedence level
  650. in the expression grammar; each tuple is of the form ``(op_expr,
  651. num_operands, right_left_assoc, (optional)parse_action)``, where:
  652. - ``op_expr`` is the pyparsing expression for the operator; may also
  653. be a string, which will be converted to a Literal; if ``num_operands``
  654. is 3, ``op_expr`` is a tuple of two expressions, for the two
  655. operators separating the 3 terms
  656. - ``num_operands`` is the number of terms for this operator (must be 1,
  657. 2, or 3)
  658. - ``right_left_assoc`` is the indicator whether the operator is right
  659. or left associative, using the pyparsing-defined constants
  660. ``OpAssoc.RIGHT`` and ``OpAssoc.LEFT``.
  661. - ``parse_action`` is the parse action to be associated with
  662. expressions matching this operator expression (the parse action
  663. tuple member may be omitted); if the parse action is passed
  664. a tuple or list of functions, this is equivalent to calling
  665. ``set_parse_action(*fn)``
  666. (:class:`ParserElement.set_parse_action`)
  667. :param lpar: expression for matching left-parentheses; if passed as a
  668. str, then will be parsed as ``Suppress(lpar)``. If lpar is passed as
  669. an expression (such as ``Literal('(')``), then it will be kept in
  670. the parsed results, and grouped with them. (default= ``Suppress('(')``)
  671. :param rpar: expression for matching right-parentheses; if passed as a
  672. str, then will be parsed as ``Suppress(rpar)``. If rpar is passed as
  673. an expression (such as ``Literal(')')``), then it will be kept in
  674. the parsed results, and grouped with them. (default= ``Suppress(')')``)
  675. Example:
  676. .. testcode::
  677. # simple example of four-function arithmetic with ints and
  678. # variable names
  679. integer = pyparsing_common.signed_integer
  680. varname = pyparsing_common.identifier
  681. arith_expr = infix_notation(integer | varname,
  682. [
  683. ('-', 1, OpAssoc.RIGHT),
  684. (one_of('* /'), 2, OpAssoc.LEFT),
  685. (one_of('+ -'), 2, OpAssoc.LEFT),
  686. ])
  687. arith_expr.run_tests('''
  688. 5+3*6
  689. (5+3)*6
  690. (5+x)*y
  691. -2--11
  692. ''', full_dump=False)
  693. prints:
  694. .. testoutput::
  695. :options: +NORMALIZE_WHITESPACE
  696. 5+3*6
  697. [[5, '+', [3, '*', 6]]]
  698. (5+3)*6
  699. [[[5, '+', 3], '*', 6]]
  700. (5+x)*y
  701. [[[5, '+', 'x'], '*', 'y']]
  702. -2--11
  703. [[['-', 2], '-', ['-', 11]]]
  704. """
  705. # captive version of FollowedBy that does not do parse actions or capture results names
  706. class _FB(FollowedBy):
  707. def parseImpl(self, instring, loc, doActions=True):
  708. self.expr.try_parse(instring, loc)
  709. return loc, []
  710. _FB.__name__ = "FollowedBy>"
  711. ret = Forward()
  712. ret.set_name(f"{base_expr.name}_expression")
  713. if isinstance(lpar, str):
  714. lpar = Suppress(lpar)
  715. if isinstance(rpar, str):
  716. rpar = Suppress(rpar)
  717. nested_expr = (lpar + ret + rpar).set_name(f"nested_{base_expr.name}_expression")
  718. # if lpar and rpar are not suppressed, wrap in group
  719. if not (isinstance(lpar, Suppress) and isinstance(rpar, Suppress)):
  720. lastExpr = base_expr | Group(nested_expr)
  721. else:
  722. lastExpr = base_expr | nested_expr
  723. arity: int
  724. rightLeftAssoc: opAssoc
  725. pa: typing.Optional[ParseAction]
  726. opExpr1: ParserElement
  727. opExpr2: ParserElement
  728. matchExpr: ParserElement
  729. match_lookahead: ParserElement
  730. for operDef in op_list:
  731. opExpr, arity, rightLeftAssoc, pa = (operDef + (None,))[:4] # type: ignore[assignment]
  732. if isinstance(opExpr, str_type):
  733. opExpr = ParserElement._literalStringClass(opExpr)
  734. opExpr = typing.cast(ParserElement, opExpr)
  735. if arity == 3:
  736. if not isinstance(opExpr, (tuple, list)) or len(opExpr) != 2:
  737. raise ValueError(
  738. "if numterms=3, opExpr must be a tuple or list of two expressions"
  739. )
  740. opExpr1, opExpr2 = opExpr
  741. term_name = f"{opExpr1}{opExpr2} operations"
  742. else:
  743. term_name = f"{opExpr} operations"
  744. if not 1 <= arity <= 3:
  745. raise ValueError("operator must be unary (1), binary (2), or ternary (3)")
  746. if rightLeftAssoc not in (OpAssoc.LEFT, OpAssoc.RIGHT):
  747. raise ValueError("operator must indicate right or left associativity")
  748. thisExpr: ParserElement = Forward().set_name(term_name)
  749. thisExpr = typing.cast(Forward, thisExpr)
  750. match_lookahead = And([])
  751. if rightLeftAssoc is OpAssoc.LEFT:
  752. if arity == 1:
  753. match_lookahead = _FB(lastExpr + opExpr)
  754. matchExpr = Group(lastExpr + opExpr[1, ...])
  755. elif arity == 2:
  756. if opExpr is not None:
  757. match_lookahead = _FB(lastExpr + opExpr + lastExpr)
  758. matchExpr = Group(lastExpr + (opExpr + lastExpr)[1, ...])
  759. else:
  760. match_lookahead = _FB(lastExpr + lastExpr)
  761. matchExpr = Group(lastExpr[2, ...])
  762. elif arity == 3:
  763. match_lookahead = _FB(
  764. lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr
  765. )
  766. matchExpr = Group(
  767. lastExpr + (opExpr1 + lastExpr + opExpr2 + lastExpr)[1, ...]
  768. )
  769. elif rightLeftAssoc is OpAssoc.RIGHT:
  770. if arity == 1:
  771. # try to avoid LR with this extra test
  772. if not isinstance(opExpr, Opt):
  773. opExpr = Opt(opExpr)
  774. match_lookahead = _FB(opExpr.expr + thisExpr)
  775. matchExpr = Group(opExpr + thisExpr)
  776. elif arity == 2:
  777. if opExpr is not None:
  778. match_lookahead = _FB(lastExpr + opExpr + thisExpr)
  779. matchExpr = Group(lastExpr + (opExpr + thisExpr)[1, ...])
  780. else:
  781. match_lookahead = _FB(lastExpr + thisExpr)
  782. matchExpr = Group(lastExpr + thisExpr[1, ...])
  783. elif arity == 3:
  784. match_lookahead = _FB(
  785. lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr
  786. )
  787. matchExpr = Group(lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr)
  788. # suppress lookahead expr from railroad diagrams
  789. match_lookahead.show_in_diagram = False
  790. # TODO - determine why this statement can't be included in the following
  791. # if pa block
  792. matchExpr = match_lookahead + matchExpr
  793. if pa:
  794. if isinstance(pa, (tuple, list)):
  795. matchExpr.set_parse_action(*pa)
  796. else:
  797. matchExpr.set_parse_action(pa)
  798. thisExpr <<= (matchExpr | lastExpr).set_name(term_name)
  799. lastExpr = thisExpr
  800. ret <<= lastExpr
  801. return ret
  802. def indentedBlock(blockStatementExpr, indentStack, indent=True, backup_stacks=[]):
  803. """
  804. .. deprecated:: 3.0.0
  805. Use the :class:`IndentedBlock` class instead. Note that `IndentedBlock`
  806. has a difference method signature.
  807. Helper method for defining space-delimited indentation blocks,
  808. such as those used to define block statements in Python source code.
  809. :param blockStatementExpr: expression defining syntax of statement that
  810. is repeated within the indented block
  811. :param indentStack: list created by caller to manage indentation stack
  812. (multiple ``statementWithIndentedBlock`` expressions within a single
  813. grammar should share a common ``indentStack``)
  814. :param indent: boolean indicating whether block must be indented beyond
  815. the current level; set to ``False`` for block of left-most statements
  816. A valid block must contain at least one ``blockStatement``.
  817. (Note that indentedBlock uses internal parse actions which make it
  818. incompatible with packrat parsing.)
  819. Example:
  820. .. testcode::
  821. data = '''
  822. def A(z):
  823. A1
  824. B = 100
  825. G = A2
  826. A2
  827. A3
  828. B
  829. def BB(a,b,c):
  830. BB1
  831. def BBA():
  832. bba1
  833. bba2
  834. bba3
  835. C
  836. D
  837. def spam(x,y):
  838. def eggs(z):
  839. pass
  840. '''
  841. indentStack = [1]
  842. stmt = Forward()
  843. identifier = Word(alphas, alphanums)
  844. funcDecl = ("def" + identifier + Group("(" + Opt(delimitedList(identifier)) + ")") + ":")
  845. func_body = indentedBlock(stmt, indentStack)
  846. funcDef = Group(funcDecl + func_body)
  847. rvalue = Forward()
  848. funcCall = Group(identifier + "(" + Opt(delimitedList(rvalue)) + ")")
  849. rvalue << (funcCall | identifier | Word(nums))
  850. assignment = Group(identifier + "=" + rvalue)
  851. stmt << (funcDef | assignment | identifier)
  852. module_body = stmt[1, ...]
  853. parseTree = module_body.parseString(data)
  854. parseTree.pprint()
  855. prints:
  856. .. testoutput::
  857. [['def',
  858. 'A',
  859. ['(', 'z', ')'],
  860. ':',
  861. [['A1'], [['B', '=', '100']], [['G', '=', 'A2']], ['A2'], ['A3']]],
  862. 'B',
  863. ['def',
  864. 'BB',
  865. ['(', 'a', 'b', 'c', ')'],
  866. ':',
  867. [['BB1'], [['def', 'BBA', ['(', ')'], ':', [['bba1'], ['bba2'], ['bba3']]]]]],
  868. 'C',
  869. 'D',
  870. ['def',
  871. 'spam',
  872. ['(', 'x', 'y', ')'],
  873. ':',
  874. [[['def', 'eggs', ['(', 'z', ')'], ':', [['pass']]]]]]]
  875. """
  876. warnings.warn(
  877. f"{'indentedBlock'!r} deprecated - use {'IndentedBlock'!r}",
  878. DeprecationWarning,
  879. stacklevel=2,
  880. )
  881. backup_stacks.append(indentStack[:])
  882. def reset_stack():
  883. indentStack[:] = backup_stacks[-1]
  884. def checkPeerIndent(s, l, t):
  885. if l >= len(s):
  886. return
  887. curCol = col(l, s)
  888. if curCol != indentStack[-1]:
  889. if curCol > indentStack[-1]:
  890. raise ParseException(s, l, "illegal nesting")
  891. raise ParseException(s, l, "not a peer entry")
  892. def checkSubIndent(s, l, t):
  893. curCol = col(l, s)
  894. if curCol > indentStack[-1]:
  895. indentStack.append(curCol)
  896. else:
  897. raise ParseException(s, l, "not a subentry")
  898. def checkUnindent(s, l, t):
  899. if l >= len(s):
  900. return
  901. curCol = col(l, s)
  902. if not (indentStack and curCol in indentStack):
  903. raise ParseException(s, l, "not an unindent")
  904. if curCol < indentStack[-1]:
  905. indentStack.pop()
  906. NL = OneOrMore(LineEnd().set_whitespace_chars("\t ").suppress())
  907. INDENT = (Empty() + Empty().set_parse_action(checkSubIndent)).set_name("INDENT")
  908. PEER = Empty().set_parse_action(checkPeerIndent).set_name("")
  909. UNDENT = Empty().set_parse_action(checkUnindent).set_name("UNINDENT")
  910. if indent:
  911. smExpr = Group(
  912. Opt(NL)
  913. + INDENT
  914. + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
  915. + UNDENT
  916. )
  917. else:
  918. smExpr = Group(
  919. Opt(NL)
  920. + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
  921. + Opt(UNDENT)
  922. )
  923. # add a parse action to remove backup_stack from list of backups
  924. smExpr.add_parse_action(
  925. lambda: backup_stacks.pop(-1) and None if backup_stacks else None
  926. )
  927. smExpr.set_fail_action(lambda a, b, c, d: reset_stack())
  928. blockStatementExpr.ignore(_bslash + LineEnd())
  929. return smExpr.set_name("indented block")
# it's easy to get these comment structures wrong - they're very common,
# so may as well make them available

# Pattern: "/*", then any run of characters in which a "*" is never
# immediately followed by "/", then the closing "*/".
c_style_comment = Regex(r"/\*(?:[^*]|\*(?!/))*\*\/").set_name("C style comment")
"Comment of the form ``/* ... */``"

# Pattern: "<!--", then the shortest possible run of any characters
# (newlines included, via [\s\S]) up to the first "-->".
html_comment = Regex(r"<!--[\s\S]*?-->").set_name("HTML comment")
"Comment of the form ``<!-- ... -->``"

# Pattern ".*" with leave_whitespace() applied.
# NOTE(review): presumably leave_whitespace() keeps leading whitespace in the
# match instead of skipping it - confirm against ParserElement.
rest_of_line = Regex(r".*").leave_whitespace().set_name("rest of line")

# Pattern: "//" followed by any mix of non-newline characters and
# backslash-newline pairs, so a trailing backslash carries the comment
# onto the following line.
dbl_slash_comment = Regex(r"//(?:\\\n|[^\n])*").set_name("// comment")
"Comment of the form ``// ... (to end of line)``"

# Single pattern that alternates the two patterns used above for
# c_style_comment and dbl_slash_comment.
cpp_style_comment = Regex(
    r"(?:/\*(?:[^*]|\*(?!/))*\*\/)|(?://(?:\\\n|[^\n])*)"
).set_name("C++ style comment")
"Comment of either form :class:`c_style_comment` or :class:`dbl_slash_comment`"

# Plain alias: the same expression object as cpp_style_comment.
java_style_comment = cpp_style_comment
"Same as :class:`cpp_style_comment`"

# Pattern: "#" followed by ".*".
python_style_comment = Regex(r"#.*").set_name("Python style comment")
"Comment of the form ``# ... (to end of line)``"


# build list of built-in expressions, for future reference if a global default value
# gets updated
# The list is a snapshot: it holds the ParserElement instances bound in the
# current namespace (vars()) when this statement runs, so only names bound
# above this point are scanned.
_builtin_exprs: list[ParserElement] = [
    v for v in vars().values() if isinstance(v, ParserElement)
]
  952. # compatibility function, superseded by DelimitedList class
  953. def delimited_list(
  954. expr: Union[str, ParserElement],
  955. delim: Union[str, ParserElement] = ",",
  956. combine: bool = False,
  957. min: typing.Optional[int] = None,
  958. max: typing.Optional[int] = None,
  959. *,
  960. allow_trailing_delim: bool = False,
  961. ) -> ParserElement:
  962. """
  963. .. deprecated:: 3.1.0
  964. Use the :class:`DelimitedList` class instead.
  965. """
  966. return DelimitedList(
  967. expr, delim, combine, min, max, allow_trailing_delim=allow_trailing_delim
  968. )
# Compatibility synonyms
# Older (mostly camelCase) spellings are kept bound here next to the
# snake_case names they correspond to.
# fmt: off
# Plain aliases: each old name is bound to the very same object as the
# name on the right-hand side.
opAssoc = OpAssoc
anyOpenTag = any_open_tag
anyCloseTag = any_close_tag
commonHTMLEntity = common_html_entity
cStyleComment = c_style_comment
htmlComment = html_comment
restOfLine = rest_of_line
dblSlashComment = dbl_slash_comment
cppStyleComment = cpp_style_comment
javaStyleComment = java_style_comment
pythonStyleComment = python_style_comment
# Callables are bound through replaced_by_pep8(<old name>, <replacement>).
# NOTE(review): presumably this returns a wrapper that forwards to the
# replacement - confirm against the definition of replaced_by_pep8.
delimitedList = replaced_by_pep8("delimitedList", DelimitedList)
# This also rebinds the snake_case name, replacing the ``delimited_list``
# function defined earlier in this module.
delimited_list = replaced_by_pep8("delimited_list", DelimitedList)
countedArray = replaced_by_pep8("countedArray", counted_array)
matchPreviousLiteral = replaced_by_pep8("matchPreviousLiteral", match_previous_literal)
matchPreviousExpr = replaced_by_pep8("matchPreviousExpr", match_previous_expr)
oneOf = replaced_by_pep8("oneOf", one_of)
dictOf = replaced_by_pep8("dictOf", dict_of)
originalTextFor = replaced_by_pep8("originalTextFor", original_text_for)
nestedExpr = replaced_by_pep8("nestedExpr", nested_expr)
makeHTMLTags = replaced_by_pep8("makeHTMLTags", make_html_tags)
makeXMLTags = replaced_by_pep8("makeXMLTags", make_xml_tags)
replaceHTMLEntity = replaced_by_pep8("replaceHTMLEntity", replace_html_entity)
infixNotation = replaced_by_pep8("infixNotation", infix_notation)
# fmt: on