_parsers.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. # SPDX-FileCopyrightText: 2026 geisserml <geisserml@gmail.com>
  2. # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
  3. import os
  4. import sys
  5. import logging
  6. import argparse
  7. from pathlib import Path
  8. import pypdfium2._helpers as pdfium
  9. import pypdfium2.internal as pdfium_i
  10. def setup_logging():
  11. # could also pass through the log level by parameter, but using an env var seemed easiest for now
  12. debug_autoclose = bool(int( os.environ.get("DEBUG_AUTOCLOSE", 0) ))
  13. loglevel = getattr(logging, os.environ.get("PYPDFIUM_LOGLEVEL", "debug").upper())
  14. pdfium_i.DEBUG_AUTOCLOSE.value = debug_autoclose
  15. lib_logger = logging.getLogger("pypdfium2")
  16. lib_logger.addHandler(logging.StreamHandler())
  17. lib_logger.setLevel(loglevel)
  18. pdfium.PdfUnspHandler().setup()
  19. def parse_numtext(numtext):
  20. if not numtext:
  21. return None
  22. indices = []
  23. for num_or_range in numtext.split(","):
  24. if "-" in num_or_range:
  25. start, end = num_or_range.split("-")
  26. start = int(start) - 1
  27. end = int(end) - 1
  28. if start < end:
  29. indices.extend( [i for i in range(start, end+1)] )
  30. else:
  31. indices.extend( [i for i in range(start, end-1, -1)] )
  32. else:
  33. indices.append(int(num_or_range) - 1)
  34. return indices
  35. def round_list(lst, n_digits):
  36. if not lst:
  37. return lst
  38. result = [round(v, n_digits) for v in lst]
  39. if isinstance(lst, tuple):
  40. result = tuple(result)
  41. return result
  42. def add_input(parser, pages=True):
  43. # TODO add option to open file with buffer/bytes strategy
  44. parser.add_argument(
  45. "input",
  46. type = Path,
  47. help = "Input PDF document",
  48. )
  49. parser.add_argument(
  50. "--password",
  51. help = "A password to unlock the PDF, if encrypted",
  52. )
  53. if pages:
  54. parser.add_argument(
  55. "--pages",
  56. default = None,
  57. type = parse_numtext,
  58. help = "Page numbers and ranges to include",
  59. )
  60. def add_n_digits(parser):
  61. parser.add_argument(
  62. "--n-digits",
  63. type = int,
  64. default = 4,
  65. help = "Number of digits to which coordinates/sizes shall be rounded",
  66. )
  67. def get_input(args, init_forms=False, **kwargs):
  68. pdf = pdfium.PdfDocument(args.input, password=args.password, **kwargs)
  69. if init_forms:
  70. pdf.init_forms()
  71. if "pages" in args and not args.pages:
  72. args.pages = [i for i in range(len(pdf))]
  73. # TODO else validate pages, as seen in ./render.py
  74. return pdf
  75. # dummy more_itertools.peekable().__bool__ alternative
  76. def _postpeek_generator(value, iterator):
  77. yield value; yield from iterator
  78. def iterator_hasvalue(iterator):
  79. try:
  80. first_value = next(iterator)
  81. except StopIteration:
  82. return False, None
  83. else:
  84. return True, _postpeek_generator(first_value, iterator)
  85. if sys.version_info >= (3, 9):
  86. from argparse import BooleanOptionalAction
  87. else:
  88. # backport, adapted from argparse sources
  89. class BooleanOptionalAction (argparse.Action):
  90. def __init__(self, option_strings, dest, **kwargs):
  91. _option_strings = []
  92. for option_string in option_strings:
  93. _option_strings.append(option_string)
  94. if option_string.startswith('--'):
  95. option_string = '--no-' + option_string[2:]
  96. _option_strings.append(option_string)
  97. super().__init__(option_strings=_option_strings, dest=dest, nargs=0, **kwargs)
  98. def __call__(self, parser, namespace, values, option_string=None):
  99. if option_string in self.option_strings:
  100. setattr(namespace, self.dest, not option_string.startswith('--no-'))
  101. def format_usage(self):
  102. return ' | '.join(self.option_strings)