extract_text.py 1.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445
  1. # SPDX-FileCopyrightText: 2026 geisserml <geisserml@gmail.com>
  2. # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
  3. from pypdfium2._cli._parsers import add_input, get_input
  4. EXTRACT_RANGE = "range"
  5. EXTRACT_BOUNDED = "bounded"
  6. # __main__.py hook
  7. PARSER_DESC = """\
  8. Note that PDFium outputs CRLF (\\r\\n) style line breaks.
  9. This may be undesirable or confusing in some situations, e.g. when processing the output with an (unaware) parser on the command line.
  10. If this is an issue, run e.g. `dos2unix` on the output, or use the Python API.\
  11. """
  12. def attach(parser):
  13. add_input(parser, pages=True)
  14. parser.add_argument(
  15. "--strategy",
  16. default = EXTRACT_RANGE,
  17. choices = (EXTRACT_RANGE, EXTRACT_BOUNDED),
  18. help = "PDFium text extraction strategy (range, bounded).",
  19. )
  20. def main(args):
  21. pdf = get_input(args)
  22. sep = ""
  23. for i in args.pages:
  24. page = pdf[i]
  25. textpage = page.get_textpage()
  26. # TODO let caller pass in possible range/boundary parameters
  27. if args.strategy == EXTRACT_RANGE:
  28. text = textpage.get_text_range()
  29. elif args.strategy == EXTRACT_BOUNDED:
  30. text = textpage.get_text_bounded()
  31. else:
  32. assert False
  33. print(sep + f"# Page {i+1}\n" + text)
  34. sep = "\n"