pageobjects.py 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
  1. # SPDX-FileCopyrightText: 2026 geisserml <geisserml@gmail.com>
  2. # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
  # TODO: confirm the --filter and --info parameters with tests
  4. from collections import OrderedDict
  5. import pypdfium2._helpers as pdfium
  6. import pypdfium2.internal as pdfium_i
  7. import pypdfium2.raw as pdfium_c
  8. from pypdfium2._cli._parsers import (
  9. add_input,
  10. add_n_digits,
  11. get_input,
  12. round_list,
  13. iterator_hasvalue,
  14. )
  # Values accepted by --info: "pos" selects the bounding box / quad point output,
  # "imginfo" selects the filter and metadata output for image objects.
  INFO_PARAMS = ("pos", "imginfo")
  PARAM_POS, PARAM_IMGINFO = INFO_PARAMS
  def attach(parser):
      """Add this command's arguments to *parser*.

      Registers the shared input/pages and n-digits arguments through
      ``add_input()`` and ``add_n_digits()``, followed by the ``--filter``,
      ``--max-depth`` and ``--info`` options.
      """
      add_input(parser, pages=True)
      add_n_digits(parser)

      # TODO think out strategy for choices (see https://github.com/python/cpython/issues/69247)
      type_names = list(pdfium_i.ObjectTypeToConst.keys())

      # (flag, keyword arguments) pairs, registered in this order.
      option_specs = (
          ("--filter", dict(
              nargs="+",
              metavar="T",
              choices=type_names,
              help=f"Object types to include. Choices: {type_names}",
          )),
          ("--max-depth", dict(
              type=int,
              default=2,
              help="Maximum recursion depth to consider when descending into Form XObjects.",
          )),
          ("--info", dict(
              nargs="+",
              type=str.lower,
              choices=INFO_PARAMS,
              default=INFO_PARAMS,
              help="Object details to show.",
          )),
      )
      for flag, spec in option_specs:
          parser.add_argument(flag, **spec)
  def print_img_metadata(m, n_digits, pad=""):
      """Print the fields of image metadata *m*, one ``key: value`` line each.

      Both DPI values are passed through ``round(..., n_digits)``. Every line
      is prefixed with *pad*. ``marked_content_id`` is only printed when it
      is not -1.
      """
      rows = [
          ("width", m.width),
          ("height", m.height),
          ("horizontal_dpi", round(m.horizontal_dpi, n_digits)),
          ("vertical_dpi", round(m.vertical_dpi, n_digits)),
          ("bits_per_pixel", m.bits_per_pixel),
          ("colorspace", pdfium_i.ColorspaceToStr.get(m.colorspace)),
      ]
      if m.marked_content_id != -1:
          rows.append(("marked_content_id", m.marked_content_id))

      for label, value in rows:
          print(pad + f"{label}: {value}")
  def main(args):
      """Print the page objects of the selected pages, with per-page and total counts.

      Reads ``args.filter``, ``args.info``, ``args.pages``, ``args.max_depth``
      and ``args.n_digits``. When a filter is given, ``args.filter`` is
      replaced in place by the matching object type constants.

      For every object the type name is printed, indented by ``obj.level``.
      With "pos" in ``args.info`` the bounding box is shown, plus quad points
      for image and text objects. With "imginfo" in ``args.info`` the filters
      and metadata of ``PdfImage`` objects are shown. Pages that yield no
      objects are skipped silently.
      """
      pdf = get_input(args)

      # if no filter is given, leave it at None (make a difference in case of unhandled object types)
      if args.filter:
          args.filter = [pdfium_i.ObjectTypeToConst[t] for t in args.filter]

      show_pos = PARAM_POS in args.info
      show_imginfo = PARAM_IMGINFO in args.info
      # Invariant: --info takes at least one value from INFO_PARAMS.
      assert show_pos or show_imginfo

      total_count = 0
      for i in args.pages:
          page = pdf[i]
          hasvalue, obj_searcher = iterator_hasvalue(
              page.get_objects(args.filter, max_depth=args.max_depth)
          )
          if not hasvalue:
              continue

          print(f"# Page {i+1}")
          count = 0

          for obj in obj_searcher:
              pad_0 = " " * obj.level
              pad_1 = pad_0 + " "

              # .get() returns None for a type missing from the mapping, and
              # concatenating None to a str raises TypeError. Fall back to a
              # label so an unmapped type is reported instead of crashing.
              type_name = pdfium_i.ObjectTypeToStr.get(obj.type)
              if type_name is None:
                  type_name = f"unknown type {obj.type}"
              print(pad_0 + type_name)

              if show_pos:
                  bounds = round_list(obj.get_bounds(), args.n_digits)
                  print(pad_1 + f"Bounding Box: {bounds}")
                  if obj.type in (pdfium_c.FPDF_PAGEOBJ_IMAGE, pdfium_c.FPDF_PAGEOBJ_TEXT):
                      quad_bounds = obj.get_quad_points()
                      print(pad_1 + f"Quad Points: {[round_list(p, args.n_digits) for p in quad_bounds]}")

              if show_imginfo and isinstance(obj, pdfium.PdfImage):
                  print(pad_1 + f"Filters: {obj.get_filters()}")
                  metadata = obj.get_metadata()
                  # Sanity check: metadata and pixel size should agree.
                  assert (metadata.width, metadata.height) == obj.get_px_size()
                  print_img_metadata(metadata, args.n_digits, pad=pad_1)

              count += 1

          if count > 0:
              print(f"-> Count: {count}\n")
              total_count += count

      if total_count > 0:
          print(f"-> Total count: {total_count}")