pageobjects.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603
  1. # SPDX-FileCopyrightText: 2026 geisserml <geisserml@gmail.com>
  2. # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
  3. __all__ = ("PdfObject", "PdfImage", "PdfTextObj", "PdfFont")
  4. import ctypes
  5. from ctypes import c_uint, c_float
  6. import logging
  7. from pathlib import Path
  8. from collections import namedtuple
  9. import pypdfium2.raw as pdfium_c
  10. import pypdfium2.internal as pdfium_i
  11. from pypdfium2._helpers.misc import PdfiumError
  12. from pypdfium2._helpers.matrix import PdfMatrix
  13. from pypdfium2._helpers.bitmap import PdfBitmap
  14. from pypdfium2._lazy import Lazy
  15. logger = logging.getLogger(__name__)
  16. class PdfObject (pdfium_i.AutoCloseable):
  17. """
  18. Pageobject helper class.
  19. When constructing a :class:`.PdfObject`, an instance of a more specific subclass may be returned instead, depending on the object's :attr:`.type` (e.g. :class:`.PdfImage`, :class:`.PdfTextObj`).
  20. Note:
  21. :meth:`.PdfObject.close` only takes effect on loose pageobjects.
  22. It is a no-op otherwise, because pageobjects that are part of a page are owned by pdfium, not the caller.
  23. Attributes:
  24. raw (FPDF_PAGEOBJECT):
  25. The underlying PDFium pageobject handle.
  26. type (int):
  27. The object's type (:data:`FPDF_PAGEOBJ_*`).
  28. page (PdfPage):
  29. Reference to the page this pageobject belongs to. May be None if not part of a page (e.g. new or detached object).
  30. pdf (PdfDocument):
  31. Reference to the document this pageobject belongs to. May be None if the object does not belong to a document yet.
  32. This attribute is always set if :attr:`.page` is set.
  33. container (PdfObject | None):
  34. PdfObject handle to parent Form XObject, if the pageobject is nested in a Form XObject, None otherwise.
  35. level (int):
  36. Nesting level signifying the number of parent Form XObjects, at the time of construction.
  37. Zero if the object is not nested in a Form XObject.
  38. """
  39. def __new__(cls, raw, *args, **kwargs):
  40. type = pdfium_c.FPDFPageObj_GetType(raw)
  41. if type == pdfium_c.FPDF_PAGEOBJ_IMAGE:
  42. instance = super().__new__(PdfImage)
  43. elif type == pdfium_c.FPDF_PAGEOBJ_TEXT:
  44. instance = super().__new__(PdfTextObj)
  45. else:
  46. instance = super().__new__(PdfObject)
  47. instance.type = type
  48. return instance
  49. def __init__(self, raw, page=None, pdf=None, container=None, level=0):
  50. self.raw = raw
  51. self.page = page
  52. self.pdf = pdf
  53. self.container = container
  54. self.level = level
  55. if page is not None:
  56. if self.pdf is None:
  57. self.pdf = page.pdf
  58. elif self.pdf is not page.pdf:
  59. raise ValueError("*page* must belong to *pdf* when constructing a pageobject.")
  60. # TODO if page is not None, hold it in the finalizer, unless the pageobject is detached from the page
  61. super().__init__(pdfium_c.FPDFPageObj_Destroy, needs_free=(page is None))
  62. @property
  63. def parent(self): # AutoCloseable hook
  64. # May be None (loose pageobject)
  65. return self.pdf if self.page is None else self.page
  66. def get_bounds(self):
  67. """
  68. Get the bounds of the object on the page.
  69. Returns:
  70. tuple[float * 4]: Left, bottom, right and top, in PDF page coordinates.
  71. """
  72. if self.page is None:
  73. raise RuntimeError("Must not call get_bounds() on a loose pageobject.")
  74. l, b, r, t = c_float(), c_float(), c_float(), c_float()
  75. ok = pdfium_c.FPDFPageObj_GetBounds(self, l, b, r, t)
  76. if not ok:
  77. raise PdfiumError("Failed to locate pageobject.")
  78. return (l.value, b.value, r.value, t.value)
  79. def get_quad_points(self):
  80. """
  81. Get the object's quadriliteral points (i.e. the positions of its corners).
  82. For transformed objects, this may provide tighter bounds than a rectangle (e.g. rotation by a non-multiple of 90°, shear).
  83. Note:
  84. This function only supports image and text objects.
  85. Returns:
  86. tuple[tuple[float*2] * 4]: Corner positions as (x, y) tuples, counter-clockwise from origin, i.e. bottom-left, bottom-right, top-right, top-left, in PDF page coordinates.
  87. """
  88. if self.type not in (pdfium_c.FPDF_PAGEOBJ_IMAGE, pdfium_c.FPDF_PAGEOBJ_TEXT):
  89. # as of pdfium 5921
  90. raise RuntimeError("Quad points only supported for image and text objects.")
  91. q = pdfium_c.FS_QUADPOINTSF()
  92. ok = pdfium_c.FPDFPageObj_GetRotatedBounds(self, q)
  93. if not ok:
  94. raise PdfiumError("Failed to get quad points.")
  95. return (q.x1, q.y1), (q.x2, q.y2), (q.x3, q.y3), (q.x4, q.y4)
  96. def get_matrix(self):
  97. """
  98. Returns:
  99. PdfMatrix: The pageobject's current transform matrix.
  100. """
  101. fs_matrix = pdfium_c.FS_MATRIX()
  102. ok = pdfium_c.FPDFPageObj_GetMatrix(self, fs_matrix)
  103. if not ok:
  104. raise PdfiumError("Failed to get matrix of pageobject.")
  105. return PdfMatrix.from_raw(fs_matrix)
  106. def set_matrix(self, matrix):
  107. """
  108. Parameters:
  109. matrix (PdfMatrix): Set this matrix as the pageobject's transform matrix.
  110. """
  111. ok = pdfium_c.FPDFPageObj_SetMatrix(self, matrix)
  112. if not ok:
  113. raise PdfiumError("Failed to set matrix of pageobject.")
  114. def transform(self, matrix):
  115. """
  116. Parameters:
  117. matrix (PdfMatrix): Multiply the pageobject's current transform matrix by this matrix.
  118. """
  119. ok = pdfium_c.FPDFPageObj_TransformF(self, matrix)
  120. if not ok:
  121. raise PdfiumError("Failed to transform pageobject with matrix.")
  122. class PdfTextObj (PdfObject):
  123. """
  124. Textobject helper class.
  125. You may want to call :meth:`.PdfPage.get_objects` or :meth:`.PdfTextPage.get_textobj` to obtain an instance of this class.
  126. """
  127. # TODO hold parent object in finalizer
  128. def __init__(self, *args, textpage=None, **kwargs):
  129. if textpage is not None:
  130. kwargs.update(page=textpage.page, pdf=textpage.page.pdf)
  131. super().__init__(*args, **kwargs)
  132. self.textpage = textpage
  133. def extract(self):
  134. """
  135. Returns:
  136. str: The objects's text content.
  137. """
  138. bufsize = pdfium_c.FPDFTextObj_GetText(self, self.textpage, None, 0)
  139. if bufsize == 0:
  140. raise PdfiumError("Failed to get text from textobject.")
  141. buffer = ctypes.create_string_buffer(bufsize)
  142. buffer_ptr = ctypes.cast(buffer, ctypes.POINTER(pdfium_c.FPDF_WCHAR))
  143. pdfium_c.FPDFTextObj_GetText(self, self.textpage, buffer_ptr, bufsize)
  144. return buffer.raw[:bufsize-2].decode("utf-16-le")
  145. def get_font(self):
  146. """
  147. Returns:
  148. PdfFont: Handle to the object's font. Provides name and weight info.
  149. """
  150. # The font object is _not_ owned by the caller, and the PdfTextObj must remain alive while the font object lives.
  151. raw_font = pdfium_c.FPDFTextObj_GetFont(self)
  152. return PdfFont(raw_font, self)
  153. def get_font_size(self):
  154. """
  155. Returns:
  156. float: Font size used by the object's text, in PDF canvas units (typically 1/72in).
  157. """
  158. r_size = ctypes.c_float()
  159. ok = pdfium_c.FPDFTextObj_GetFontSize(self, r_size)
  160. if not ok:
  161. raise PdfiumError("Failed to get font size.")
  162. return r_size.value
  163. class PdfFont (pdfium_i.AutoCastable):
  164. """
  165. Font helper class.
  166. """
  167. # TODO hold parent in finalizer
  168. def __init__(self, raw, parent=None):
  169. self.raw = raw
  170. self.parent = parent
  171. def _get_name_impl(self, api, which):
  172. bufsize = api(self, None, 0)
  173. if bufsize == 0:
  174. raise PdfiumError(f"Failed to get font {which} name.")
  175. buffer = ctypes.create_string_buffer(bufsize)
  176. api(self, buffer, bufsize)
  177. return buffer.value.decode("utf-8")
  178. def get_base_name(self):
  179. """
  180. Returns:
  181. str: The base font name.
  182. """
  183. return self._get_name_impl(pdfium_c.FPDFFont_GetBaseFontName, "base")
  184. def get_family_name(self):
  185. """
  186. Returns:
  187. str: The font family name.
  188. """
  189. return self._get_name_impl(pdfium_c.FPDFFont_GetFamilyName, "family")
  190. def get_weight(self):
  191. """
  192. Returns:
  193. int: The font's weight. Typical values are 400 (normal) and 700 (bold).
  194. """
  195. weight = pdfium_c.FPDFFont_GetWeight(self)
  196. if weight == -1:
  197. raise PdfiumError("Failed to get font weight.")
  198. return weight
  199. class PdfImage (PdfObject):
  200. """
  201. Image object helper class (specific kind of pageobject).
  202. """
  203. # cf. https://crbug.com/pdfium/1203
  204. #: Filters applied by :func:`FPDFImageObj_GetImageDataDecoded`, referred to as "simple filters". Other filters are considered "complex filters".
  205. SIMPLE_FILTERS = ("ASCIIHexDecode", "ASCII85Decode", "RunLengthDecode", "FlateDecode", "LZWDecode")
  206. @classmethod
  207. def new(cls, pdf):
  208. """
  209. Parameters:
  210. pdf (PdfDocument): The document to which the new image object shall be added.
  211. Returns:
  212. PdfImage: Handle to a new, empty image.
  213. Note that position and size of the image are defined by its matrix, which defaults to the identity matrix.
  214. This means that new images will appear as a tiny square of 1x1 canvas units on the bottom left corner of the page.
  215. Use :class:`.PdfMatrix` and :meth:`.set_matrix` to adjust size and position.
  216. """
  217. raw_img = pdfium_c.FPDFPageObj_NewImageObj(pdf)
  218. return cls(raw_img, page=None, pdf=pdf)
  219. def get_metadata(self):
  220. """
  221. Retrieve image metadata including DPI, bits per pixel, color space, and size.
  222. If the image does not belong to a page yet, bits per pixel and color space will be unset (0).
  223. Note:
  224. * The DPI values signify the resolution of the image on the PDF page, not the DPI metadata embedded in the image file.
  225. * Due to issues in pdfium, this function might be slow on some kinds of images. If you only need size, prefer :meth:`.get_px_size` instead.
  226. Returns:
  227. FPDF_IMAGEOBJ_METADATA: Image metadata structure
  228. """
  229. # https://crbug.com/pdfium/1928
  230. metadata = pdfium_c.FPDF_IMAGEOBJ_METADATA()
  231. ok = pdfium_c.FPDFImageObj_GetImageMetadata(self, self.page, metadata)
  232. if not ok:
  233. raise PdfiumError("Failed to get image metadata.")
  234. return metadata
  235. def get_px_size(self):
  236. """
  237. Returns:
  238. (int, int): Image dimensions as a tuple of (width, height).
  239. """
  240. # https://pdfium-review.googlesource.com/c/pdfium/+/106290
  241. w, h = c_uint(), c_uint()
  242. ok = pdfium_c.FPDFImageObj_GetImagePixelSize(self, w, h)
  243. if not ok:
  244. raise PdfiumError("Failed to get image size.")
  245. return w.value, h.value
  246. def load_jpeg(self, source, pages=None, inline=False, autoclose=True):
  247. """
  248. Set a JPEG as the image object's content.
  249. Parameters:
  250. source (str | pathlib.Path | typing.BinaryIO):
  251. Input JPEG, given as file path or readable byte stream.
  252. pages (list[PdfPage] | None):
  253. If replacing an image, pass in a list of loaded pages that might contain it, to update their cache.
  254. (The same image may be shown multiple times in different transforms across a PDF.)
  255. May be None or an empty sequence if the image is not shared.
  256. inline (bool):
  257. Whether to load the image content into memory. If True, the buffer may be closed after this function call.
  258. Otherwise, the buffer needs to remain open until the PDF is closed.
  259. autoclose (bool):
  260. If the input is a buffer, whether it should be automatically closed once not needed by the PDF anymore.
  261. """
  262. if isinstance(source, (str, Path)):
  263. buffer = open(source, "rb")
  264. autoclose = True
  265. elif pdfium_i.is_stream(source, "r"):
  266. buffer = source
  267. else:
  268. raise ValueError(f"Cannot load JPEG from {source} - not a file path or byte stream.")
  269. bufaccess, to_hold = pdfium_i.get_bufreader(buffer)
  270. loader = pdfium_c.FPDFImageObj_LoadJpegFileInline if inline else \
  271. pdfium_c.FPDFImageObj_LoadJpegFile
  272. c_pages, page_count = pdfium_i.pages_c_array(pages)
  273. ok = loader(c_pages, page_count, self, bufaccess)
  274. if not ok:
  275. raise PdfiumError("Failed to load JPEG into image object.")
  276. if inline:
  277. for data in to_hold:
  278. id(data)
  279. if autoclose:
  280. buffer.close()
  281. else:
  282. self.pdf._data_holder += to_hold
  283. if autoclose:
  284. self.pdf._data_closer.append(buffer)
  285. def set_bitmap(self, bitmap, pages=None):
  286. """
  287. Set a bitmap as the image object's content.
  288. The pixel data will be flate compressed (as of PDFium 5418).
  289. Parameters:
  290. bitmap (PdfBitmap):
  291. The bitmap to inject into the image object.
  292. pages (list[PdfPage] | None):
  293. A list of loaded pages that might contain the image object. See :meth:`.load_jpeg`.
  294. """
  295. c_pages, page_count = pdfium_i.pages_c_array(pages)
  296. ok = pdfium_c.FPDFImageObj_SetBitmap(c_pages, page_count, self, bitmap)
  297. if not ok:
  298. raise PdfiumError("Failed to set image to bitmap.")
  299. def _get_rendered_bitmap(self, scale_to_original):
  300. """ This is a private implementation function. Do not use externally. """
  301. if self.pdf is None:
  302. raise RuntimeError("Cannot get rendered bitmap of loose pageobject.")
  303. if scale_to_original:
  304. # Suggested by pdfium dev Lei Zhang in https://groups.google.com/g/pdfium/c/2czGFBcWHHQ/m/g0wzOJR-BAAJ
  305. px_w, px_h = self.get_px_size()
  306. l, b, r, t = self.get_bounds()
  307. content_w, content_h = abs(r-l), abs(t-b)
  308. # align pixel and content width/height relation if swapped due to rotation (e.g. 90°, 270°)
  309. swap = (px_w < px_h) != (content_w < content_h)
  310. if swap:
  311. px_w, px_h = px_h, px_w
  312. # if the image is squashed/stretched, prefer partial upscaling over partial downscaling (not using separate x/y scaling, so the image will look as in the PDF)
  313. scale_factor = max(px_w/content_w, px_h/content_h)
  314. orig_mat = self.get_matrix()
  315. scaled_mat = orig_mat.scale(scale_factor, scale_factor)
  316. self.set_matrix(scaled_mat)
  317. # logger.debug(
  318. # f"Pixel size: {px_w}, {px_h} (did swap? {swap})\n"
  319. # f"Size in page coords: {content_w}, {content_h}\n"
  320. # f"Scale: {scale_factor}\n"
  321. # f"Current matrix: {orig_mat}\n"
  322. # f"Scaled matrix: {scaled_mat}"
  323. # )
  324. try:
  325. raw_bitmap = pdfium_c.FPDFImageObj_GetRenderedBitmap(self.pdf, self.page, self)
  326. finally:
  327. if scale_to_original:
  328. self.set_matrix(orig_mat)
  329. return raw_bitmap
  330. def get_bitmap(self, render=False, scale_to_original=True):
  331. """
  332. Get a bitmap rasterization of the image.
  333. Parameters:
  334. render (bool):
  335. Whether the image should be rendered, thereby applying possible transform matrices and alpha masks.
  336. scale_to_original (bool):
  337. If *render* is True, whether to temporarily scale the image to its native resolution, or close to that (defaults to True). This should improve output quality. Ignored if *render* is False.
  338. Returns:
  339. PdfBitmap: Image bitmap (with a buffer allocated by PDFium).
  340. """
  341. if render:
  342. raw_bitmap = self._get_rendered_bitmap(scale_to_original)
  343. else:
  344. raw_bitmap = pdfium_c.FPDFImageObj_GetBitmap(self)
  345. if not raw_bitmap:
  346. raise PdfiumError(f"Failed to get bitmap of image {self}.")
  347. bitmap = PdfBitmap.from_raw(raw_bitmap)
  348. if render and scale_to_original:
  349. logger.debug(f"Extracted size: {bitmap.width}, {bitmap.height}")
  350. return bitmap
  351. def get_data(self, decode_simple=False):
  352. """
  353. Parameters:
  354. decode_simple (bool):
  355. If True, decode simple filters (see :attr:`.SIMPLE_FILTERS`), so only complex filters will remain, if any. If there are no complex filters, this provides the decoded pixel data.
  356. If False, the raw stream data will be returned instead.
  357. Returns:
  358. ctypes.Array: The data of the image stream (as :class:`~ctypes.c_ubyte` array).
  359. """
  360. func = pdfium_c.FPDFImageObj_GetImageDataDecoded if decode_simple else \
  361. pdfium_c.FPDFImageObj_GetImageDataRaw
  362. n_bytes = func(self, None, 0)
  363. buffer = (ctypes.c_ubyte * n_bytes)()
  364. func(self, buffer, n_bytes)
  365. return buffer
  366. def get_filters(self, skip_simple=False):
  367. """
  368. Parameters:
  369. skip_simple (bool):
  370. If True, exclude simple filters.
  371. Returns:
  372. list[str]: A list of image filters, to be applied in order (from lowest to highest index).
  373. """
  374. filters = []
  375. count = pdfium_c.FPDFImageObj_GetImageFilterCount(self)
  376. for i in range(count):
  377. length = pdfium_c.FPDFImageObj_GetImageFilter(self, i, None, 0)
  378. buffer = ctypes.create_string_buffer(length)
  379. pdfium_c.FPDFImageObj_GetImageFilter(self, i, buffer, length)
  380. f = buffer.value.decode("utf-8")
  381. filters.append(f)
  382. if skip_simple:
  383. filters = [f for f in filters if f not in self.SIMPLE_FILTERS]
  384. return filters
  385. def extract(self, dest, *args, **kwargs):
  386. """
  387. Extract the image into an independently usable file or byte stream, attempting to avoid re-encoding or quality loss, as far as pdfium's limited API permits.
  388. This method can only extract DCTDecode (JPEG) and JPXDecode (JPEG 2000) images directly.
  389. Otherwise, the pixel data is decoded and re-encoded using :mod:`PIL`, which is slower and loses the original encoding.
  390. For images with simple filters only, ``get_data(decode_simple=True)`` is used to preserve higher bit depth or special color formats not supported by ``FPDF_BITMAP``.
  391. For images with complex filters other than those extracted directly, we have to resort to :meth:`.get_bitmap`.
  392. Note, this method is not able to account for alpha masks, and potentially other data stored separately of the main image stream, which might lead to incorrect representation of the image.
  393. Tip:
  394. The ``pikepdf`` library is capable of preserving the original encoding in many cases where this method is not.
  395. Parameters:
  396. dest (str | pathlib.Path | io.BytesIO):
  397. File path prefix or byte stream to which the image shall be written.
  398. fb_format (str):
  399. The image format to use in case it is necessary to (re-)encode the data.
  400. """
  401. # https://crbug.com/pdfium/1930
  402. extraction_gen = _extract_smart(self, *args, **kwargs)
  403. format = next(extraction_gen)
  404. if isinstance(dest, (str, Path)):
  405. with open(f"{dest}.{format}", "wb") as buf:
  406. extraction_gen.send(buf)
  407. elif pdfium_i.is_stream(dest, "w"):
  408. extraction_gen.send(dest)
  409. else:
  410. raise ValueError(f"Cannot extract to '{dest}'")
  411. _ImageInfo = namedtuple("_ImageInfo", "format mode metadata all_filters complex_filters")
  412. class _ImageExtractionError (Exception):
  413. pass
  414. def _get_pil_mode(cs, bpp):
  415. # As of Jan 2025, pdfium does not provide access to the palette, so we cannot handle indexed (palettized) color space.
  416. # TODO handle ICC-based color spaces (pdfium now provides access to the ICC profile via FPDFImageObj_GetIccProfileDataDecoded(), see commit edd7c5cf)
  417. if cs == pdfium_c.FPDF_COLORSPACE_DEVICEGRAY:
  418. return "1" if bpp == 1 else "L"
  419. elif cs == pdfium_c.FPDF_COLORSPACE_DEVICERGB:
  420. return "RGB"
  421. elif cs == pdfium_c.FPDF_COLORSPACE_DEVICECMYK:
  422. return "CMYK"
  423. else:
  424. return None
  425. def _extract_smart(image_obj, fb_format=None):
  426. try:
  427. # TODO can we change PdfImage.get_data() to take an mmap, so the data could be written directly into a file rather than an in-memory array?
  428. data, info = _extract_direct(image_obj)
  429. except _ImageExtractionError as e:
  430. logger.debug(str(e))
  431. pil_image = image_obj.get_bitmap(render=False).to_pil()
  432. else:
  433. pil_image = None
  434. format = info.format
  435. if format == "raw":
  436. metadata = info.metadata
  437. pil_image = Lazy.PIL_Image.frombuffer(
  438. info.mode,
  439. (metadata.width, metadata.height),
  440. image_obj.get_data(decode_simple=True),
  441. "raw", info.mode, 0, 1,
  442. )
  443. if pil_image:
  444. format = fb_format
  445. if not format:
  446. format = "tiff" if pil_image.mode == "CMYK" else "png"
  447. buffer = yield format
  448. if pil_image:
  449. pil_image.save(buffer, format=format)
  450. else:
  451. buffer.write(data)
  452. yield # breakpoint preventing StopIteration on .send()
  453. def _extract_direct(image_obj):
  454. all_filters = image_obj.get_filters()
  455. complex_filters = [f for f in all_filters if f not in PdfImage.SIMPLE_FILTERS]
  456. metadata = image_obj.get_metadata()
  457. mode = _get_pil_mode(metadata.colorspace, metadata.bits_per_pixel)
  458. if len(complex_filters) == 0:
  459. if mode:
  460. out_data = image_obj.get_data(decode_simple=True)
  461. out_format = "raw"
  462. else:
  463. raise _ImageExtractionError(f"Unhandled color space {pdfium_i.ColorspaceToStr.get(metadata.colorspace)} - don't know how to treat data.")
  464. elif len(complex_filters) == 1:
  465. f = complex_filters[0]
  466. if f == "DCTDecode":
  467. out_data = image_obj.get_data(decode_simple=True)
  468. out_format = "jpg"
  469. elif f == "JPXDecode":
  470. out_data = image_obj.get_data(decode_simple=True)
  471. out_format = "jp2"
  472. else:
  473. raise _ImageExtractionError(f"Unhandled complex filter {f}.")
  474. else:
  475. raise _ImageExtractionError(f"Cannot handle multiple complex filters {complex_filters}.")
  476. info = _ImageInfo(out_format, mode, metadata, all_filters, complex_filters)
  477. return out_data, info