"""
pdf2image is a light wrapper for the poppler-utils tools that can convert your
PDFs into Pillow images.
"""
import os
import platform
import shutil
import subprocess
import tempfile
import types
from pathlib import PurePath
from subprocess import PIPE, Popen, TimeoutExpired
from typing import Any, Callable, Dict, List, Tuple, Union

from PIL import Image

from pdf2image.exceptions import (
    PDFInfoNotInstalledError,
    PDFPageCountError,
    PDFPopplerTimeoutError,
    PDFSyntaxError,
)
from pdf2image.generators import ThreadSafeGenerator, counter_generator, uuid_generator
from pdf2image.parsers import (
    parse_buffer_to_jpeg,
    parse_buffer_to_pgm,
    parse_buffer_to_png,
    parse_buffer_to_ppm,
)
  28. TRANSPARENT_FILE_TYPES = ["png", "tiff"]
  29. PDFINFO_CONVERT_TO_INT = ["Pages"]
  30. def convert_from_path(
  31. pdf_path: Union[str, PurePath],
  32. dpi: int = 200,
  33. output_folder: Union[str, PurePath] = None,
  34. first_page: int = None,
  35. last_page: int = None,
  36. fmt: str = "ppm",
  37. jpegopt: Dict = None,
  38. thread_count: int = 1,
  39. userpw: str = None,
  40. ownerpw: str = None,
  41. use_cropbox: bool = False,
  42. strict: bool = False,
  43. transparent: bool = False,
  44. single_file: bool = False,
  45. output_file: Any = uuid_generator(),
  46. poppler_path: Union[str, PurePath] = None,
  47. grayscale: bool = False,
  48. size: Union[Tuple, int] = None,
  49. paths_only: bool = False,
  50. use_pdftocairo: bool = False,
  51. timeout: int = None,
  52. hide_annotations: bool = False,
  53. ) -> List[Image.Image]:
  54. """Function wrapping pdftoppm and pdftocairo
  55. :param pdf_path: Path to the PDF that you want to convert
  56. :type pdf_path: Union[str, PurePath]
  57. :param dpi: Image quality in DPI (default 200), defaults to 200
  58. :type dpi: int, optional
  59. :param output_folder: Write the resulting images to a folder (instead of directly in memory), defaults to None
  60. :type output_folder: Union[str, PurePath], optional
  61. :param first_page: First page to process, defaults to None
  62. :type first_page: int, optional
  63. :param last_page: Last page to process before stopping, defaults to None
  64. :type last_page: int, optional
  65. :param fmt: Output image format, defaults to "ppm"
  66. :type fmt: str, optional
  67. :param jpegopt: jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format), defaults to None
  68. :type jpegopt: Dict, optional
  69. :param thread_count: How many threads we are allowed to spawn for processing, defaults to 1
  70. :type thread_count: int, optional
  71. :param userpw: PDF's password, defaults to None
  72. :type userpw: str, optional
  73. :param ownerpw: PDF's owner password, defaults to None
  74. :type ownerpw: str, optional
  75. :param use_cropbox: Use cropbox instead of mediabox, defaults to False
  76. :type use_cropbox: bool, optional
  77. :param strict: When a Syntax Error is thrown, it will be raised as an Exception, defaults to False
  78. :type strict: bool, optional
  79. :param transparent: Output with a transparent background instead of a white one, defaults to False
  80. :type transparent: bool, optional
  81. :param single_file: Uses the -singlefile option from pdftoppm/pdftocairo, defaults to False
  82. :type single_file: bool, optional
  83. :param output_file: What is the output filename or generator, defaults to uuid_generator()
  84. :type output_file: Any, optional
  85. :param poppler_path: Path to look for poppler binaries, defaults to None
  86. :type poppler_path: Union[str, PurePath], optional
  87. :param grayscale: Output grayscale image(s), defaults to False
  88. :type grayscale: bool, optional
  89. :param size: Size of the resulting image(s), uses the Pillow (width, height) standard, defaults to None
  90. :type size: Union[Tuple, int], optional
  91. :param paths_only: Don't load image(s), return paths instead (requires output_folder), defaults to False
  92. :type paths_only: bool, optional
  93. :param use_pdftocairo: Use pdftocairo instead of pdftoppm, may help performance, defaults to False
  94. :type use_pdftocairo: bool, optional
  95. :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None
  96. :type timeout: int, optional
  97. :param hide_annotations: Hide PDF annotations in the output, defaults to False
  98. :type hide_annotations: bool, optional
  99. :raises NotImplementedError: Raised when conflicting parameters are given (hide_annotations for pdftocairo)
  100. :raises PDFPopplerTimeoutError: Raised after the timeout for the image processing is exceeded
  101. :raises PDFSyntaxError: Raised if there is a syntax error in the PDF and strict=True
  102. :return: A list of Pillow images, one for each page between first_page and last_page
  103. :rtype: List[Image.Image]
  104. """
  105. if use_pdftocairo and fmt == "ppm":
  106. fmt = "png"
  107. # We make sure that if passed arguments are Path objects, they're converted to strings
  108. if isinstance(pdf_path, PurePath):
  109. pdf_path = pdf_path.as_posix()
  110. if isinstance(output_folder, PurePath):
  111. output_folder = output_folder.as_posix()
  112. if isinstance(poppler_path, PurePath):
  113. poppler_path = poppler_path.as_posix()
  114. page_count = pdfinfo_from_path(
  115. pdf_path, userpw, ownerpw, poppler_path=poppler_path
  116. )["Pages"]
  117. # We start by getting the output format, the buffer processing function and if we need pdftocairo
  118. parsed_fmt, final_extension, parse_buffer_func, use_pdfcairo_format = _parse_format(
  119. fmt, grayscale
  120. )
  121. # We use pdftocairo is the format requires it OR we need a transparent output
  122. use_pdfcairo = (
  123. use_pdftocairo
  124. or use_pdfcairo_format
  125. or (transparent and parsed_fmt in TRANSPARENT_FILE_TYPES)
  126. )
  127. poppler_version_major, poppler_version_minor = _get_poppler_version(
  128. "pdftocairo" if use_pdfcairo else "pdftoppm", poppler_path=poppler_path
  129. )
  130. if poppler_version_major == 0 and poppler_version_minor <= 57:
  131. jpegopt = None
  132. if poppler_version_major == 0 and poppler_version_minor <= 83:
  133. hide_annotations = False
  134. # If output_file isn't a generator, it will be turned into one
  135. if not isinstance(output_file, types.GeneratorType) and not isinstance(
  136. output_file, ThreadSafeGenerator
  137. ):
  138. if single_file:
  139. output_file = iter([output_file])
  140. thread_count = 1
  141. else:
  142. output_file = counter_generator(output_file)
  143. if thread_count < 1:
  144. thread_count = 1
  145. if first_page is None or first_page < 1:
  146. first_page = 1
  147. if last_page is None or last_page > page_count:
  148. last_page = page_count
  149. if first_page > last_page:
  150. return []
  151. try:
  152. auto_temp_dir = False
  153. if output_folder is None and use_pdfcairo:
  154. output_folder = tempfile.mkdtemp()
  155. auto_temp_dir = True
  156. # Recalculate page count based on first and last page
  157. page_count = last_page - first_page + 1
  158. if thread_count > page_count:
  159. thread_count = page_count
  160. reminder = page_count % thread_count
  161. current_page = first_page
  162. processes = []
  163. for _ in range(thread_count):
  164. thread_output_file = next(output_file)
  165. # Get the number of pages the thread will be processing
  166. thread_page_count = page_count // thread_count + int(reminder > 0)
  167. # Build the command accordingly
  168. args = _build_command(
  169. ["-r", str(dpi), pdf_path],
  170. output_folder,
  171. current_page,
  172. current_page + thread_page_count - 1,
  173. parsed_fmt,
  174. jpegopt,
  175. thread_output_file,
  176. userpw,
  177. ownerpw,
  178. use_cropbox,
  179. transparent,
  180. single_file,
  181. grayscale,
  182. size,
  183. hide_annotations,
  184. )
  185. if use_pdfcairo:
  186. if hide_annotations:
  187. raise NotImplementedError(
  188. "Hide annotations flag not implemented in pdftocairo."
  189. )
  190. args = [_get_command_path("pdftocairo", poppler_path)] + args
  191. else:
  192. args = [_get_command_path("pdftoppm", poppler_path)] + args
  193. # Update page values
  194. current_page = current_page + thread_page_count
  195. reminder -= int(reminder > 0)
  196. # Add poppler path to LD_LIBRARY_PATH
  197. env = os.environ.copy()
  198. if poppler_path is not None:
  199. env["LD_LIBRARY_PATH"] = (
  200. poppler_path + ":" + env.get("LD_LIBRARY_PATH", "")
  201. )
  202. # Spawn the process and save its uuid
  203. startupinfo = None
  204. if platform.system() == "Windows":
  205. # this startupinfo structure prevents a console window from popping up on Windows
  206. startupinfo = subprocess.STARTUPINFO()
  207. startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
  208. processes.append(
  209. (
  210. thread_output_file,
  211. Popen(
  212. args, env=env, stdout=PIPE, stderr=PIPE, startupinfo=startupinfo
  213. ),
  214. )
  215. )
  216. images = []
  217. for uid, proc in processes:
  218. try:
  219. data, err = proc.communicate(timeout=timeout)
  220. except TimeoutExpired:
  221. proc.kill()
  222. outs, errs = proc.communicate()
  223. raise PDFPopplerTimeoutError("Run poppler timeout.")
  224. if b"Syntax Error" in err and strict:
  225. raise PDFSyntaxError(err.decode("utf8", "ignore"))
  226. if output_folder is not None:
  227. images += _load_from_output_folder(
  228. output_folder,
  229. uid,
  230. final_extension,
  231. paths_only,
  232. in_memory=auto_temp_dir,
  233. )
  234. else:
  235. images += parse_buffer_func(data)
  236. finally:
  237. if auto_temp_dir:
  238. shutil.rmtree(output_folder)
  239. return images
  240. def convert_from_bytes(
  241. pdf_file: bytes,
  242. dpi: int = 200,
  243. output_folder: Union[str, PurePath] = None,
  244. first_page: int = None,
  245. last_page: int = None,
  246. fmt: str = "ppm",
  247. jpegopt: Dict = None,
  248. thread_count: int = 1,
  249. userpw: str = None,
  250. ownerpw: str = None,
  251. use_cropbox: bool = False,
  252. strict: bool = False,
  253. transparent: bool = False,
  254. single_file: bool = False,
  255. output_file: Union[str, PurePath] = uuid_generator(),
  256. poppler_path: Union[str, PurePath] = None,
  257. grayscale: bool = False,
  258. size: Union[Tuple, int] = None,
  259. paths_only: bool = False,
  260. use_pdftocairo: bool = False,
  261. timeout: int = None,
  262. hide_annotations: bool = False,
  263. ) -> List[Image.Image]:
  264. """Function wrapping pdftoppm and pdftocairo.
  265. :param pdf_bytes: Bytes of the PDF that you want to convert
  266. :type pdf_bytes: bytes
  267. :param dpi: Image quality in DPI (default 200), defaults to 200
  268. :type dpi: int, optional
  269. :param output_folder: Write the resulting images to a folder (instead of directly in memory), defaults to None
  270. :type output_folder: Union[str, PurePath], optional
  271. :param first_page: First page to process, defaults to None
  272. :type first_page: int, optional
  273. :param last_page: Last page to process before stopping, defaults to None
  274. :type last_page: int, optional
  275. :param fmt: Output image format, defaults to "ppm"
  276. :type fmt: str, optional
  277. :param jpegopt: jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format), defaults to None
  278. :type jpegopt: Dict, optional
  279. :param thread_count: How many threads we are allowed to spawn for processing, defaults to 1
  280. :type thread_count: int, optional
  281. :param userpw: PDF's password, defaults to None
  282. :type userpw: str, optional
  283. :param ownerpw: PDF's owner password, defaults to None
  284. :type ownerpw: str, optional
  285. :param use_cropbox: Use cropbox instead of mediabox, defaults to False
  286. :type use_cropbox: bool, optional
  287. :param strict: When a Syntax Error is thrown, it will be raised as an Exception, defaults to False
  288. :type strict: bool, optional
  289. :param transparent: Output with a transparent background instead of a white one, defaults to False
  290. :type transparent: bool, optional
  291. :param single_file: Uses the -singlefile option from pdftoppm/pdftocairo, defaults to False
  292. :type single_file: bool, optional
  293. :param output_file: What is the output filename or generator, defaults to uuid_generator()
  294. :type output_file: Any, optional
  295. :param poppler_path: Path to look for poppler binaries, defaults to None
  296. :type poppler_path: Union[str, PurePath], optional
  297. :param grayscale: Output grayscale image(s), defaults to False
  298. :type grayscale: bool, optional
  299. :param size: Size of the resulting image(s), uses the Pillow (width, height) standard, defaults to None
  300. :type size: Union[Tuple, int], optional
  301. :param paths_only: Don't load image(s), return paths instead (requires output_folder), defaults to False
  302. :type paths_only: bool, optional
  303. :param use_pdftocairo: Use pdftocairo instead of pdftoppm, may help performance, defaults to False
  304. :type use_pdftocairo: bool, optional
  305. :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None
  306. :type timeout: int, optional
  307. :param hide_annotations: Hide PDF annotations in the output, defaults to False
  308. :type hide_annotations: bool, optional
  309. :raises NotImplementedError: Raised when conflicting parameters are given (hide_annotations for pdftocairo)
  310. :raises PDFPopplerTimeoutError: Raised after the timeout for the image processing is exceeded
  311. :raises PDFSyntaxError: Raised if there is a syntax error in the PDF and strict=True
  312. :return: A list of Pillow images, one for each page between first_page and last_page
  313. :rtype: List[Image.Image]
  314. """
  315. fh, temp_filename = tempfile.mkstemp()
  316. try:
  317. with open(temp_filename, "wb") as f:
  318. f.write(pdf_file)
  319. f.flush()
  320. return convert_from_path(
  321. f.name,
  322. dpi=dpi,
  323. output_folder=output_folder,
  324. first_page=first_page,
  325. last_page=last_page,
  326. fmt=fmt,
  327. jpegopt=jpegopt,
  328. thread_count=thread_count,
  329. userpw=userpw,
  330. ownerpw=ownerpw,
  331. use_cropbox=use_cropbox,
  332. strict=strict,
  333. transparent=transparent,
  334. single_file=single_file,
  335. output_file=output_file,
  336. poppler_path=poppler_path,
  337. grayscale=grayscale,
  338. size=size,
  339. paths_only=paths_only,
  340. use_pdftocairo=use_pdftocairo,
  341. timeout=timeout,
  342. hide_annotations=hide_annotations,
  343. )
  344. finally:
  345. os.close(fh)
  346. os.remove(temp_filename)
  347. def _build_command(
  348. args: List,
  349. output_folder: str,
  350. first_page: int,
  351. last_page: int,
  352. fmt: str,
  353. jpegopt: Dict,
  354. output_file: str,
  355. userpw: str,
  356. ownerpw: str,
  357. use_cropbox: bool,
  358. transparent: bool,
  359. single_file: bool,
  360. grayscale: bool,
  361. size: Union[int, Tuple[int, int]],
  362. hide_annotations: bool,
  363. ) -> List[str]:
  364. if use_cropbox:
  365. args.append("-cropbox")
  366. if hide_annotations:
  367. args.append("-hide-annotations")
  368. if transparent and fmt in TRANSPARENT_FILE_TYPES:
  369. args.append("-transp")
  370. if first_page is not None:
  371. args.extend(["-f", str(first_page)])
  372. if last_page is not None:
  373. args.extend(["-l", str(last_page)])
  374. if fmt not in ["pgm", "ppm"]:
  375. args.append("-" + fmt)
  376. if fmt in ["jpeg", "jpg"] and jpegopt:
  377. args.extend(["-jpegopt", _parse_jpegopt(jpegopt)])
  378. if single_file:
  379. args.append("-singlefile")
  380. if output_folder is not None:
  381. args.append(os.path.join(output_folder, output_file))
  382. if userpw is not None:
  383. args.extend(["-upw", userpw])
  384. if ownerpw is not None:
  385. args.extend(["-opw", ownerpw])
  386. if grayscale:
  387. args.append("-gray")
  388. if size is None:
  389. pass
  390. elif isinstance(size, tuple) and len(size) == 2:
  391. if size[0] is not None:
  392. args.extend(["-scale-to-x", str(int(size[0]))])
  393. else:
  394. args.extend(["-scale-to-x", str(-1)])
  395. if size[1] is not None:
  396. args.extend(["-scale-to-y", str(int(size[1]))])
  397. else:
  398. args.extend(["-scale-to-y", str(-1)])
  399. elif isinstance(size, tuple) and len(size) == 1:
  400. args.extend(["-scale-to", str(int(size[0]))])
  401. elif isinstance(size, int) or isinstance(size, float):
  402. args.extend(["-scale-to", str(int(size))])
  403. else:
  404. raise ValueError(f"Size {size} is not a tuple or an integer")
  405. return args
  406. def _parse_format(fmt: str, grayscale: bool = False) -> Tuple[str, str, Callable, bool]:
  407. fmt = fmt.lower()
  408. if fmt[0] == ".":
  409. fmt = fmt[1:]
  410. if fmt in ("jpeg", "jpg"):
  411. return "jpeg", "jpg", parse_buffer_to_jpeg, False
  412. if fmt == "png":
  413. return "png", "png", parse_buffer_to_png, False
  414. if fmt in ("tif", "tiff"):
  415. return "tiff", "tif", None, True
  416. if fmt == "ppm" and grayscale:
  417. return "pgm", "pgm", parse_buffer_to_pgm, False
  418. # Unable to parse the format so we'll use the default
  419. return "ppm", "ppm", parse_buffer_to_ppm, False
  420. def _parse_jpegopt(jpegopt: Dict) -> str:
  421. parts = []
  422. for k, v in jpegopt.items():
  423. if v is True:
  424. v = "y"
  425. if v is False:
  426. v = "n"
  427. parts.append("{}={}".format(k, v))
  428. return ",".join(parts)
  429. def _get_command_path(command: str, poppler_path: str = None) -> str:
  430. if platform.system() == "Windows":
  431. command = command + ".exe"
  432. if poppler_path is not None:
  433. command = os.path.join(poppler_path, command)
  434. return command
  435. def _get_poppler_version(
  436. command: str, poppler_path: str = None, timeout: int = None
  437. ) -> Tuple[int, int]:
  438. command = [_get_command_path(command, poppler_path), "-v"]
  439. env = os.environ.copy()
  440. if poppler_path is not None:
  441. env["LD_LIBRARY_PATH"] = poppler_path + ":" + env.get("LD_LIBRARY_PATH", "")
  442. proc = Popen(command, env=env, stdout=PIPE, stderr=PIPE)
  443. try:
  444. data, err = proc.communicate(timeout=timeout)
  445. except TimeoutExpired:
  446. proc.kill()
  447. outs, errs = proc.communicate()
  448. raise PDFPopplerTimeoutError("Run poppler poppler timeout.")
  449. try:
  450. # TODO: Make this more robust
  451. version = err.decode("utf8", "ignore").split("\n")[0].split(" ")[-1].split(".")
  452. return int(version[0]), int(version[1])
  453. except:
  454. # Lowest version that includes pdftocairo (2011)
  455. return 0, 17
  456. def pdfinfo_from_path(
  457. pdf_path: str,
  458. userpw: str = None,
  459. ownerpw: str = None,
  460. poppler_path: str = None,
  461. rawdates: bool = False,
  462. timeout: int = None,
  463. first_page: int = None,
  464. last_page: int = None,
  465. ) -> Dict:
  466. """Function wrapping poppler's pdfinfo utility and returns the result as a dictionary.
  467. :param pdf_path: Path to the PDF that you want to convert
  468. :type pdf_path: str
  469. :param userpw: PDF's password, defaults to None
  470. :type userpw: str, optional
  471. :param ownerpw: PDF's owner password, defaults to None
  472. :type ownerpw: str, optional
  473. :param poppler_path: Path to look for poppler binaries, defaults to None
  474. :type poppler_path: Union[str, PurePath], optional
  475. :param rawdates: Return the undecoded data strings, defaults to False
  476. :type rawdates: bool, optional
  477. :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None
  478. :type timeout: int, optional
  479. :param first_page: First page to process, defaults to None
  480. :type first_page: int, optional
  481. :param last_page: Last page to process before stopping, defaults to None
  482. :type last_page: int, optional
  483. :raises PDFPopplerTimeoutError: Raised after the timeout for the image processing is exceeded
  484. :raises PDFInfoNotInstalledError: Raised if pdfinfo is not installed
  485. :raises PDFPageCountError: Raised if the output could not be parsed
  486. :return: Dictionary containing various information on the PDF
  487. :rtype: Dict
  488. """
  489. try:
  490. command = [_get_command_path("pdfinfo", poppler_path), pdf_path]
  491. if userpw is not None:
  492. command.extend(["-upw", userpw])
  493. if ownerpw is not None:
  494. command.extend(["-opw", ownerpw])
  495. if rawdates:
  496. command.extend(["-rawdates"])
  497. if first_page:
  498. command.extend(["-f", str(first_page)])
  499. if last_page:
  500. command.extend(["-l", str(last_page)])
  501. # Add poppler path to LD_LIBRARY_PATH
  502. env = os.environ.copy()
  503. if poppler_path is not None:
  504. env["LD_LIBRARY_PATH"] = poppler_path + ":" + env.get("LD_LIBRARY_PATH", "")
  505. proc = Popen(command, env=env, stdout=PIPE, stderr=PIPE)
  506. try:
  507. out, err = proc.communicate(timeout=timeout)
  508. except TimeoutExpired:
  509. proc.kill()
  510. outs, errs = proc.communicate()
  511. raise PDFPopplerTimeoutError("Run poppler poppler timeout.")
  512. d = {}
  513. for field in out.decode("utf8", "ignore").split("\n"):
  514. sf = field.split(":")
  515. key, value = sf[0], ":".join(sf[1:])
  516. if key != "":
  517. d[key] = (
  518. int(value.strip())
  519. if key in PDFINFO_CONVERT_TO_INT
  520. else value.strip()
  521. )
  522. if "Pages" not in d:
  523. raise ValueError
  524. return d
  525. except OSError:
  526. raise PDFInfoNotInstalledError(
  527. "Unable to get page count. Is poppler installed and in PATH?"
  528. )
  529. except ValueError:
  530. raise PDFPageCountError(
  531. f"Unable to get page count.\n{err.decode('utf8', 'ignore')}"
  532. )
  533. def pdfinfo_from_bytes(
  534. pdf_bytes: bytes,
  535. userpw: str = None,
  536. ownerpw: str = None,
  537. poppler_path: str = None,
  538. rawdates: bool = False,
  539. timeout: int = None,
  540. first_page: int = None,
  541. last_page: int = None,
  542. ) -> Dict:
  543. """Function wrapping poppler's pdfinfo utility and returns the result as a dictionary.
  544. :param pdf_bytes: Bytes of the PDF that you want to convert
  545. :type pdf_bytes: bytes
  546. :param userpw: PDF's password, defaults to None
  547. :type userpw: str, optional
  548. :param ownerpw: PDF's owner password, defaults to None
  549. :type ownerpw: str, optional
  550. :param poppler_path: Path to look for poppler binaries, defaults to None
  551. :type poppler_path: Union[str, PurePath], optional
  552. :param rawdates: Return the undecoded data strings, defaults to False
  553. :type rawdates: bool, optional
  554. :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None
  555. :type timeout: int, optional
  556. :param first_page: First page to process, defaults to None
  557. :type first_page: int, optional
  558. :param last_page: Last page to process before stopping, defaults to None
  559. :type last_page: int, optional
  560. :return: Dictionary containing various information on the PDF
  561. :rtype: Dict
  562. """
  563. fh, temp_filename = tempfile.mkstemp()
  564. try:
  565. with open(temp_filename, "wb") as f:
  566. f.write(pdf_bytes)
  567. f.flush()
  568. return pdfinfo_from_path(
  569. temp_filename,
  570. userpw=userpw,
  571. ownerpw=ownerpw,
  572. poppler_path=poppler_path,
  573. rawdates=rawdates,
  574. timeout=timeout,
  575. first_page=first_page,
  576. last_page=last_page,
  577. )
  578. finally:
  579. os.close(fh)
  580. os.remove(temp_filename)
  581. def _load_from_output_folder(
  582. output_folder: str,
  583. output_file: str,
  584. ext: str,
  585. paths_only: bool,
  586. in_memory: bool = False,
  587. ) -> List[Image.Image]:
  588. images = []
  589. for f in sorted(os.listdir(output_folder)):
  590. if f.startswith(output_file) and f.split(".")[-1] == ext:
  591. if paths_only:
  592. images.append(os.path.join(output_folder, f))
  593. else:
  594. images.append(Image.open(os.path.join(output_folder, f)))
  595. if in_memory:
  596. images[-1].load()
  597. return images