parsers.py 2.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. """
  2. pdf2image custom buffer parsers
  3. """
  4. from io import BytesIO
  5. from typing import List
  6. from PIL import Image
  7. def parse_buffer_to_ppm(data: bytes) -> List[Image.Image]:
  8. """Parse PPM file bytes to Pillow Image
  9. :param data: pdftoppm/pdftocairo output bytes
  10. :type data: bytes
  11. :return: List of PPM images parsed from the output
  12. :rtype: List[Image.Image]
  13. """
  14. images = []
  15. index = 0
  16. while index < len(data):
  17. code, size, rgb = tuple(data[index : index + 40].split(b"\n")[0:3])
  18. size_x, size_y = tuple(size.split(b" "))
  19. file_size = len(code) + len(size) + len(rgb) + 3 + int(size_x) * int(size_y) * 3
  20. images.append(Image.open(BytesIO(data[index : index + file_size])))
  21. index += file_size
  22. return images
  23. def parse_buffer_to_pgm(data: bytes) -> List[Image.Image]:
  24. """Parse PGM file bytes to Pillow Image
  25. :param data: pdftoppm/pdftocairo output bytes
  26. :type data: bytes
  27. :return: List of PGM images parsed from the output
  28. :rtype: List[Image.Image]
  29. """
  30. images = []
  31. index = 0
  32. while index < len(data):
  33. code, size, maxval = tuple(data[index : index + 40].split(b"\n")[0:3])
  34. size_x, size_y = tuple(size.split(b" "))
  35. file_size = len(code) + len(size) + len(maxval) + 3 + int(size_x) * int(size_y)
  36. images.append(Image.open(BytesIO(data[index : index + file_size])))
  37. index += file_size
  38. return images
  39. def parse_buffer_to_jpeg(data: bytes) -> List[Image.Image]:
  40. """Parse JPEG file bytes to Pillow Image
  41. :param data: pdftoppm/pdftocairo output bytes
  42. :type data: bytes
  43. :return: List of JPEG images parsed from the output
  44. :rtype: List[Image.Image]
  45. """
  46. return [
  47. Image.open(BytesIO(image_data + b"\xff\xd9"))
  48. for image_data in data.split(b"\xff\xd9")[
  49. :-1
  50. ] # Last element is obviously empty
  51. ]
  52. def parse_buffer_to_png(data: bytes) -> List[Image.Image]:
  53. """Parse PNG file bytes to Pillow Image
  54. :param data: pdftoppm/pdftocairo output bytes
  55. :type data: bytes
  56. :return: List of PNG images parsed from the output
  57. :rtype: List[Image.Image]
  58. """
  59. images = []
  60. c1 = 0
  61. c2 = 0
  62. data_len = len(data)
  63. while c1 < data_len:
  64. # IEND can appear in a PNG without being the actual end
  65. if data[c2 : c2 + 4] == b"IEND" and (
  66. c2 + 8 == data_len or data[c2 + 9 : c2 + 12] == b"PNG"
  67. ):
  68. images.append(Image.open(BytesIO(data[c1 : c2 + 8])))
  69. c1 = c2 + 8
  70. c2 = c1
  71. c2 += 1
  72. return images