utils.py 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169
  1. # ------------------------------------------------------------------------
  2. # Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com
  3. # License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html
  4. #
  5. # Part of "PyMuPDF", a Python binding for "MuPDF" (http://mupdf.com), a
  6. # lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is
  7. # maintained and developed by Artifex Software, Inc. https://artifex.com.
  8. # ------------------------------------------------------------------------
  9. import math
  10. import typing
  11. import weakref
  12. try:
  13. from . import pymupdf
  14. except Exception:
  15. import pymupdf
  16. try:
  17. from . import mupdf
  18. except Exception:
  19. import mupdf
# Module-level shortcuts to pymupdf attributes. Both are bound once, when
# this module is imported.
_format_g = pymupdf.format_g
g_exceptions_verbose = pymupdf.g_exceptions_verbose

# Names used in annotations of geometry-like parameters. They are plain
# strings, so they only document intent.
point_like = "point_like"
rect_like = "rect_like"
matrix_like = "matrix_like"
quad_like = "quad_like"

# ByteString is gone from typing in 3.14.
# collections.abc.Buffer available from 3.12 only
try:
    ByteString = typing.ByteString
except AttributeError:
    # Fallback when typing has no ByteString attribute.
    # pylint: disable=unsupported-binary-operation
    ByteString = bytes | bytearray | memoryview

# Aliases for "any" and for "value or None" annotations.
AnyType = typing.Any
OptInt = typing.Union[int, None]
OptFloat = typing.Optional[float]
OptStr = typing.Optional[str]
OptDict = typing.Optional[dict]
OptBytes = typing.Optional[ByteString]
OptSeq = typing.Optional[typing.Sequence]
  40. """
  41. This is a collection of functions to extend PyMuPDF.
  42. """
  43. def get_text_blocks(
  44. page: pymupdf.Page,
  45. clip: rect_like = None,
  46. flags: OptInt = None,
  47. textpage: pymupdf.TextPage = None,
  48. sort: bool = False,
  49. ) -> list:
  50. """Return the text blocks on a page.
  51. Notes:
  52. Lines in a block are concatenated with line breaks.
  53. Args:
  54. flags: (int) control the amount of data parsed into the textpage.
  55. Returns:
  56. A list of the blocks. Each item contains the containing rectangle
  57. coordinates, text lines, running block number and block type.
  58. """
  59. pymupdf.CheckParent(page)
  60. if flags is None:
  61. flags = pymupdf.TEXTFLAGS_BLOCKS
  62. tp = textpage
  63. if tp is None:
  64. tp = page.get_textpage(clip=clip, flags=flags)
  65. elif getattr(tp, "parent") != page:
  66. raise ValueError("not a textpage of this page")
  67. blocks = tp.extractBLOCKS()
  68. if textpage is None:
  69. del tp
  70. if sort:
  71. blocks.sort(key=lambda b: (b[3], b[0]))
  72. return blocks
  73. def get_text_words(
  74. page: pymupdf.Page,
  75. clip: rect_like = None,
  76. flags: OptInt = None,
  77. textpage: pymupdf.TextPage = None,
  78. sort: bool = False,
  79. delimiters=None,
  80. tolerance=3,
  81. ) -> list:
  82. """Return the text words as a list with the bbox for each word.
  83. Args:
  84. page: pymupdf.Page
  85. clip: (rect-like) area on page to consider
  86. flags: (int) control the amount of data parsed into the textpage.
  87. textpage: (pymupdf.TextPage) either passed-in or None.
  88. sort: (bool) sort the words in reading sequence.
  89. delimiters: (str,list) characters to use as word delimiters.
  90. tolerance: (float) consider words to be part of the same line if
  91. top or bottom coordinate are not larger than this. Relevant
  92. only if sort=True.
  93. Returns:
  94. Word tuples (x0, y0, x1, y1, "word", bno, lno, wno).
  95. """
  96. def sort_words(words):
  97. """Sort words line-wise, forgiving small deviations."""
  98. words.sort(key=lambda w: (w[3], w[0]))
  99. nwords = [] # final word list
  100. line = [words[0]] # collects words roughly in same line
  101. lrect = pymupdf.Rect(words[0][:4]) # start the line rectangle
  102. for w in words[1:]:
  103. wrect = pymupdf.Rect(w[:4])
  104. if (
  105. abs(wrect.y0 - lrect.y0) <= tolerance
  106. or abs(wrect.y1 - lrect.y1) <= tolerance
  107. ):
  108. line.append(w)
  109. lrect |= wrect
  110. else:
  111. line.sort(key=lambda w: w[0]) # sort words in line l-t-r
  112. nwords.extend(line) # append to final words list
  113. line = [w] # start next line
  114. lrect = wrect # start next line rect
  115. line.sort(key=lambda w: w[0]) # sort words in line l-t-r
  116. nwords.extend(line) # append to final words list
  117. return nwords
  118. pymupdf.CheckParent(page)
  119. if flags is None:
  120. flags = pymupdf.TEXTFLAGS_WORDS
  121. tp = textpage
  122. if tp is None:
  123. tp = page.get_textpage(clip=clip, flags=flags)
  124. elif getattr(tp, "parent") != page:
  125. raise ValueError("not a textpage of this page")
  126. words = tp.extractWORDS(delimiters)
  127. # if textpage was given, we subselect the words in clip
  128. if textpage is not None and clip is not None:
  129. # sub-select words contained in clip
  130. clip = pymupdf.Rect(clip)
  131. words = [
  132. w for w in words if abs(clip & w[:4]) >= 0.5 * abs(pymupdf.Rect(w[:4]))
  133. ]
  134. if textpage is None:
  135. del tp
  136. if words and sort:
  137. # advanced sort if any words found
  138. words = sort_words(words)
  139. return words
  140. def get_sorted_text(
  141. page: pymupdf.Page,
  142. clip: rect_like = None,
  143. flags: OptInt = None,
  144. textpage: pymupdf.TextPage = None,
  145. tolerance=3,
  146. ) -> str:
  147. """Extract plain text avoiding unacceptable line breaks.
  148. Text contained in clip will be sorted in reading sequence. Some effort
  149. is also spent to simulate layout vertically and horizontally.
  150. Args:
  151. page: pymupdf.Page
  152. clip: (rect-like) only consider text inside
  153. flags: (int) text extraction flags
  154. textpage: pymupdf.TextPage
  155. tolerance: (float) consider words to be on the same line if their top
  156. or bottom coordinates do not differ more than this.
  157. Notes:
  158. If a TextPage is provided, all text is checked for being inside clip
  159. with at least 50% of its bbox.
  160. This allows to use some "global" TextPage in conjunction with sub-
  161. selecting words in parts of the defined TextPage rectangle.
  162. Returns:
  163. A text string in reading sequence. Left indentation of each line,
  164. inter-line and inter-word distances strive to reflect the layout.
  165. """
  166. def line_text(clip, line):
  167. """Create the string of one text line.
  168. We are trying to simulate some horizontal layout here, too.
  169. Args:
  170. clip: (pymupdf.Rect) the area from which all text is being read.
  171. line: (list) word tuples (rect, text) contained in the line
  172. Returns:
  173. Text in this line. Generated from words in 'line'. Distance from
  174. predecessor is translated to multiple spaces, thus simulating
  175. text indentations and large horizontal distances.
  176. """
  177. line.sort(key=lambda w: w[0].x0)
  178. ltext = "" # text in the line
  179. x1 = clip.x0 # end coordinate of ltext
  180. lrect = pymupdf.EMPTY_RECT() # bbox of this line
  181. for r, t in line:
  182. lrect |= r # update line bbox
  183. # convert distance to previous word to multiple spaces
  184. dist = max(
  185. int(round((r.x0 - x1) / r.width * len(t))),
  186. 0 if (x1 == clip.x0 or r.x0 <= x1) else 1,
  187. ) # number of space characters
  188. ltext += " " * dist + t # append word string
  189. x1 = r.x1 # update new end position
  190. return ltext
  191. # Extract words in correct sequence first.
  192. words = [
  193. (pymupdf.Rect(w[:4]), w[4])
  194. for w in get_text_words(
  195. page,
  196. clip=clip,
  197. flags=flags,
  198. textpage=textpage,
  199. sort=True,
  200. tolerance=tolerance,
  201. )
  202. ]
  203. if not words: # no text present
  204. return ""
  205. totalbox = pymupdf.EMPTY_RECT() # area covering all text
  206. for wr, text in words:
  207. totalbox |= wr
  208. lines = [] # list of reconstituted lines
  209. line = [words[0]] # current line
  210. lrect = words[0][0] # the line's rectangle
  211. # walk through the words
  212. for wr, text in words[1:]: # start with second word
  213. w0r, _ = line[-1] # read previous word in current line
  214. # if this word matches top or bottom of the line, append it
  215. if abs(lrect.y0 - wr.y0) <= tolerance or abs(lrect.y1 - wr.y1) <= tolerance:
  216. line.append((wr, text))
  217. lrect |= wr
  218. else:
  219. # output current line and re-initialize
  220. ltext = line_text(totalbox, line)
  221. lines.append((lrect, ltext))
  222. line = [(wr, text)]
  223. lrect = wr
  224. # also append unfinished last line
  225. ltext = line_text(totalbox, line)
  226. lines.append((lrect, ltext))
  227. # sort all lines vertically
  228. lines.sort(key=lambda l: (l[0].y1))
  229. text = lines[0][1] # text of first line
  230. y1 = lines[0][0].y1 # its bottom coordinate
  231. for lrect, ltext in lines[1:]:
  232. distance = min(int(round((lrect.y0 - y1) / lrect.height)), 5)
  233. breaks = "\n" * (distance + 1)
  234. text += breaks + ltext
  235. y1 = lrect.y1
  236. # return text in clip
  237. return text
  238. def get_textbox(
  239. page: pymupdf.Page,
  240. rect: rect_like,
  241. textpage: pymupdf.TextPage = None,
  242. ) -> str:
  243. tp = textpage
  244. if tp is None:
  245. tp = page.get_textpage()
  246. elif getattr(tp, "parent") != page:
  247. raise ValueError("not a textpage of this page")
  248. rc = tp.extractTextbox(rect)
  249. if textpage is None:
  250. del tp
  251. return rc
  252. def get_text_selection(
  253. page: pymupdf.Page,
  254. p1: point_like,
  255. p2: point_like,
  256. clip: rect_like = None,
  257. textpage: pymupdf.TextPage = None,
  258. ):
  259. pymupdf.CheckParent(page)
  260. tp = textpage
  261. if tp is None:
  262. tp = page.get_textpage(clip=clip, flags=pymupdf.TEXT_DEHYPHENATE)
  263. elif getattr(tp, "parent") != page:
  264. raise ValueError("not a textpage of this page")
  265. rc = tp.extractSelection(p1, p2)
  266. if textpage is None:
  267. del tp
  268. return rc
def get_textpage_ocr(
    page: pymupdf.Page,
    flags: int = 0,
    language: str = "eng",
    dpi: int = 72,
    full: bool = False,
    tessdata: str = None,
) -> pymupdf.TextPage:
    """Create a Textpage from combined results of normal and OCR text parsing.

    Args:
        page: the page to process.
        flags: (int) control content becoming part of the result.
        language: (str) specify expected language(s). Default is "eng" (English).
        dpi: (int) resolution in dpi, default 72. Only used for rendering
            the page when the full page is OCRed.
        full: (bool) whether to OCR the full page image, or only its images (default)
        tessdata: (str) resolved via pymupdf.get_tessdata() and passed to
            the OCR calls.

    Returns:
        A pymupdf.TextPage. For partial OCR, it is the page's normal
        textpage extended with text OCRed from each image. If OCR of an
        image raises RuntimeError or mupdf.FzErrorBase, the result of a
        full page OCR is returned instead.
    """
    pymupdf.CheckParent(page)
    tessdata = pymupdf.get_tessdata(tessdata)

    def full_ocr(page, dpi, language, flags):
        """OCR a rendering of the whole page and return its textpage."""
        zoom = dpi / 72
        mat = pymupdf.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)
        # 1-page PDF created from the OCRed pixmap
        ocr_pdf = pymupdf.Document(
            "pdf",
            pix.pdfocr_tobytes(
                compress=False,
                language=language,
                tessdata=tessdata,
            ),
        )
        ocr_page = ocr_pdf.load_page(0)
        # scale coordinates of the OCR page back to the original page width
        unzoom = page.rect.width / ocr_page.rect.width
        ctm = pymupdf.Matrix(unzoom, unzoom) * page.derotation_matrix
        tpage = ocr_page.get_textpage(flags=flags, matrix=ctm)
        ocr_pdf.close()
        pix = None
        # make the textpage refer to the original page, not the OCR page
        tpage.parent = weakref.proxy(page)
        return tpage

    # if OCR for the full page, OCR its pixmap @ desired dpi
    if full:
        return full_ocr(page, dpi, language, flags)

    # For partial OCR, make a normal textpage, then extend it with text that
    # is OCRed from each image.
    # Because of this, we need the images flag bit set ON.
    tpage = page.get_textpage(flags=flags)
    for block in page.get_text("dict", flags=pymupdf.TEXT_PRESERVE_IMAGES)["blocks"]:
        if block["type"] != 1:  # only look at images
            continue
        bbox = pymupdf.Rect(block["bbox"])
        if bbox.width <= 3 or bbox.height <= 3:  # ignore tiny stuff
            continue
        try:
            pix = pymupdf.Pixmap(block["image"])  # get image pixmap
            if pix.n - pix.alpha != 3:  # we need to convert this to RGB!
                pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
            if pix.alpha:  # must remove alpha channel
                pix = pymupdf.Pixmap(pix, 0)
            imgdoc = pymupdf.Document(
                "pdf",
                pix.pdfocr_tobytes(language=language, tessdata=tessdata),
            )  # pdf with OCRed page
            imgpage = imgdoc.load_page(0)  # read image as a page
            pix = None
            # compute matrix to transform coordinates back to that of 'page'
            imgrect = imgpage.rect  # page size of image PDF
            shrink = pymupdf.Matrix(1 / imgrect.width, 1 / imgrect.height)
            mat = shrink * block["transform"]
            imgpage.extend_textpage(tpage, flags=0, matrix=mat)
            imgdoc.close()
        except (RuntimeError, mupdf.FzErrorBase):
            # NOTE(review): the leading '0' disables this diagnostic for good.
            if 0 and g_exceptions_verbose:
                # Don't show exception info here because it can happen in
                # normal operation (see test_3842b).
                pymupdf.exception_info()
            # discard the partially extended textpage and OCR the whole page
            tpage = None
            pymupdf.message("Falling back to full page OCR")
            return full_ocr(page, dpi, language, flags)

    return tpage
  346. def get_text(
  347. page: pymupdf.Page,
  348. option: str = "text",
  349. *,
  350. clip: rect_like = None,
  351. flags: OptInt = None,
  352. textpage: pymupdf.TextPage = None,
  353. sort: bool = False,
  354. delimiters=None,
  355. tolerance=3,
  356. ):
  357. """Extract text from a page or an annotation.
  358. This is a unifying wrapper for various methods of the pymupdf.TextPage class.
  359. Args:
  360. option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml.
  361. clip: (rect-like) restrict output to this area.
  362. flags: bit switches to e.g. exclude images or decompose ligatures.
  363. textpage: reuse this pymupdf.TextPage and make no new one. If specified,
  364. 'flags' and 'clip' are ignored.
  365. Returns:
  366. the output of methods get_text_words / get_text_blocks or pymupdf.TextPage
  367. methods extractText, extractHTML, extractDICT, extractJSON, extractRAWDICT,
  368. extractXHTML or etractXML respectively.
  369. Default and misspelling choice is "text".
  370. """
  371. formats = {
  372. "text": pymupdf.TEXTFLAGS_TEXT,
  373. "html": pymupdf.TEXTFLAGS_HTML,
  374. "json": pymupdf.TEXTFLAGS_DICT,
  375. "rawjson": pymupdf.TEXTFLAGS_RAWDICT,
  376. "xml": pymupdf.TEXTFLAGS_XML,
  377. "xhtml": pymupdf.TEXTFLAGS_XHTML,
  378. "dict": pymupdf.TEXTFLAGS_DICT,
  379. "rawdict": pymupdf.TEXTFLAGS_RAWDICT,
  380. "words": pymupdf.TEXTFLAGS_WORDS,
  381. "blocks": pymupdf.TEXTFLAGS_BLOCKS,
  382. }
  383. option = option.lower()
  384. assert option in formats
  385. if option not in formats:
  386. option = "text"
  387. if flags is None:
  388. flags = formats[option]
  389. if option == "words":
  390. return get_text_words(
  391. page,
  392. clip=clip,
  393. flags=flags,
  394. textpage=textpage,
  395. sort=sort,
  396. delimiters=delimiters,
  397. )
  398. if option == "blocks":
  399. return get_text_blocks(
  400. page, clip=clip, flags=flags, textpage=textpage, sort=sort
  401. )
  402. if option == "text" and sort:
  403. return get_sorted_text(
  404. page,
  405. clip=clip,
  406. flags=flags,
  407. textpage=textpage,
  408. tolerance=tolerance,
  409. )
  410. pymupdf.CheckParent(page)
  411. cb = None
  412. if option in ("html", "xml", "xhtml"): # no clipping for MuPDF functions
  413. clip = page.cropbox
  414. if clip is not None:
  415. clip = pymupdf.Rect(clip)
  416. cb = None
  417. elif type(page) is pymupdf.Page:
  418. cb = page.cropbox
  419. # pymupdf.TextPage with or without images
  420. tp = textpage
  421. #pymupdf.exception_info()
  422. if tp is None:
  423. tp = page.get_textpage(clip=clip, flags=flags)
  424. elif getattr(tp, "parent") != page:
  425. raise ValueError("not a textpage of this page")
  426. #pymupdf.log( '{option=}')
  427. if option == "json":
  428. t = tp.extractJSON(cb=cb, sort=sort)
  429. elif option == "rawjson":
  430. t = tp.extractRAWJSON(cb=cb, sort=sort)
  431. elif option == "dict":
  432. t = tp.extractDICT(cb=cb, sort=sort)
  433. elif option == "rawdict":
  434. t = tp.extractRAWDICT(cb=cb, sort=sort)
  435. elif option == "html":
  436. t = tp.extractHTML()
  437. elif option == "xml":
  438. t = tp.extractXML()
  439. elif option == "xhtml":
  440. t = tp.extractXHTML()
  441. else:
  442. t = tp.extractText(sort=sort)
  443. if textpage is None:
  444. del tp
  445. return t
  446. def getLinkDict(ln, document=None) -> dict:
  447. if isinstance(ln, pymupdf.Outline):
  448. dest = ln.destination(document)
  449. elif isinstance(ln, pymupdf.Link):
  450. dest = ln.dest
  451. else:
  452. assert 0, f'Unexpected {type(ln)=}.'
  453. nl = {"kind": dest.kind, "xref": 0}
  454. try:
  455. if hasattr(ln, 'rect'):
  456. nl["from"] = ln.rect
  457. except Exception:
  458. # This seems to happen quite often in PyMuPDF/tests.
  459. if g_exceptions_verbose >= 2: pymupdf.exception_info()
  460. pass
  461. pnt = pymupdf.Point(0, 0)
  462. if dest.flags & pymupdf.LINK_FLAG_L_VALID:
  463. pnt.x = dest.lt.x
  464. if dest.flags & pymupdf.LINK_FLAG_T_VALID:
  465. pnt.y = dest.lt.y
  466. if dest.kind == pymupdf.LINK_URI:
  467. nl["uri"] = dest.uri
  468. elif dest.kind == pymupdf.LINK_GOTO:
  469. nl["page"] = dest.page
  470. nl["to"] = pnt
  471. if dest.flags & pymupdf.LINK_FLAG_R_IS_ZOOM:
  472. nl["zoom"] = dest.rb.x
  473. else:
  474. nl["zoom"] = 0.0
  475. elif dest.kind == pymupdf.LINK_GOTOR:
  476. nl["file"] = dest.file_spec.replace("\\", "/")
  477. nl["page"] = dest.page
  478. if dest.page < 0:
  479. nl["to"] = dest.dest
  480. else:
  481. nl["to"] = pnt
  482. if dest.flags & pymupdf.LINK_FLAG_R_IS_ZOOM:
  483. nl["zoom"] = dest.rb.x
  484. else:
  485. nl["zoom"] = 0.0
  486. elif dest.kind == pymupdf.LINK_LAUNCH:
  487. nl["file"] = dest.file_spec.replace("\\", "/")
  488. elif dest.kind == pymupdf.LINK_NAMED:
  489. # The dicts should not have same key(s).
  490. assert not (dest.named.keys() & nl.keys())
  491. nl.update(dest.named)
  492. if 'to' in nl:
  493. nl['to'] = pymupdf.Point(nl['to'])
  494. else:
  495. nl["page"] = dest.page
  496. return nl
  497. def getDestStr(xref: int, ddict: dict) -> str:
  498. """Calculate the PDF action string.
  499. Notes:
  500. Supports Link annotations and outline items (bookmarks).
  501. """
  502. if not ddict:
  503. return ""
  504. str_goto = lambda a, b, c, d: f"/A<</S/GoTo/D[{a} 0 R/XYZ {_format_g((b, c, d))}]>>"
  505. str_gotor1 = lambda a, b, c, d, e, f: f"/A<</S/GoToR/D[{a} /XYZ {_format_g((b, c, d))}]/F<</F{e}/UF{f}/Type/Filespec>>>>"
  506. str_gotor2 = lambda a, b, c: f"/A<</S/GoToR/D{a}/F<</F{b}/UF{c}/Type/Filespec>>>>"
  507. str_launch = lambda a, b: f"/A<</S/Launch/F<</F{a}/UF{b}/Type/Filespec>>>>"
  508. str_uri = lambda a: f"/A<</S/URI/URI{a}>>"
  509. if type(ddict) in (int, float):
  510. dest = str_goto(xref, 0, ddict, 0)
  511. return dest
  512. d_kind = ddict.get("kind", pymupdf.LINK_NONE)
  513. if d_kind == pymupdf.LINK_NONE:
  514. return ""
  515. if ddict["kind"] == pymupdf.LINK_GOTO:
  516. d_zoom = ddict.get("zoom", 0)
  517. to = ddict.get("to", pymupdf.Point(0, 0))
  518. d_left, d_top = to
  519. dest = str_goto(xref, d_left, d_top, d_zoom)
  520. return dest
  521. if ddict["kind"] == pymupdf.LINK_URI:
  522. dest = str_uri(pymupdf.get_pdf_str(ddict["uri"]),)
  523. return dest
  524. if ddict["kind"] == pymupdf.LINK_LAUNCH:
  525. fspec = pymupdf.get_pdf_str(ddict["file"])
  526. dest = str_launch(fspec, fspec)
  527. return dest
  528. if ddict["kind"] == pymupdf.LINK_GOTOR and ddict["page"] < 0:
  529. fspec = pymupdf.get_pdf_str(ddict["file"])
  530. dest = str_gotor2(pymupdf.get_pdf_str(ddict["to"]), fspec, fspec)
  531. return dest
  532. if ddict["kind"] == pymupdf.LINK_GOTOR and ddict["page"] >= 0:
  533. fspec = pymupdf.get_pdf_str(ddict["file"])
  534. dest = str_gotor1(
  535. ddict["page"],
  536. ddict["to"].x,
  537. ddict["to"].y,
  538. ddict["zoom"],
  539. fspec,
  540. fspec,
  541. )
  542. return dest
  543. return ""
  544. def getLinkText(page: pymupdf.Page, lnk: dict) -> str:
  545. # --------------------------------------------------------------------------
  546. # define skeletons for /Annots object texts
  547. # --------------------------------------------------------------------------
  548. ctm = page.transformation_matrix
  549. ictm = ~ctm
  550. r = lnk["from"]
  551. rect = _format_g(tuple(r * ictm))
  552. annot = ""
  553. if lnk["kind"] == pymupdf.LINK_GOTO:
  554. if lnk["page"] >= 0:
  555. txt = pymupdf.annot_skel["goto1"] # annot_goto
  556. pno = lnk["page"]
  557. xref = page.parent.page_xref(pno)
  558. pnt = lnk.get("to", pymupdf.Point(0, 0)) # destination point
  559. dest_page = page.parent[pno]
  560. dest_ctm = dest_page.transformation_matrix
  561. dest_ictm = ~dest_ctm
  562. ipnt = pnt * dest_ictm
  563. annot = txt(xref, ipnt.x, ipnt.y, lnk.get("zoom", 0), rect)
  564. else:
  565. txt = pymupdf.annot_skel["goto2"] # annot_goto_n
  566. annot = txt(pymupdf.get_pdf_str(lnk["to"]), rect)
  567. elif lnk["kind"] == pymupdf.LINK_GOTOR:
  568. if lnk["page"] >= 0:
  569. txt = pymupdf.annot_skel["gotor1"] # annot_gotor
  570. pnt = lnk.get("to", pymupdf.Point(0, 0)) # destination point
  571. if type(pnt) is not pymupdf.Point:
  572. pnt = pymupdf.Point(0, 0)
  573. annot = txt(
  574. lnk["page"],
  575. pnt.x,
  576. pnt.y,
  577. lnk.get("zoom", 0),
  578. lnk["file"],
  579. lnk["file"],
  580. rect,
  581. )
  582. else:
  583. txt = pymupdf.annot_skel["gotor2"] # annot_gotor_n
  584. annot = txt(pymupdf.get_pdf_str(lnk["to"]), lnk["file"], rect)
  585. elif lnk["kind"] == pymupdf.LINK_LAUNCH:
  586. txt = pymupdf.annot_skel["launch"] # annot_launch
  587. annot = txt(lnk["file"], lnk["file"], rect)
  588. elif lnk["kind"] == pymupdf.LINK_URI:
  589. txt = pymupdf.annot_skel["uri"] # txt = annot_uri
  590. annot = txt(lnk["uri"], rect)
  591. elif lnk["kind"] == pymupdf.LINK_NAMED:
  592. txt = pymupdf.annot_skel["named"] # annot_named
  593. lname = lnk.get("name") # check presence of key
  594. if lname is None: # if missing, fall back to alternative
  595. lname = lnk["nameddest"]
  596. annot = txt(lname, rect)
  597. if not annot:
  598. return annot
  599. # add a /NM PDF key to the object definition
  600. link_names = dict( # existing ids and their xref
  601. [(x[0], x[2]) for x in page.annot_xrefs() if x[1] == pymupdf.PDF_ANNOT_LINK] # pylint: disable=no-member
  602. )
  603. old_name = lnk.get("id", "") # id value in the argument
  604. if old_name and (lnk["xref"], old_name) in link_names.items():
  605. name = old_name # no new name if this is an update only
  606. else:
  607. i = 0
  608. stem = pymupdf.TOOLS.set_annot_stem() + "-L%i"
  609. while True:
  610. name = stem % i
  611. if name not in link_names.values():
  612. break
  613. i += 1
  614. # add /NM key to object definition
  615. annot = annot.replace("/Link", "/Link/NM(%s)" % name)
  616. return annot
  617. # ----------------------------------------------------------------------
  618. # Name: wx.lib.colourdb.py
  619. # Purpose: Adds a bunch of colour names and RGB values to the
  620. # colour database so they can be found by name
  621. #
  622. # Author: Robin Dunn
  623. #
  624. # Created: 13-March-2001
  625. # Copyright: (c) 2001-2017 by Total Control Software
  626. # Licence: wxWindows license
  627. # Tags: phoenix-port, unittest, documented
  628. # ----------------------------------------------------------------------
  629. def getColorList() -> list:
  630. """
  631. Returns a list of upper-case colour names.
  632. :rtype: list of strings
  633. """
  634. return [name for name, r, g, b in pymupdf.colors_wx_list()]
  635. def getColorInfoList() -> list:
  636. """
  637. Returns list of (name, red, gree, blue) tuples, where:
  638. name: upper-case color name.
  639. read, green, blue: integers in range 0..255.
  640. :rtype: list of tuples
  641. """
  642. return pymupdf.colors_wx_list()
  643. def getColor(name: str) -> tuple:
  644. """Retrieve RGB color in PDF format by name.
  645. Returns:
  646. a triple of floats in range 0 to 1. In case of name-not-found, "white" is returned.
  647. """
  648. return pymupdf.colors_pdf_dict().get(name.lower(), (1, 1, 1))
  649. def getColorHSV(name: str) -> tuple:
  650. """Retrieve the hue, saturation, value triple of a color name.
  651. Returns:
  652. a triple (degree, percent, percent). If not found (-1, -1, -1) is returned.
  653. """
  654. try:
  655. x = getColorInfoList()[getColorList().index(name.upper())]
  656. except Exception:
  657. if g_exceptions_verbose: pymupdf.exception_info()
  658. return (-1, -1, -1)
  659. r = x[1] / 255.0
  660. g = x[2] / 255.0
  661. b = x[3] / 255.0
  662. cmax = max(r, g, b)
  663. V = round(cmax * 100, 1)
  664. cmin = min(r, g, b)
  665. delta = cmax - cmin
  666. if delta == 0:
  667. hue = 0
  668. elif cmax == r:
  669. hue = 60.0 * (((g - b) / delta) % 6)
  670. elif cmax == g:
  671. hue = 60.0 * (((b - r) / delta) + 2)
  672. else:
  673. hue = 60.0 * (((r - g) / delta) + 4)
  674. H = int(round(hue))
  675. if cmax == 0:
  676. sat = 0
  677. else:
  678. sat = delta / cmax
  679. S = int(round(sat * 100))
  680. return (H, S, V)
def _get_font_properties(doc: pymupdf.Document, xref: int) -> tuple:
    """Return name, extension, type, ascender and descender of a font.

    Args:
        doc: the document containing the font.
        xref: the xref passed to doc.extract_font().

    Returns:
        A tuple (fontname, ext, stype, asc, dsc). The first three items
        are taken from doc.extract_font(xref) unchanged.
    """
    fontname, ext, stype, buffer = doc.extract_font(xref)
    # fallback values, used whenever no font object can be consulted
    asc = 0.8
    dsc = -0.2
    if ext == "":
        # empty extension: return the unscaled fallback values
        return fontname, ext, stype, asc, dsc

    if buffer:
        # font data present: read the metrics from the font itself
        try:
            font = pymupdf.Font(fontbuffer=buffer)
            asc = font.ascender
            dsc = font.descender
            bbox = font.bbox
            # Total height below 1: use the bbox bottom if it is lower than
            # the descender, then set the ascender so that the sum is 1.
            if asc - dsc < 1:
                if bbox.y0 < dsc:
                    dsc = bbox.y0
                asc = 1 - dsc
        except Exception:
            pymupdf.exception_info()
            # scales whatever asc / dsc hold when the exception occurred
            asc *= 1.2
            dsc *= 1.2
        return fontname, ext, stype, asc, dsc
    if ext != "n/a":
        # no font data: try to create the font from its name
        try:
            font = pymupdf.Font(fontname)
            asc = font.ascender
            dsc = font.descender
        except Exception:
            pymupdf.exception_info()
            asc *= 1.2
            dsc *= 1.2
    else:
        # extension "n/a": scaled fallback values
        asc *= 1.2
        dsc *= 1.2
    return fontname, ext, stype, asc, dsc
  715. def _show_fz_text( text):
  716. #if mupdf_cppyy:
  717. # assert isinstance( text, cppyy.gbl.mupdf.Text)
  718. #else:
  719. # assert isinstance( text, mupdf.Text)
  720. num_spans = 0
  721. num_chars = 0
  722. span = text.m_internal.head
  723. while 1:
  724. if not span:
  725. break
  726. num_spans += 1
  727. num_chars += span.len
  728. span = span.next
  729. return f'num_spans={num_spans} num_chars={num_chars}'
  730. """
  731. Handle page labels for PDF documents.
  732. Reading
  733. -------
  734. * compute the label of a page
  735. * find page number(s) having the given label.
  736. Writing
  737. -------
  738. Supports setting (defining) page labels for PDF documents.
  739. A big Thank You goes to WILLIAM CHAPMAN who contributed the idea and
  740. significant parts of the following code during late December 2020
  741. through early January 2021.
  742. """
  743. def rule_dict(item):
  744. """Make a Python dict from a PDF page label rule.
  745. Args:
  746. item -- a tuple (pno, rule) with the start page number and the rule
  747. string like <</S/D...>>.
  748. Returns:
  749. A dict like
  750. {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}.
  751. """
  752. # Jorj McKie, 2021-01-06
  753. pno, rule = item
  754. rule = rule[2:-2].split("/")[1:] # strip "<<" and ">>"
  755. d = {"startpage": pno, "prefix": "", "firstpagenum": 1}
  756. skip = False
  757. for i, item in enumerate(rule): # pylint: disable=redefined-argument-from-local
  758. if skip: # this item has already been processed
  759. skip = False # deactivate skipping again
  760. continue
  761. if item == "S": # style specification
  762. d["style"] = rule[i + 1] # next item has the style
  763. skip = True # do not process next item again
  764. continue
  765. if item.startswith("P"): # prefix specification: extract the string
  766. x = item[1:].replace("(", "").replace(")", "")
  767. d["prefix"] = x
  768. continue
  769. if item.startswith("St"): # start page number specification
  770. x = int(item[2:])
  771. d["firstpagenum"] = x
  772. return d
  773. def get_label_pno(pgNo, labels):
  774. """Return the label for this page number.
  775. Args:
  776. pgNo: page number, 0-based.
  777. labels: result of doc._get_page_labels().
  778. Returns:
  779. The label (str) of the page number. Errors return an empty string.
  780. """
  781. # Jorj McKie, 2021-01-06
  782. item = [x for x in labels if x[0] <= pgNo][-1]
  783. rule = rule_dict(item)
  784. prefix = rule.get("prefix", "")
  785. style = rule.get("style", "")
  786. # make sure we start at 0 when enumerating the alphabet
  787. delta = -1 if style in ("a", "A") else 0
  788. pagenumber = pgNo - rule["startpage"] + rule["firstpagenum"] + delta
  789. return construct_label(style, prefix, pagenumber)
  790. def construct_label(style, prefix, pno) -> str:
  791. """Construct a label based on style, prefix and page number."""
  792. # William Chapman, 2021-01-06
  793. n_str = ""
  794. if style == "D":
  795. n_str = str(pno)
  796. elif style == "r":
  797. n_str = integerToRoman(pno).lower()
  798. elif style == "R":
  799. n_str = integerToRoman(pno).upper()
  800. elif style == "a":
  801. n_str = integerToLetter(pno).lower()
  802. elif style == "A":
  803. n_str = integerToLetter(pno).upper()
  804. result = prefix + n_str
  805. return result
  806. def integerToLetter(i) -> str:
  807. """Returns letter sequence string for integer i."""
  808. # William Chapman, Jorj McKie, 2021-01-06
  809. import string
  810. ls = string.ascii_uppercase
  811. n, a = 1, i
  812. while pow(26, n) <= a:
  813. a -= int(math.pow(26, n))
  814. n += 1
  815. str_t = ""
  816. for j in reversed(range(n)):
  817. f, g = divmod(a, int(math.pow(26, j)))
  818. str_t += ls[f]
  819. a = g
  820. return str_t
  821. def integerToRoman(num: int) -> str:
  822. """Return roman numeral for an integer."""
  823. # William Chapman, Jorj McKie, 2021-01-06
  824. roman = (
  825. (1000, "M"),
  826. (900, "CM"),
  827. (500, "D"),
  828. (400, "CD"),
  829. (100, "C"),
  830. (90, "XC"),
  831. (50, "L"),
  832. (40, "XL"),
  833. (10, "X"),
  834. (9, "IX"),
  835. (5, "V"),
  836. (4, "IV"),
  837. (1, "I"),
  838. )
  839. def roman_num(num):
  840. for r, ltr in roman:
  841. x, _ = divmod(num, r)
  842. yield ltr * x
  843. num -= r * x
  844. if num <= 0:
  845. break
  846. return "".join([a for a in roman_num(num)])
  847. # -------------------------------------------------------------------
  848. # Functions to recover the quad contained in a text extraction bbox
  849. # -------------------------------------------------------------------
  850. def recover_bbox_quad(line_dir: tuple, span: dict, bbox: tuple) -> pymupdf.Quad:
  851. """Compute the quad located inside the bbox.
  852. The bbox may be any of the resp. tuples occurring inside the given span.
  853. Args:
  854. line_dir: (tuple) 'line["dir"]' of the owning line or None.
  855. span: (dict) the span. May be from get_texttrace() method.
  856. bbox: (tuple) the bbox of the span or any of its characters.
  857. Returns:
  858. The quad which is wrapped by the bbox.
  859. """
  860. if line_dir is None:
  861. line_dir = span["dir"]
  862. cos, sin = line_dir
  863. bbox = pymupdf.Rect(bbox) # make it a rect
  864. if pymupdf.TOOLS.set_small_glyph_heights(): # ==> just fontsize as height
  865. d = 1
  866. else:
  867. d = span["ascender"] - span["descender"]
  868. height = d * span["size"] # the quad's rectangle height
  869. # The following are distances from the bbox corners, at which we find the
  870. # respective quad points. The computation depends on in which quadrant the
  871. # text writing angle is located.
  872. hs = height * sin
  873. hc = height * cos
  874. if hc >= 0 and hs <= 0: # quadrant 1
  875. ul = bbox.bl - (0, hc)
  876. ur = bbox.tr + (hs, 0)
  877. ll = bbox.bl - (hs, 0)
  878. lr = bbox.tr + (0, hc)
  879. elif hc <= 0 and hs <= 0: # quadrant 2
  880. ul = bbox.br + (hs, 0)
  881. ur = bbox.tl - (0, hc)
  882. ll = bbox.br + (0, hc)
  883. lr = bbox.tl - (hs, 0)
  884. elif hc <= 0 and hs >= 0: # quadrant 3
  885. ul = bbox.tr - (0, hc)
  886. ur = bbox.bl + (hs, 0)
  887. ll = bbox.tr - (hs, 0)
  888. lr = bbox.bl + (0, hc)
  889. else: # quadrant 4
  890. ul = bbox.tl + (hs, 0)
  891. ur = bbox.br - (0, hc)
  892. ll = bbox.tl + (0, hc)
  893. lr = bbox.br - (hs, 0)
  894. return pymupdf.Quad(ul, ur, ll, lr)
  895. def recover_quad(line_dir: tuple, span: dict) -> pymupdf.Quad:
  896. """Recover the quadrilateral of a text span.
  897. Args:
  898. line_dir: (tuple) 'line["dir"]' of the owning line.
  899. span: the span.
  900. Returns:
  901. The quadrilateral enveloping the span's text.
  902. """
  903. if type(line_dir) is not tuple or len(line_dir) != 2:
  904. raise ValueError("bad line dir argument")
  905. if type(span) is not dict:
  906. raise ValueError("bad span argument")
  907. return recover_bbox_quad(line_dir, span, span["bbox"])
  908. def recover_line_quad(line: dict, spans: list = None) -> pymupdf.Quad:
  909. """Calculate the line quad for 'dict' / 'rawdict' text extractions.
  910. The lower quad points are those of the first, resp. last span quad.
  911. The upper points are determined by the maximum span quad height.
  912. From this, compute a rect with bottom-left in (0, 0), convert this to a
  913. quad and rotate and shift back to cover the text of the spans.
  914. Args:
  915. spans: (list, optional) sub-list of spans to consider.
  916. Returns:
  917. pymupdf.Quad covering selected spans.
  918. """
  919. if spans is None: # no sub-selection
  920. spans = line["spans"] # all spans
  921. if len(spans) == 0:
  922. raise ValueError("bad span list")
  923. line_dir = line["dir"] # text direction
  924. cos, sin = line_dir
  925. q0 = recover_quad(line_dir, spans[0]) # quad of first span
  926. if len(spans) > 1: # get quad of last span
  927. q1 = recover_quad(line_dir, spans[-1])
  928. else:
  929. q1 = q0 # last = first
  930. line_ll = q0.ll # lower-left of line quad
  931. line_lr = q1.lr # lower-right of line quad
  932. mat0 = pymupdf.planish_line(line_ll, line_lr)
  933. # map base line to x-axis such that line_ll goes to (0, 0)
  934. x_lr = line_lr * mat0
  935. small = pymupdf.TOOLS.set_small_glyph_heights() # small glyph heights?
  936. h = max(
  937. [s["size"] * (1 if small else (s["ascender"] - s["descender"])) for s in spans]
  938. )
  939. line_rect = pymupdf.Rect(0, -h, x_lr.x, 0) # line rectangle
  940. line_quad = line_rect.quad # make it a quad and:
  941. line_quad *= ~mat0
  942. return line_quad
  943. def recover_span_quad(line_dir: tuple, span: dict, chars: list = None) -> pymupdf.Quad:
  944. """Calculate the span quad for 'dict' / 'rawdict' text extractions.
  945. Notes:
  946. There are two execution paths:
  947. 1. For the full span quad, the result of 'recover_quad' is returned.
  948. 2. For the quad of a sub-list of characters, the char quads are
  949. computed and joined. This is only supported for the "rawdict"
  950. extraction option.
  951. Args:
  952. line_dir: (tuple) 'line["dir"]' of the owning line.
  953. span: (dict) the span.
  954. chars: (list, optional) sub-list of characters to consider.
  955. Returns:
  956. pymupdf.Quad covering selected characters.
  957. """
  958. if line_dir is None: # must be a span from get_texttrace()
  959. line_dir = span["dir"]
  960. if chars is None: # no sub-selection
  961. return recover_quad(line_dir, span)
  962. if "chars" not in span.keys():
  963. raise ValueError("need 'rawdict' option to sub-select chars")
  964. q0 = recover_char_quad(line_dir, span, chars[0]) # quad of first char
  965. if len(chars) > 1: # get quad of last char
  966. q1 = recover_char_quad(line_dir, span, chars[-1])
  967. else:
  968. q1 = q0 # last = first
  969. span_ll = q0.ll # lower-left of span quad
  970. span_lr = q1.lr # lower-right of span quad
  971. mat0 = pymupdf.planish_line(span_ll, span_lr)
  972. # map base line to x-axis such that span_ll goes to (0, 0)
  973. x_lr = span_lr * mat0
  974. small = pymupdf.TOOLS.set_small_glyph_heights() # small glyph heights?
  975. h = span["size"] * (1 if small else (span["ascender"] - span["descender"]))
  976. span_rect = pymupdf.Rect(0, -h, x_lr.x, 0) # line rectangle
  977. span_quad = span_rect.quad # make it a quad and:
  978. span_quad *= ~mat0 # rotate back and shift back
  979. return span_quad
  980. def recover_char_quad(line_dir: tuple, span: dict, char: dict) -> pymupdf.Quad:
  981. """Recover the quadrilateral of a text character.
  982. This requires the "rawdict" option of text extraction.
  983. Args:
  984. line_dir: (tuple) 'line["dir"]' of the span's line.
  985. span: (dict) the span dict.
  986. char: (dict) the character dict.
  987. Returns:
  988. The quadrilateral enveloping the character.
  989. """
  990. if line_dir is None:
  991. line_dir = span["dir"]
  992. if type(line_dir) is not tuple or len(line_dir) != 2:
  993. raise ValueError("bad line dir argument")
  994. if type(span) is not dict:
  995. raise ValueError("bad span argument")
  996. if type(char) is dict:
  997. bbox = pymupdf.Rect(char["bbox"])
  998. elif type(char) is tuple:
  999. bbox = pymupdf.Rect(char[3])
  1000. else:
  1001. raise ValueError("bad span argument")
  1002. return recover_bbox_quad(line_dir, span, bbox)