| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169 |
- # ------------------------------------------------------------------------
- # Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com
- # License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html
- #
- # Part of "PyMuPDF", a Python binding for "MuPDF" (http://mupdf.com), a
- # lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is
- # maintained and developed by Artifex Software, Inc. https://artifex.com.
- # ------------------------------------------------------------------------
- import math
- import typing
- import weakref
- try:
- from . import pymupdf
- except Exception:
- import pymupdf
- try:
- from . import mupdf
- except Exception:
- import mupdf
# Shortcuts to objects exported by the main pymupdf module.
_format_g = pymupdf.format_g
g_exceptions_verbose = pymupdf.g_exceptions_verbose

# Plain strings used as annotations for geometry-like arguments in the
# signatures of this module.
point_like = "point_like"
rect_like = "rect_like"
matrix_like = "matrix_like"
quad_like = "quad_like"

# ByteString is gone from typing in 3.14.
# collections.abc.Buffer available from 3.12 only
ByteString = getattr(typing, "ByteString", None)
if ByteString is None:
    # pylint: disable=unsupported-binary-operation
    ByteString = bytes | bytearray | memoryview

# Aliases for frequently used (optional) argument types.
AnyType = typing.Any
OptInt = typing.Optional[int]
OptFloat = typing.Optional[float]
OptStr = typing.Optional[str]
OptDict = typing.Optional[dict]
OptBytes = typing.Optional[ByteString]
OptSeq = typing.Optional[typing.Sequence]
- """
- This is a collection of functions to extend PyMuPDF.
- """
def get_text_blocks(
    page: pymupdf.Page,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
) -> list:
    """Return the text blocks on a page.

    Notes:
        Lines in a block are concatenated with line breaks.

    Args:
        page: the page to extract from.
        clip: (rect-like) handed to page.get_textpage() when no textpage
            is passed in.
        flags: (int) control the amount of data parsed into the textpage.
            Defaults to pymupdf.TEXTFLAGS_BLOCKS.
        textpage: (pymupdf.TextPage) reuse this textpage instead of making
            a new one. Must belong to 'page'.
        sort: (bool) sort the blocks by items 3 and 0 of each block tuple
            (presumably bottom, then left coordinate).

    Returns:
        A list of the blocks. Each item contains the containing rectangle
        coordinates, text lines, running block number and block type.

    Raises:
        ValueError: if 'textpage' is not a textpage of 'page'.
    """
    pymupdf.CheckParent(page)
    if flags is None:
        flags = pymupdf.TEXTFLAGS_BLOCKS

    made_here = textpage is None
    if made_here:
        tp = page.get_textpage(clip=clip, flags=flags)
    else:
        tp = textpage
        if tp.parent != page:
            raise ValueError("not a textpage of this page")

    blocks = tp.extractBLOCKS()
    if made_here:
        del tp  # drop the temporary textpage

    if sort:
        blocks.sort(key=lambda blk: (blk[3], blk[0]))
    return blocks
def get_text_words(
    page: pymupdf.Page,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
    delimiters=None,
    tolerance=3,
) -> list:
    """Return the text words as a list with the bbox for each word.

    Args:
        page: pymupdf.Page
        clip: (rect-like) area on page to consider
        flags: (int) control the amount of data parsed into the textpage.
            Defaults to pymupdf.TEXTFLAGS_WORDS.
        textpage: (pymupdf.TextPage) either passed-in or None.
        sort: (bool) sort the words in reading sequence.
        delimiters: (str,list) characters to use as word delimiters.
        tolerance: (float) consider words to be part of the same line if
            top or bottom coordinate are not larger than this. Relevant
            only if sort=True.

    Returns:
        Word tuples (x0, y0, x1, y1, "word", bno, lno, wno).

    Raises:
        ValueError: if 'textpage' is not a textpage of 'page'.
    """

    def sort_words(words):
        """Sort words line-wise, forgiving small deviations."""
        # coarse order first: bottom coordinate, then left coordinate
        words.sort(key=lambda item: (item[3], item[0]))
        ordered = []  # final word list
        current = [words[0]]  # words collected for the current line
        current_rect = pymupdf.Rect(words[0][:4])  # bbox of the current line
        for word in words[1:]:
            word_rect = pymupdf.Rect(word[:4])
            if (
                abs(word_rect.y0 - current_rect.y0) <= tolerance
                or abs(word_rect.y1 - current_rect.y1) <= tolerance
            ):
                # same line: collect the word and grow the line bbox
                current.append(word)
                current_rect |= word_rect
                continue
            # the line is complete: order it left to right and emit it
            current.sort(key=lambda item: item[0])
            ordered.extend(current)
            current = [word]
            current_rect = word_rect
        # emit the last, still pending line
        current.sort(key=lambda item: item[0])
        ordered.extend(current)
        return ordered

    pymupdf.CheckParent(page)
    if flags is None:
        flags = pymupdf.TEXTFLAGS_WORDS

    made_here = textpage is None
    if made_here:
        tp = page.get_textpage(clip=clip, flags=flags)
    else:
        tp = textpage
        if tp.parent != page:
            raise ValueError("not a textpage of this page")

    words = tp.extractWORDS(delimiters)

    # A passed-in textpage may cover more than 'clip': keep only words
    # having at least half of their bbox area inside clip.
    if not made_here and clip is not None:
        clip = pymupdf.Rect(clip)
        words = [
            word
            for word in words
            if abs(clip & word[:4]) >= 0.5 * abs(pymupdf.Rect(word[:4]))
        ]

    if made_here:
        del tp  # drop the temporary textpage

    if words and sort:
        # advanced sort if any words found
        words = sort_words(words)
    return words
def get_sorted_text(
    page: pymupdf.Page,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    tolerance=3,
) -> str:
    """Extract plain text avoiding unacceptable line breaks.

    Text contained in clip will be sorted in reading sequence. Some effort
    is also spent to simulate layout vertically and horizontally.

    Args:
        page: pymupdf.Page
        clip: (rect-like) only consider text inside
        flags: (int) text extraction flags
        textpage: pymupdf.TextPage
        tolerance: (float) consider words to be on the same line if their top
            or bottom coordinates do not differ more than this.

    Notes:
        If a TextPage is provided, all text is checked for being inside clip
        with at least 50% of its bbox.
        This allows to use some "global" TextPage in conjunction with sub-
        selecting words in parts of the defined TextPage rectangle.

    Returns:
        A text string in reading sequence. Left indentation of each line,
        inter-line and inter-word distances strive to reflect the layout.
        An empty string if no words are found.
    """

    def line_text(clip, line):
        """Create the string of one text line.

        We are trying to simulate some horizontal layout here, too.

        Args:
            clip: (pymupdf.Rect) the area from which all text is being read.
            line: (list) word tuples (rect, text) contained in the line

        Returns:
            Text in this line. Generated from words in 'line'. Distance from
            predecessor is translated to multiple spaces, thus simulating
            text indentations and large horizontal distances.
        """
        line.sort(key=lambda w: w[0].x0)
        parts = []  # pieces of the line text
        x1 = clip.x0  # end coordinate of the text produced so far
        for r, t in line:
            # Convert the distance to the previous word to a number of
            # spaces, using the average character width of this word.
            # Words following another word get at least one space, unless
            # they start left of the predecessor's end.
            dist = max(
                int(round((r.x0 - x1) / r.width * len(t))),
                0 if (x1 == clip.x0 or r.x0 <= x1) else 1,
            )
            parts.append(" " * dist + t)
            x1 = r.x1  # new end position
        return "".join(parts)

    # Extract words in correct sequence first.
    words = [
        (pymupdf.Rect(w[:4]), w[4])
        for w in get_text_words(
            page,
            clip=clip,
            flags=flags,
            textpage=textpage,
            sort=True,
            tolerance=tolerance,
        )
    ]

    if not words:  # no text present
        return ""

    totalbox = pymupdf.EMPTY_RECT()  # area covering all text
    for wr, _ in words:
        totalbox |= wr

    lines = []  # list of reconstituted lines as (rect, text)
    line = [words[0]]  # current line
    lrect = words[0][0]  # the line's rectangle

    # walk through the words, starting with the second one
    for wr, text in words[1:]:
        # if this word matches top or bottom of the line, append it
        if abs(lrect.y0 - wr.y0) <= tolerance or abs(lrect.y1 - wr.y1) <= tolerance:
            line.append((wr, text))
            lrect |= wr
        else:
            # output current line and re-initialize
            lines.append((lrect, line_text(totalbox, line)))
            line = [(wr, text)]
            lrect = wr

    # also append unfinished last line
    lines.append((lrect, line_text(totalbox, line)))

    # sort all lines vertically
    lines.sort(key=lambda item: item[0].y1)

    pieces = [lines[0][1]]  # text of first line
    y1 = lines[0][0].y1  # its bottom coordinate
    for lrect, ltext in lines[1:]:
        # Vertical gap in units of this line's height, at most 5 extra
        # breaks. Vertically overlapping lines give a negative value, which
        # would produce no line break at all and glue both lines together,
        # so it is clamped to zero.
        distance = max(min(int(round((lrect.y0 - y1) / lrect.height)), 5), 0)
        pieces.append("\n" * (distance + 1) + ltext)
        y1 = lrect.y1

    # return text in clip
    return "".join(pieces)
def get_textbox(
    page: pymupdf.Page,
    rect: rect_like,
    textpage: pymupdf.TextPage = None,
) -> str:
    """Return the result of TextPage.extractTextbox() for a rectangle.

    Args:
        page: pymupdf.Page
        rect: (rect-like) passed to extractTextbox().
        textpage: (pymupdf.TextPage) reuse this textpage; if None, a
            temporary one is made with page.get_textpage().

    Returns:
        Whatever extractTextbox(rect) returns.

    Raises:
        ValueError: if 'textpage' is not a textpage of 'page'.
    """
    made_here = textpage is None
    if made_here:
        tp = page.get_textpage()
    else:
        tp = textpage
        if tp.parent != page:
            raise ValueError("not a textpage of this page")

    result = tp.extractTextbox(rect)
    if made_here:
        del tp  # drop the temporary textpage
    return result
def get_text_selection(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    clip: rect_like = None,
    textpage: pymupdf.TextPage = None,
):
    """Return the result of TextPage.extractSelection() for two points.

    Args:
        page: pymupdf.Page
        p1: (point-like) first argument of extractSelection().
        p2: (point-like) second argument of extractSelection().
        clip: (rect-like) only used when a temporary textpage is made.
        textpage: (pymupdf.TextPage) reuse this textpage; if None, a
            temporary one is made with flags pymupdf.TEXT_DEHYPHENATE.

    Returns:
        Whatever extractSelection(p1, p2) returns.

    Raises:
        ValueError: if 'textpage' is not a textpage of 'page'.
    """
    pymupdf.CheckParent(page)

    made_here = textpage is None
    if made_here:
        tp = page.get_textpage(clip=clip, flags=pymupdf.TEXT_DEHYPHENATE)
    else:
        tp = textpage
        if tp.parent != page:
            raise ValueError("not a textpage of this page")

    result = tp.extractSelection(p1, p2)
    if made_here:
        del tp  # drop the temporary textpage
    return result
def get_textpage_ocr(
    page: pymupdf.Page,
    flags: int = 0,
    language: str = "eng",
    dpi: int = 72,
    full: bool = False,
    tessdata: str = None,
) -> pymupdf.TextPage:
    """Create a Textpage from combined results of normal and OCR text parsing.

    Args:
        page: pymupdf.Page
        flags: (int) control content becoming part of the result.
        language: (str) specify expected language(s). Default is "eng" (English).
        dpi: (int) resolution in dpi, default 72.
        full: (bool) whether to OCR the full page image, or only its images (default)
        tessdata: (str) resolved via pymupdf.get_tessdata() and handed to
            the OCR calls.

    Returns:
        A pymupdf.TextPage. If OCR of an individual image fails with
        RuntimeError or a MuPDF error, the result of a full page OCR is
        returned instead.
    """
    pymupdf.CheckParent(page)
    tessdata = pymupdf.get_tessdata(tessdata)

    def full_ocr(page, dpi, language, flags):
        """OCR the pixmap of the whole page and return its textpage."""
        zoom = dpi / 72
        mat = pymupdf.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)
        ocr_pdf = pymupdf.Document(
            "pdf",
            pix.pdfocr_tobytes(
                compress=False,
                language=language,
                tessdata=tessdata,
            ),
        )
        try:
            ocr_page = ocr_pdf.load_page(0)
            # scale the OCR page back to the size of the original page
            unzoom = page.rect.width / ocr_page.rect.width
            ctm = pymupdf.Matrix(unzoom, unzoom) * page.derotation_matrix
            tpage = ocr_page.get_textpage(flags=flags, matrix=ctm)
        finally:
            # close the temporary PDF also if something went wrong
            ocr_pdf.close()
        pix = None
        # make the textpage look like one of the original page
        tpage.parent = weakref.proxy(page)
        return tpage

    # if OCR for the full page, OCR its pixmap @ desired dpi
    if full:
        return full_ocr(page, dpi, language, flags)

    # For partial OCR, make a normal textpage, then extend it with text that
    # is OCRed from each image.
    # Because of this, we need the images flag bit set ON.
    tpage = page.get_textpage(flags=flags)
    for block in page.get_text("dict", flags=pymupdf.TEXT_PRESERVE_IMAGES)["blocks"]:
        if block["type"] != 1:  # only look at images
            continue
        bbox = pymupdf.Rect(block["bbox"])
        if bbox.width <= 3 or bbox.height <= 3:  # ignore tiny stuff
            continue
        try:
            pix = pymupdf.Pixmap(block["image"])  # get image pixmap
            if pix.n - pix.alpha != 3:  # we need to convert this to RGB!
                pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
            if pix.alpha:  # must remove alpha channel
                pix = pymupdf.Pixmap(pix, 0)
            imgdoc = pymupdf.Document(
                "pdf",
                pix.pdfocr_tobytes(language=language, tessdata=tessdata),
            )  # pdf with OCRed page
            try:
                imgpage = imgdoc.load_page(0)  # read image as a page
                pix = None
                # compute matrix to transform coordinates back to that of 'page'
                imgrect = imgpage.rect  # page size of image PDF
                shrink = pymupdf.Matrix(1 / imgrect.width, 1 / imgrect.height)
                mat = shrink * block["transform"]
                imgpage.extend_textpage(tpage, flags=0, matrix=mat)
            finally:
                # close the temporary PDF also when falling back below
                imgdoc.close()
        except (RuntimeError, mupdf.FzErrorBase):
            if 0 and g_exceptions_verbose:
                # Don't show exception info here because it can happen in
                # normal operation (see test_3842b).
                pymupdf.exception_info()
            tpage = None
            pymupdf.message("Falling back to full page OCR")
            return full_ocr(page, dpi, language, flags)

    return tpage
def get_text(
    page: pymupdf.Page,
    option: str = "text",
    *,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
    delimiters=None,
    tolerance=3,
):
    """Extract text from a page or an annotation.

    This is a unifying wrapper for various methods of the pymupdf.TextPage class.

    Args:
        option: (str) text, words, blocks, html, dict, json, rawdict,
            rawjson, xhtml or xml. Case is ignored.
        clip: (rect-like) restrict output to this area. Replaced by the
            page's cropbox for html, xml and xhtml.
        flags: bit switches to e.g. exclude images or decompose ligatures.
            Defaults to the TEXTFLAGS_* value belonging to 'option'.
        textpage: reuse this pymupdf.TextPage and make no new one. If
            specified, 'flags' and 'clip' are not used to make a textpage.
        sort: (bool) passed on to the selected extraction method.
        delimiters: passed to get_text_words() for option "words".
        tolerance: (float) passed to get_text_words() for option "words"
            and to get_sorted_text() for option "text" with sort=True.

    Returns:
        the output of methods get_text_words / get_text_blocks or pymupdf.TextPage
        methods extractText, extractHTML, extractDICT, extractJSON, extractRAWDICT,
        extractRAWJSON, extractXHTML or extractXML respectively.

    Raises:
        ValueError: if 'textpage' is not a textpage of 'page'.
    """
    formats = {
        "text": pymupdf.TEXTFLAGS_TEXT,
        "html": pymupdf.TEXTFLAGS_HTML,
        "json": pymupdf.TEXTFLAGS_DICT,
        "rawjson": pymupdf.TEXTFLAGS_RAWDICT,
        "xml": pymupdf.TEXTFLAGS_XML,
        "xhtml": pymupdf.TEXTFLAGS_XHTML,
        "dict": pymupdf.TEXTFLAGS_DICT,
        "rawdict": pymupdf.TEXTFLAGS_RAWDICT,
        "words": pymupdf.TEXTFLAGS_WORDS,
        "blocks": pymupdf.TEXTFLAGS_BLOCKS,
    }
    option = option.lower()
    # NOTE(review): with assertions enabled an unknown option raises
    # AssertionError here; the fallback to "text" below only takes effect
    # when Python runs with -O. Confirm which behaviour is intended.
    assert option in formats
    if option not in formats:
        option = "text"
    if flags is None:
        flags = formats[option]

    if option == "words":
        return get_text_words(
            page,
            clip=clip,
            flags=flags,
            textpage=textpage,
            sort=sort,
            delimiters=delimiters,
            tolerance=tolerance,  # was silently ignored before
        )
    if option == "blocks":
        return get_text_blocks(
            page, clip=clip, flags=flags, textpage=textpage, sort=sort
        )

    if option == "text" and sort:
        return get_sorted_text(
            page,
            clip=clip,
            flags=flags,
            textpage=textpage,
            tolerance=tolerance,
        )

    pymupdf.CheckParent(page)
    cb = None
    if option in ("html", "xml", "xhtml"):  # no clipping for MuPDF functions
        clip = page.cropbox
    if clip is not None:
        clip = pymupdf.Rect(clip)
        cb = None
    elif type(page) is pymupdf.Page:
        cb = page.cropbox

    # pymupdf.TextPage with or without images
    tp = textpage
    if tp is None:
        tp = page.get_textpage(clip=clip, flags=flags)
    elif getattr(tp, "parent") != page:
        raise ValueError("not a textpage of this page")

    if option == "json":
        t = tp.extractJSON(cb=cb, sort=sort)
    elif option == "rawjson":
        t = tp.extractRAWJSON(cb=cb, sort=sort)
    elif option == "dict":
        t = tp.extractDICT(cb=cb, sort=sort)
    elif option == "rawdict":
        t = tp.extractRAWDICT(cb=cb, sort=sort)
    elif option == "html":
        t = tp.extractHTML()
    elif option == "xml":
        t = tp.extractXML()
    elif option == "xhtml":
        t = tp.extractXHTML()
    else:
        t = tp.extractText(sort=sort)

    if textpage is None:
        del tp  # drop the temporary textpage
    return t
def getLinkDict(ln, document=None) -> dict:
    """Convert a link or outline item to a Python dict.

    Args:
        ln: a pymupdf.Outline or pymupdf.Link object.
        document: passed to ln.destination() for outline items.

    Returns:
        A dict with keys "kind" and "xref" (always 0 here), "from" if 'ln'
        has a usable 'rect', plus keys depending on the destination kind
        ("uri", "page", "to", "zoom", "file" or the items of dest.named).
    """
    if isinstance(ln, pymupdf.Outline):
        dest = ln.destination(document)
    elif isinstance(ln, pymupdf.Link):
        dest = ln.dest
    else:
        assert 0, f'Unexpected {type(ln)=}.'

    kind = dest.kind
    nl = {"kind": kind, "xref": 0}

    # best effort: not every object provides a usable rectangle
    try:
        if hasattr(ln, 'rect'):
            nl["from"] = ln.rect
    except Exception:
        # This seems to happen quite often in PyMuPDF/tests.
        if g_exceptions_verbose >= 2:
            pymupdf.exception_info()

    # target point: only take over coordinates that are flagged valid
    pnt = pymupdf.Point(0, 0)
    if dest.flags & pymupdf.LINK_FLAG_L_VALID:
        pnt.x = dest.lt.x
    if dest.flags & pymupdf.LINK_FLAG_T_VALID:
        pnt.y = dest.lt.y

    if kind == pymupdf.LINK_URI:
        nl["uri"] = dest.uri

    elif kind == pymupdf.LINK_GOTO:
        nl["page"] = dest.page
        nl["to"] = pnt
        nl["zoom"] = dest.rb.x if dest.flags & pymupdf.LINK_FLAG_R_IS_ZOOM else 0.0

    elif kind == pymupdf.LINK_GOTOR:
        nl["file"] = dest.file_spec.replace("\\", "/")
        nl["page"] = dest.page
        if dest.page < 0:
            nl["to"] = dest.dest
        else:
            nl["to"] = pnt
            nl["zoom"] = (
                dest.rb.x if dest.flags & pymupdf.LINK_FLAG_R_IS_ZOOM else 0.0
            )

    elif kind == pymupdf.LINK_LAUNCH:
        nl["file"] = dest.file_spec.replace("\\", "/")

    elif kind == pymupdf.LINK_NAMED:
        # The dicts should not have same key(s).
        assert not (dest.named.keys() & nl.keys())
        nl.update(dest.named)
        if 'to' in nl:
            nl['to'] = pymupdf.Point(nl['to'])

    else:
        nl["page"] = dest.page

    return nl
def getDestStr(xref: int, ddict: dict) -> str:
    """Calculate the PDF action string.

    Notes:
        Supports Link annotations and outline items (bookmarks).

    Args:
        xref: xref inserted as target page reference of "GoTo" actions.
        ddict: destination dict with a "kind" key, or a plain int / float
            which is used as the top coordinate of a "GoTo" action.

    Returns:
        The "/A<<...>>" action string. An empty string if 'ddict' is
        falsy, has no kind / kind LINK_NONE, or an unsupported kind.
    """
    if not ddict:
        return ""

    if type(ddict) in (int, float):
        # a bare number: go to this height on the page, left and zoom are 0
        return f"/A<</S/GoTo/D[{xref} 0 R/XYZ {_format_g((0, ddict, 0))}]>>"

    kind = ddict.get("kind", pymupdf.LINK_NONE)
    if kind == pymupdf.LINK_NONE:
        return ""

    if kind == pymupdf.LINK_GOTO:
        zoom = ddict.get("zoom", 0)
        left, top = ddict.get("to", pymupdf.Point(0, 0))
        return f"/A<</S/GoTo/D[{xref} 0 R/XYZ {_format_g((left, top, zoom))}]>>"

    if kind == pymupdf.LINK_URI:
        uri = pymupdf.get_pdf_str(ddict["uri"])
        return f"/A<</S/URI/URI{uri}>>"

    if kind == pymupdf.LINK_LAUNCH:
        fspec = pymupdf.get_pdf_str(ddict["file"])
        return f"/A<</S/Launch/F<</F{fspec}/UF{fspec}/Type/Filespec>>>>"

    if kind == pymupdf.LINK_GOTOR and ddict["page"] < 0:
        # negative page: "to" is written as a PDF string
        fspec = pymupdf.get_pdf_str(ddict["file"])
        target = pymupdf.get_pdf_str(ddict["to"])
        return f"/A<</S/GoToR/D{target}/F<</F{fspec}/UF{fspec}/Type/Filespec>>>>"

    if kind == pymupdf.LINK_GOTOR and ddict["page"] >= 0:
        fspec = pymupdf.get_pdf_str(ddict["file"])
        page_no = ddict["page"]
        left = ddict["to"].x
        top = ddict["to"].y
        zoom = ddict["zoom"]
        return (
            f"/A<</S/GoToR/D[{page_no} /XYZ {_format_g((left, top, zoom))}]"
            f"/F<</F{fspec}/UF{fspec}/Type/Filespec>>>>"
        )

    return ""
def getLinkText(page: pymupdf.Page, lnk: dict) -> str:
    """Build the PDF source of a link annotation from a link dict.

    Args:
        page: the page owning the link.
        lnk: link dict with keys "from" and "kind" plus the keys needed
            by that kind ("page", "to", "zoom", "file", "uri", "name" or
            "nameddest"); optionally "id" and "xref".

    Returns:
        The annotation text made from the matching pymupdf.annot_skel
        entry with a /NM entry added, or an empty string if the kind is
        not supported.
    """
    # link rectangle, transformed with the inverse page transformation
    inverse_ctm = ~page.transformation_matrix
    rect = _format_g(tuple(lnk["from"] * inverse_ctm))

    kind = lnk["kind"]
    annot = ""

    if kind == pymupdf.LINK_GOTO:
        if lnk["page"] >= 0:
            make = pymupdf.annot_skel["goto1"]  # annot_goto
            pno = lnk["page"]
            xref = page.parent.page_xref(pno)
            pnt = lnk.get("to", pymupdf.Point(0, 0))  # destination point
            # the target point is transformed with the matrix of the
            # destination page
            dest_page = page.parent[pno]
            ipnt = pnt * ~dest_page.transformation_matrix
            annot = make(xref, ipnt.x, ipnt.y, lnk.get("zoom", 0), rect)
        else:
            make = pymupdf.annot_skel["goto2"]  # annot_goto_n
            annot = make(pymupdf.get_pdf_str(lnk["to"]), rect)

    elif kind == pymupdf.LINK_GOTOR:
        if lnk["page"] >= 0:
            make = pymupdf.annot_skel["gotor1"]  # annot_gotor
            pnt = lnk.get("to", pymupdf.Point(0, 0))  # destination point
            if type(pnt) is not pymupdf.Point:
                pnt = pymupdf.Point(0, 0)
            annot = make(
                lnk["page"],
                pnt.x,
                pnt.y,
                lnk.get("zoom", 0),
                lnk["file"],
                lnk["file"],
                rect,
            )
        else:
            make = pymupdf.annot_skel["gotor2"]  # annot_gotor_n
            annot = make(pymupdf.get_pdf_str(lnk["to"]), lnk["file"], rect)

    elif kind == pymupdf.LINK_LAUNCH:
        make = pymupdf.annot_skel["launch"]  # annot_launch
        annot = make(lnk["file"], lnk["file"], rect)

    elif kind == pymupdf.LINK_URI:
        make = pymupdf.annot_skel["uri"]  # annot_uri
        annot = make(lnk["uri"], rect)

    elif kind == pymupdf.LINK_NAMED:
        make = pymupdf.annot_skel["named"]  # annot_named
        lname = lnk.get("name")  # check presence of key
        if lname is None:  # if missing, fall back to alternative
            lname = lnk["nameddest"]
        annot = make(lname, rect)

    if not annot:
        return annot

    # add a /NM PDF key to the object definition
    # existing link annotations of the page: item 0 of each entry -> item 2
    link_names = {
        entry[0]: entry[2]
        for entry in page.annot_xrefs()
        if entry[1] == pymupdf.PDF_ANNOT_LINK  # pylint: disable=no-member
    }

    old_name = lnk.get("id", "")  # id value in the argument
    if old_name and (lnk["xref"], old_name) in link_names.items():
        name = old_name  # no new name if this is an update only
    else:
        # find the first "<stem>-L<i>" not yet used on this page
        stem = pymupdf.TOOLS.set_annot_stem() + "-L%i"
        i = 0
        name = stem % i
        while name in link_names.values():
            i += 1
            name = stem % i

    # add /NM key to object definition
    return annot.replace("/Link", "/Link/NM(%s)" % name)
- # ----------------------------------------------------------------------
- # Name: wx.lib.colourdb.py
- # Purpose: Adds a bunch of colour names and RGB values to the
- # colour database so they can be found by name
- #
- # Author: Robin Dunn
- #
- # Created: 13-March-2001
- # Copyright: (c) 2001-2017 by Total Control Software
- # Licence: wxWindows license
- # Tags: phoenix-port, unittest, documented
- # ----------------------------------------------------------------------
def getColorList() -> list:
    """
    Returns a list of upper-case colour names.

    The names are the first items of the entries of
    pymupdf.colors_wx_list().

    :rtype: list of strings
    """
    names = []
    for colour_name, _red, _green, _blue in pymupdf.colors_wx_list():
        names.append(colour_name)
    return names
def getColorInfoList() -> list:
    """
    Returns list of (name, red, green, blue) tuples, where:
        name: upper-case color name.
        red, green, blue: integers in range 0..255.

    :rtype: list of tuples
    """
    colour_table = pymupdf.colors_wx_list()
    return colour_table
def getColor(name: str) -> tuple:
    """Retrieve RGB color in PDF format by name.

    The lookup uses the lower-case form of 'name'.

    Returns:
        a triple of floats in range 0 to 1. In case of name-not-found, "white" is returned.
    """
    white = (1, 1, 1)
    pdf_colors = pymupdf.colors_pdf_dict()
    return pdf_colors.get(name.lower(), white)
def getColorHSV(name: str) -> tuple:
    """Retrieve the hue, saturation, value triple of a color name.

    The lookup uses the upper-case form of 'name'.

    Returns:
        a triple (degree, percent, percent). If not found (-1, -1, -1) is returned.
    """
    try:
        entry = getColorInfoList()[getColorList().index(name.upper())]
    except Exception:
        if g_exceptions_verbose:
            pymupdf.exception_info()
        return (-1, -1, -1)

    # scale the 0..255 components to 0..1
    red = entry[1] / 255.0
    green = entry[2] / 255.0
    blue = entry[3] / 255.0

    highest = max(red, green, blue)
    lowest = min(red, green, blue)
    spread = highest - lowest

    # hue depends on which component is the largest one
    if spread == 0:
        hue = 0
    elif highest == red:
        hue = 60.0 * (((green - blue) / spread) % 6)
    elif highest == green:
        hue = 60.0 * (((blue - red) / spread) + 2)
    else:
        hue = 60.0 * (((red - green) / spread) + 4)

    saturation = 0 if highest == 0 else spread / highest

    H = int(round(hue))
    S = int(round(saturation * 100))
    V = round(highest * 100, 1)
    return (H, S, V)
def _get_font_properties(doc: pymupdf.Document, xref: int) -> tuple:
    """Return name, extension, type, ascender and descender of a font.

    Args:
        doc: the document containing the font.
        xref: passed to doc.extract_font().

    Returns:
        (fontname, ext, stype, asc, dsc). Ascender / descender start out
        as 0.8 / -0.2 and are replaced by the values of a pymupdf.Font made
        from the font buffer or, without a buffer, from the font name.
    """
    fontname, ext, stype, buffer = doc.extract_font(xref)
    asc, dsc = 0.8, -0.2

    if ext == "":
        # nothing more to look at: return the defaults unchanged
        return fontname, ext, stype, asc, dsc

    if buffer:
        try:
            font = pymupdf.Font(fontbuffer=buffer)
            asc = font.ascender
            dsc = font.descender
            bbox = font.bbox
            if asc - dsc < 1:
                # ascender and descender span less than 1: adjust them
                if bbox.y0 < dsc:
                    dsc = bbox.y0
                asc = 1 - dsc
        except Exception:
            pymupdf.exception_info()
            asc *= 1.2
            dsc *= 1.2
        return fontname, ext, stype, asc, dsc

    if ext == "n/a":
        asc *= 1.2
        dsc *= 1.2
        return fontname, ext, stype, asc, dsc

    # no buffer, but an extension other than "n/a": try the font name
    try:
        font = pymupdf.Font(fontname)
        asc = font.ascender
        dsc = font.descender
    except Exception:
        pymupdf.exception_info()
        asc *= 1.2
        dsc *= 1.2
    return fontname, ext, stype, asc, dsc
- def _show_fz_text( text):
- #if mupdf_cppyy:
- # assert isinstance( text, cppyy.gbl.mupdf.Text)
- #else:
- # assert isinstance( text, mupdf.Text)
- num_spans = 0
- num_chars = 0
- span = text.m_internal.head
- while 1:
- if not span:
- break
- num_spans += 1
- num_chars += span.len
- span = span.next
- return f'num_spans={num_spans} num_chars={num_chars}'
- """
- Handle page labels for PDF documents.
- Reading
- -------
- * compute the label of a page
- * find page number(s) having the given label.
- Writing
- -------
- Supports setting (defining) page labels for PDF documents.
- A big Thank You goes to WILLIAM CHAPMAN who contributed the idea and
- significant parts of the following code during late December 2020
- through early January 2021.
- """
def rule_dict(item):
    """Make a Python dict from a PDF page label rule.

    Args:
        item -- a tuple (pno, rule) with the start page number and the rule
                string like <</S/D...>>.
    Returns:
        A dict like
        {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}.
        Key 'style' is only present if the rule contains an /S entry.
    """
    # Jorj McKie, 2021-01-06
    start_page, raw_rule = item
    # strip "<<" and ">>", then split into the parts following each "/"
    tokens = raw_rule[2:-2].split("/")[1:]
    result = {"startpage": start_page, "prefix": "", "firstpagenum": 1}

    pos = 0
    while pos < len(tokens):
        token = tokens[pos]
        if token == "S":  # style specification
            result["style"] = tokens[pos + 1]  # next token is the style
            pos += 2  # the style token must not be looked at again
            continue
        if token.startswith("P"):  # prefix: keep the text without parentheses
            result["prefix"] = token[1:].replace("(", "").replace(")", "")
        elif token.startswith("St"):  # start page number specification
            result["firstpagenum"] = int(token[2:])
        pos += 1

    return result
def get_label_pno(pgNo, labels):
    """Return the label for this page number.

    Args:
        pgNo: page number, 0-based.
        labels: result of doc._get_page_labels().
    Returns:
        The label (str) of the page number.
    Raises:
        IndexError: if no item of 'labels' starts at or before 'pgNo'.
    """
    # Jorj McKie, 2021-01-06
    # the applicable rule is the last one starting at or before this page
    candidates = [entry for entry in labels if entry[0] <= pgNo]
    rule = rule_dict(candidates[-1])

    style = rule.get("style", "")
    prefix = rule.get("prefix", "")

    offset = pgNo - rule["startpage"] + rule["firstpagenum"]
    if style in ("a", "A"):
        # make sure we start at 0 when enumerating the alphabet
        offset -= 1
    return construct_label(style, prefix, offset)
def construct_label(style, prefix, pno) -> str:
    """Construct a label based on style, prefix and page number.

    Args:
        style: "D" (decimal), "r" / "R" (lower / upper case roman) or
            "a" / "A" (lower / upper case letters). Any other value
            yields the prefix only.
        prefix: string put in front of the number part.
        pno: the number to convert.
    Returns:
        The prefix followed by the converted number.
    """
    # William Chapman, 2021-01-06
    if style == "D":
        numeral = str(pno)
    elif style in ("r", "R"):
        roman = integerToRoman(pno)
        numeral = roman.lower() if style == "r" else roman.upper()
    elif style in ("a", "A"):
        letters = integerToLetter(pno)
        numeral = letters.lower() if style == "a" else letters.upper()
    else:
        numeral = ""
    return prefix + numeral
def integerToLetter(i) -> str:
    """Returns letter sequence string for integer i.

    0 gives "A", 25 gives "Z", 26 gives "AA", 27 gives "AB" and so on.
    """
    # William Chapman, Jorj McKie, 2021-01-06
    import string

    alphabet = string.ascii_uppercase

    # determine the number of letters and the rest within that length
    width = 1
    rest = i
    while pow(26, width) <= rest:
        rest -= int(math.pow(26, width))
        width += 1

    # write 'rest' with 'width' base-26 digits, highest digit first
    letters = []
    for exponent in range(width - 1, -1, -1):
        digit, rest = divmod(rest, int(math.pow(26, exponent)))
        letters.append(alphabet[digit])
    return "".join(letters)
def integerToRoman(num: int) -> str:
    """Return roman numeral for an integer.

    The result is in upper case. 0 gives an empty string.
    """
    # William Chapman, Jorj McKie, 2021-01-06
    numerals = (
        (1000, "M"),
        (900, "CM"),
        (500, "D"),
        (400, "CD"),
        (100, "C"),
        (90, "XC"),
        (50, "L"),
        (40, "XL"),
        (10, "X"),
        (9, "IX"),
        (5, "V"),
        (4, "IV"),
        (1, "I"),
    )

    pieces = []
    remaining = num
    for value, symbol in numerals:
        # take this symbol as often as its value fits
        count = remaining // value
        pieces.append(symbol * count)
        remaining -= value * count
        if remaining <= 0:
            break
    return "".join(pieces)
- # -------------------------------------------------------------------
- # Functions to recover the quad contained in a text extraction bbox
- # -------------------------------------------------------------------
def recover_bbox_quad(line_dir: tuple, span: dict, bbox: tuple) -> pymupdf.Quad:
    """Compute the quad located inside the bbox.

    The bbox may be any of the resp. tuples occurring inside the given span.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line or None.
            If None, span["dir"] is used.
        span: (dict) the span. May be from get_texttrace() method.
        bbox: (tuple) the bbox of the span or any of its characters.

    Returns:
        The quad which is wrapped by the bbox.
    """
    if line_dir is None:
        line_dir = span["dir"]
    cos, sin = line_dir
    box = pymupdf.Rect(bbox)  # make it a rect

    # with small glyph heights, the fontsize alone is the height
    small = pymupdf.TOOLS.set_small_glyph_heights()
    factor = 1 if small else span["ascender"] - span["descender"]
    height = factor * span["size"]  # the quad's rectangle height

    # The following are distances from the bbox corners, at which we find the
    # respective quad points. The computation depends on in which quadrant the
    # text writing angle is located.
    h_sin = height * sin
    h_cos = height * cos

    if h_cos >= 0 and h_sin <= 0:  # quadrant 1
        ul = box.bl - (0, h_cos)
        ur = box.tr + (h_sin, 0)
        ll = box.bl - (h_sin, 0)
        lr = box.tr + (0, h_cos)
    elif h_cos <= 0 and h_sin <= 0:  # quadrant 2
        ul = box.br + (h_sin, 0)
        ur = box.tl - (0, h_cos)
        ll = box.br + (0, h_cos)
        lr = box.tl - (h_sin, 0)
    elif h_cos <= 0 and h_sin >= 0:  # quadrant 3
        ul = box.tr - (0, h_cos)
        ur = box.bl + (h_sin, 0)
        ll = box.tr - (h_sin, 0)
        lr = box.bl + (0, h_cos)
    else:  # quadrant 4
        ul = box.tl + (h_sin, 0)
        ur = box.br - (0, h_cos)
        ll = box.tl + (0, h_cos)
        lr = box.br - (h_sin, 0)

    return pymupdf.Quad(ul, ur, ll, lr)
def recover_quad(line_dir: tuple, span: dict) -> pymupdf.Quad:
    """Recover the quadrilateral of a text span.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line. Must be a
            tuple with exactly two items.
        span: (dict) the span. Its "bbox" item is used.
    Returns:
        The quadrilateral enveloping the span's text.
    Raises:
        ValueError: if 'line_dir' is not a 2-tuple or 'span' is not a dict.
    """
    # isinstance instead of an exact type match, so that tuple / dict
    # subclasses (e.g. named tuples) are accepted as well
    if not isinstance(line_dir, tuple) or len(line_dir) != 2:
        raise ValueError("bad line dir argument")
    if not isinstance(span, dict):
        raise ValueError("bad span argument")
    return recover_bbox_quad(line_dir, span, span["bbox"])
def recover_line_quad(line: dict, spans: list = None) -> pymupdf.Quad:
    """Calculate the line quad for 'dict' / 'rawdict' text extractions.

    The lower quad points are taken from the first and the last span quad.
    The height is the largest height among the considered spans. A rect
    with its bottom-left corner in (0, 0) is built from these values,
    converted to a quad and then mapped back onto the text.

    Args:
        line: (dict) the line; its "dir" item and, if 'spans' is None,
            its "spans" item are used.
        spans: (list, optional) sub-list of spans to consider.
    Returns:
        pymupdf.Quad covering selected spans.
    Raises:
        ValueError: if the list of spans to consider is empty.
    """
    selected = line["spans"] if spans is None else spans  # default: all spans
    if len(selected) == 0:
        raise ValueError("bad span list")

    direction = line["dir"]  # text direction
    # The values are not needed here, but the unpacking fails early for a
    # direction that is not a pair.
    _cos, _sin = direction

    first_quad = recover_quad(direction, selected[0])
    # with a single span, first and last quad are the same object
    last_quad = (
        recover_quad(direction, selected[-1]) if len(selected) > 1 else first_quad
    )

    base_start = first_quad.ll  # lower-left of the line quad
    base_end = last_quad.lr  # lower-right of the line quad

    # matrix mapping the base line to the x-axis, base_start going to (0, 0)
    flatten = pymupdf.planish_line(base_start, base_end)
    flat_end = base_end * flatten

    small = pymupdf.TOOLS.set_small_glyph_heights()  # small glyph heights?
    tallest = max(
        s["size"] * (1 if small else (s["ascender"] - s["descender"]))
        for s in selected
    )

    flat_rect = pymupdf.Rect(0, -tallest, flat_end.x, 0)  # line rectangle
    result = flat_rect.quad
    result *= ~flatten  # rotate back and shift back
    return result
def recover_span_quad(line_dir: tuple, span: dict, chars: list = None) -> pymupdf.Quad:
    """Calculate the span quad for 'dict' / 'rawdict' text extractions.

    Notes:
        There are two execution paths:
        1. Without 'chars', the result of 'recover_quad' for the full span
           is returned.
        2. With a sub-list of characters, the quads of its first and last
           character define the base line of the result. This is only
           supported for the "rawdict" extraction option.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line. If None,
            'span["dir"]' is used instead.
        span: (dict) the span.
        chars: (list, optional) sub-list of characters to consider.
    Returns:
        pymupdf.Quad covering selected characters.
    Raises:
        ValueError: if 'chars' is given but the span has no "chars" item.
    """
    # a missing line direction means a span from get_texttrace()
    direction = span["dir"] if line_dir is None else line_dir

    if chars is None:  # no sub-selection: the full span quad
        return recover_quad(direction, span)

    if "chars" not in span.keys():
        raise ValueError("need 'rawdict' option to sub-select chars")

    first_quad = recover_char_quad(direction, span, chars[0])
    # with a single character, first and last quad are the same object
    last_quad = (
        recover_char_quad(direction, span, chars[-1]) if len(chars) > 1 else first_quad
    )

    base_start = first_quad.ll  # lower-left of the span quad
    base_end = last_quad.lr  # lower-right of the span quad

    # matrix mapping the base line to the x-axis, base_start going to (0, 0)
    flatten = pymupdf.planish_line(base_start, base_end)
    flat_end = base_end * flatten

    small = pymupdf.TOOLS.set_small_glyph_heights()  # small glyph heights?
    height = span["size"] * (1 if small else (span["ascender"] - span["descender"]))

    flat_rect = pymupdf.Rect(0, -height, flat_end.x, 0)  # span rectangle
    result = flat_rect.quad
    result *= ~flatten  # rotate back and shift back
    return result
def recover_char_quad(line_dir: tuple, span: dict, char: dict) -> pymupdf.Quad:
    """Recover the quadrilateral of a text character.

    This requires the "rawdict" option of text extraction.

    Args:
        line_dir: (tuple) 'line["dir"]' of the span's line. If None,
            'span["dir"]' is used instead.
        span: (dict) the span dict.
        char: (dict) the character dict; its "bbox" item is used. A tuple
            is accepted as well, in which case its item at index 3 is
            taken as the bbox (presumably a character as delivered by
            get_texttrace() -- TODO confirm).
    Returns:
        The quadrilateral enveloping the character.
    Raises:
        ValueError: if 'line_dir' is not a 2-tuple, 'span' is not a dict,
            or 'char' is neither a dict nor a tuple.
    """
    if line_dir is None:
        line_dir = span["dir"]
    # isinstance instead of an exact type match, so that subclasses
    # (e.g. named tuples) are accepted as well
    if not isinstance(line_dir, tuple) or len(line_dir) != 2:
        raise ValueError("bad line dir argument")
    if not isinstance(span, dict):
        raise ValueError("bad span argument")
    if isinstance(char, dict):
        bbox = pymupdf.Rect(char["bbox"])
    elif isinstance(char, tuple):
        bbox = pymupdf.Rect(char[3])
    else:
        # the offending argument here is the character, not the span
        raise ValueError("bad char argument")
    return recover_bbox_quad(line_dir, span, bbox)
|