| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149 |
- # -----------------------------------------------------------------------------
- # Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com
- # License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html
- # Part of "PyMuPDF", Python bindings for "MuPDF" (http://mupdf.com), a
- # lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is
- # maintained and developed by Artifex Software, Inc. https://artifex.com.
- # -----------------------------------------------------------------------------
- import argparse
- import bisect
- import os
- import sys
- import statistics
- from typing import Dict, List, Set
- from . import pymupdf
def mycenter(x):
    """Return *x*, surrounded by single blanks, centered in 75 columns of dashes."""
    label = " %s " % x
    return label.center(75, "-")
def recoverpix(doc, item):
    """Return the image belonging to one entry of a page image list.

    Args:
        doc: the document that contains the image.
        item: sequence whose first element is the xref of the image and
            whose second element is the xref of its /SMask (0 if none).

    Returns:
        The result of ``doc.extract_image()`` if there is no /SMask,
        otherwise a pixmap. A pixmap whose colorspace has 4 components
        is converted to RGB before it is returned.
    """
    img_xref, smask_xref = item[0], item[1]

    # no smask: use direct image output
    if smask_xref == 0:
        return doc.extract_image(img_xref)

    def getimage(pixmap):
        # only pixmaps with a 4-component colorspace are converted to RGB
        if pixmap.colorspace.n == 4:
            return pymupdf.Pixmap(pymupdf.csRGB, pixmap)
        return pixmap

    # the alpha channel must be reconstructed from the /SMask
    base = pymupdf.Pixmap(doc, img_xref)
    mask = pymupdf.Pixmap(doc, smask_xref)

    # Sanity check:
    # - both pixmaps must have the same rectangle
    # - both pixmaps must have alpha=0
    # - the mask must consist of 1 byte per pixel
    usable = (
        base.irect == mask.irect
        and base.alpha == mask.alpha == 0
        and mask.n == 1
    )
    if not usable:
        pymupdf.message(
            "Warning: unsupported /SMask %i for %i:" % (smask_xref, img_xref)
        )
        pymupdf.message(mask)
        del mask
        return getimage(base)  # hand back the image without transparency

    combined = pymupdf.Pixmap(base)  # copy of the image with an alpha channel
    combined.set_alpha(mask.samples)  # mask samples become the alpha values
    del base, mask  # release the temporary pixmaps
    return getimage(combined)
def open_file(filename, password, show=False, pdf=True):
    """Open and authenticate a document.

    Args:
        filename: path of the document to open.
        password: password to authenticate with, may be None / empty.
        show: if True, report whether authentication was as owner or user.
        pdf: if True, terminate unless the document is a PDF.

    Returns:
        The opened (and, where required, authenticated) document.

    Exits the program via ``sys.exit`` if the document is not a PDF although
    one is required, if a needed password is missing, or if authentication
    fails.
    """
    doc = pymupdf.open(filename)
    if not doc.is_pdf and pdf is True:
        sys.exit("this command supports PDF files only")
    if not doc.needs_pass:
        return doc
    if not password:
        sys.exit("'%s' requires a password" % doc.name)

    rc = doc.authenticate(password)
    if not rc:
        sys.exit("authentication unsuccessful")
    if show is True:
        # The conditional must be parenthesized: "%" binds tighter than
        # "if ... else", so without parentheses only the word "user" is shown.
        pymupdf.message("authenticated as %s" % ("owner" if rc > 2 else "user"))
    return doc
def print_dict(item):
    """Print a Python dictionary, one "key: value" line per entry.

    Keys are right-justified to a common width so the colons line up.
    An empty dictionary prints nothing (instead of raising in ``max``).
    """
    if not item:
        return
    width = max(len(k) for k in item) + 1
    for k, v in item.items():
        pymupdf.message("%s: %s" % (k.rjust(width), v))
def print_xref(doc, xref):
    """Print an object given by XREF number.

    Simulate the PDF source in "pretty" format.
    For a stream also print its size, or "unknown" if the size cannot be
    read directly from the object definition.
    """
    pymupdf.message("%i 0 obj" % xref)
    xref_str = doc.xref_object(xref)
    pymupdf.message(xref_str)
    if doc.xref_is_stream(xref):
        tokens = xref_str.split()
        size = "unknown"
        if "/Length" in tokens:
            idx = tokens.index("/Length") + 1
            following = tokens[idx : idx + 3]
            # The object source was split on whitespace, so an indirect
            # reference arrives as three tokens "<num> <gen> R". In that case
            # the first token is an xref number, not a byte count.
            is_reference = len(following) == 3 and following[2] == "R"
            if following and not is_reference:
                size = following[0]
        pymupdf.message("stream\n...%s bytes" % size)
        pymupdf.message("endstream")
    pymupdf.message("endobj")
def get_list(rlist, limit, what="page"):
    """Transform a page / xref specification into a list of integers.

    The specification is a comma-separated list of single numbers and
    ranges "a-b". The letter "N" stands for ``limit - 1`` and blanks are
    ignored. A range with a first number larger than the second one is
    delivered in descending order.

    Args
    ----
    rlist: (str) the specification
    limit: maximum number, i.e. number of pages, number of objects
    what: a string to be used in error messages

    Returns
    -------
    A list of integers representing the specification.
    Invalid specifications terminate the program via ``sys.exit``.
    """
    spec = rlist.replace("N", str(limit - 1)).replace(" ", "")
    numbers = []
    for position, token in enumerate(spec.split(","), start=1):
        # a single integer
        if token.isdecimal():
            value = int(token)
            if not 1 <= value < limit:
                sys.exit("bad %s specification at item %i" % (what, position))
            numbers.append(value)
            continue

        # anything else must be a range of exactly two integers
        try:
            first, second = token.split("-")
            first = int(first)
            second = int(second)
        except Exception:
            sys.exit("bad %s range specification at item %i" % (what, position))

        if first < 1 or first >= limit or second < 1 or second >= limit:
            sys.exit("bad %s range specification at item %i" % (what, position))

        # walk upwards or downwards, both ends included
        step = 1 if first <= second else -1
        numbers.extend(range(first, second + step, step))

    return numbers
def show(args):
    """Print a summary line for a PDF, followed by the requested details.

    Depending on the options in *args*, the PDF catalog, the metadata,
    selected objects, selected pages and the trailer are printed.
    """
    doc = open_file(args.input, args.password, True)

    # file size in KB, or in MB if larger than 1000 KB
    kbytes = os.path.getsize(args.input) / 1024
    if kbytes > 1000:
        size, flag = kbytes / 1024, "MB"
    else:
        size, flag = kbytes, "KB"
    size = round(size, 1)

    meta = doc.metadata  # pylint: disable=no-member
    summary = "'%s', pages: %i, objects: %i, %g %s, %s, encryption: %s" % (
        args.input,
        doc.page_count,
        doc.xref_length() - 1,
        size,
        flag,
        meta["format"],
        meta["encryption"],
    )
    pymupdf.message(summary)

    field_count = doc.is_form_pdf
    if field_count > 0:
        sigflags = doc.get_sigflags()
        negation = "" if sigflags == 3 else "not "
        pymupdf.message(
            "document contains %i root form fields and is %ssigned"
            % (field_count, negation)
        )

    embedded_count = doc.embfile_count()
    if embedded_count > 0:
        pymupdf.message("document contains %i embedded files" % embedded_count)
    pymupdf.message()

    if args.catalog:
        pymupdf.message(mycenter("PDF catalog"))
        print_xref(doc, doc.pdf_catalog())
        pymupdf.message()

    if args.metadata:
        pymupdf.message(mycenter("PDF metadata"))
        print_dict(doc.metadata)  # pylint: disable=no-member
        pymupdf.message()

    if args.xrefs:
        pymupdf.message(mycenter("object information"))
        for xref in get_list(args.xrefs, doc.xref_length(), what="xref"):
            print_xref(doc, xref)
            pymupdf.message()

    if args.pages:
        pymupdf.message(mycenter("page information"))
        for pno in get_list(args.pages, doc.page_count + 1):
            page_xref = doc.page_xref(pno - 1)  # page numbers are 1-based here
            pymupdf.message("Page %i:" % pno)
            print_xref(doc, page_xref)
            pymupdf.message()

    if args.trailer:
        pymupdf.message(mycenter("PDF trailer"))
        pymupdf.message(doc.pdf_trailer())
        pymupdf.message()

    doc.close()
def clean(args):
    """Save a PDF with the requested options, optionally restricted to pages.

    Without a page specification the input document itself is saved to
    ``args.output``. Otherwise a new document is built from the selected
    pages (in the specified order) and saved with the same options.
    Both paths close every document they opened.
    """
    doc = open_file(args.input, args.password, pdf=True)
    # the position in this tuple is passed on as the encryption method
    encrypt = ("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256").index(
        args.encryption
    )
    # one set of save options shared by both output paths
    save_options = dict(
        garbage=args.garbage,
        deflate=args.compress,
        pretty=args.pretty,
        clean=args.sanitize,
        ascii=args.ascii,
        linear=args.linear,
        encryption=encrypt,
        owner_pw=args.owner,
        user_pw=args.user,
        permissions=args.permission,
    )

    if not args.pages:  # simple cleaning
        doc.save(args.output, **save_options)
        doc.close()
        return

    # create sub document from page numbers
    pages = get_list(args.pages, doc.page_count + 1)
    outdoc = pymupdf.open()
    for pno in pages:
        n = pno - 1  # page numbers in the specification are 1-based
        outdoc.insert_pdf(doc, from_page=n, to_page=n)
    outdoc.save(args.output, **save_options)
    doc.close()
    outdoc.close()
    return
def doc_join(args):
    """Join pages from several PDF documents.

    Every entry of ``args.input`` has the form 'filename[,password[,pages]]'.
    Selected pages (or all pages, if none are specified) are copied to a new
    PDF which is saved as ``args.output``.
    """
    target = pymupdf.open()  # output PDF
    for spec in args.input:  # process one input PDF
        filename, *rest = spec.split(",")
        password = rest[0] if rest else None
        src = open_file(filename, password, pdf=True)

        page_spec = ",".join(rest[1:])  # everything after the password
        if page_spec:
            page_numbers = get_list(page_spec, src.page_count + 1)
        else:  # take all pages
            page_numbers = range(1, src.page_count + 1)

        for pno in page_numbers:  # copy each source page (1-based numbers)
            target.insert_pdf(src, from_page=pno - 1, to_page=pno - 1)
        src.close()

    target.save(args.output, garbage=4, deflate=True)
    target.close()
def embedded_copy(args):
    """Copy embedded files between PDFs.

    Entries named in ``args.name`` (or all entries, if no names are given)
    are copied from the source PDF to the input PDF. The result is saved
    incrementally, unless a different output file is specified.
    """
    doc = open_file(args.input, args.password, pdf=True)
    in_place = not args.output or args.output == args.input
    if not doc.can_save_incrementally() and in_place:
        sys.exit("cannot save PDF incrementally")

    src = open_file(args.source, args.pwdsource)
    available = set(src.embfile_names())
    requested = set(args.name) if args.name else set()
    if not requested:
        requested = available  # nothing specified: copy everything
    elif not requested <= available:
        sys.exit("not all names are contained in source")

    if not requested:
        sys.exit("nothing to copy")

    # refuse to overwrite entries that already exist in the target
    duplicates = requested & set(doc.embfile_names())
    if duplicates:
        sys.exit(
            "following names already exist in receiving PDF: %s" % str(duplicates)
        )

    for entry in requested:
        info = src.embfile_info(entry)
        content = src.embfile_get(entry)
        doc.embfile_add(
            entry,
            content,
            filename=info["filename"],
            ufilename=info["ufilename"],
            desc=info["desc"],
        )
        pymupdf.message("copied entry '%s' from '%s'" % (entry, src.name))
    src.close()

    if in_place:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()
def embedded_del(args):
    """Delete an embedded file entry.

    The PDF is saved incrementally, unless a different output file is given.
    """
    doc = open_file(args.input, args.password, pdf=True)
    in_place = not args.output or args.output == args.input
    if not doc.can_save_incrementally() and in_place:
        sys.exit("cannot save PDF incrementally")

    try:
        doc.embfile_del(args.name)
    except (ValueError, pymupdf.mupdf.FzErrorBase) as e:
        sys.exit(f'no such embedded file {args.name!r}: {e}')

    if in_place:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=1)
    doc.close()
def embedded_get(args):
    """Retrieve contents of an embedded file.

    The data is written to ``args.output`` if given, otherwise to the file
    name stored with the entry. Unless ``args.unsafe`` is set, a stored name
    is refused if it already exists or lies outside the current directory.
    """
    doc = open_file(args.input, args.password, pdf=True)
    try:
        content = doc.embfile_get(args.name)
        info = doc.embfile_info(args.name)
    except (ValueError, pymupdf.mupdf.FzErrorBase) as e:
        sys.exit(f'no such embedded file {args.name!r}: {e}')

    filename = args.output or info["filename"]

    # the stored name comes from the PDF, so check it before writing
    uses_stored_name = not args.output
    if uses_stored_name and not args.unsafe:
        if os.path.exists(filename):
            sys.exit(f'refusing to overwrite existing file with stored name: {filename}')
        filename_abs = os.path.abspath(filename)
        if not filename_abs.startswith(os.getcwd() + os.sep):
            sys.exit(f'refusing to write stored name outside current directory: {filename}')

    with open(filename, "wb") as output:
        output.write(content)
    pymupdf.message("saved entry '%s' as '%s'" % (args.name, filename))
    doc.close()
def embedded_add(args):
    """Insert a new embedded file.

    The content is read from ``args.path``, which is also stored as the
    entry's file name. The description defaults to that file name. The PDF
    is saved incrementally, unless a different output file is given.

    Exits the program via ``sys.exit`` if the PDF cannot be saved
    incrementally although required, if the entry name already exists, or
    if ``args.path`` is not an existing file.
    """
    doc = open_file(args.input, args.password, pdf=True)
    # use one and the same test for the precheck and for the final save
    in_place = not args.output or args.output == args.input
    if not doc.can_save_incrementally() and in_place:
        sys.exit("cannot save PDF incrementally")

    # look the name up instead of probing via a trial deletion
    if args.name in doc.embfile_names():
        sys.exit("entry '%s' already exists" % args.name)

    if not os.path.isfile(args.path):
        sys.exit("no such file '%s'" % args.path)

    with open(args.path, "rb") as f:
        stream = f.read()

    filename = args.path
    ufilename = filename
    desc = args.desc if args.desc else filename
    doc.embfile_add(
        args.name, stream, filename=filename, ufilename=ufilename, desc=desc
    )

    if in_place:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()
def embedded_upd(args):
    """Update contents or metadata of an embedded file.

    Only items that are specified get passed on as new values; everything
    else is handed over as None. New content is only read if ``args.path``
    names an existing file. The PDF is saved incrementally, unless a
    different output file is given.
    """
    doc = open_file(args.input, args.password, pdf=True)
    in_place = args.output is None or args.output == args.input
    if not doc.can_save_incrementally() and in_place:
        sys.exit("cannot save PDF incrementally")

    try:
        doc.embfile_info(args.name)
    except Exception:
        sys.exit("no such embedded file '%s'" % args.name)

    # new content, if a usable path was given
    stream = None
    if args.path is not None and os.path.isfile(args.path):
        with open(args.path, "rb") as f:
            stream = f.read()

    filename = args.filename or None
    # the unicode file name falls back to the plain file name
    ufilename = args.ufilename or args.filename or None
    desc = args.desc or None

    doc.embfile_upd(
        args.name, stream, filename=filename, ufilename=ufilename, desc=desc
    )

    if in_place:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()
def embedded_list(args):
    """List embedded files.

    With ``args.name`` only that entry is reported (in detail). Otherwise
    all entry names are listed, with full information per entry if
    ``args.detail`` is set. The document is closed on every exit path.
    """
    doc = open_file(args.input, args.password, pdf=True)
    try:
        names = doc.embfile_names()

        if args.name is not None:
            if args.name not in names:
                sys.exit("no such embedded file '%s'" % args.name)
            pymupdf.message()
            pymupdf.message(
                "printing 1 of %i embedded file%s:"
                % (len(names), "s" if len(names) > 1 else "")
            )
            pymupdf.message()
            print_dict(doc.embfile_info(args.name))
            pymupdf.message()
            return

        if not names:
            pymupdf.message("'%s' contains no embedded files" % doc.name)
            return

        if len(names) > 1:
            msg = "'%s' contains the following %i embedded files" % (
                doc.name,
                len(names),
            )
        else:
            msg = "'%s' contains the following embedded file" % doc.name
        pymupdf.message(msg)
        pymupdf.message()

        for name in names:
            if not args.detail:
                pymupdf.message(name)
                continue
            # one lookup per entry is sufficient
            print_dict(doc.embfile_info(name))
            pymupdf.message()
    finally:
        doc.close()
def extract_objects(args):
    """Extract images and / or fonts from a PDF.

    Fonts and images of the selected pages (default: all pages) are written
    to ``args.output`` or, if that is not given, to the current directory.
    Each xref is handled once only, even if it occurs on several pages.
    """
    if not (args.fonts or args.images):
        sys.exit("neither fonts nor images requested")

    doc = open_file(args.input, args.password, pdf=True)

    if args.pages:
        pages = get_list(args.pages, doc.page_count + 1)
    else:
        pages = range(1, doc.page_count + 1)

    if args.output:
        out_dir = args.output
        if not os.path.isdir(out_dir):
            sys.exit("output directory %s does not exist" % out_dir)
    else:
        out_dir = os.path.abspath(os.curdir)

    font_xrefs = set()  # fonts already looked at
    image_xrefs = set()  # images already looked at

    for pno in pages:
        if args.fonts:
            for font in doc.get_page_fonts(pno - 1):
                xref = font[0]
                if xref in font_xrefs:
                    continue
                font_xrefs.add(xref)
                fontname, ext, _, buffer = doc.extract_font(xref)
                if ext == "n/a" or not buffer:  # nothing that could be written
                    continue
                outname = os.path.join(
                    out_dir, f"{fontname.replace(' ', '-')}-{xref}.{ext}"
                )
                with open(outname, "wb") as outfile:
                    outfile.write(buffer)
                buffer = None

        if args.images:
            for image in doc.get_page_images(pno - 1):
                xref = image[0]
                if xref in image_xrefs:
                    continue
                image_xrefs.add(xref)
                pix = recoverpix(doc, image)
                if isinstance(pix, dict):
                    # image data as extracted: keep its own extension
                    outname = os.path.join(
                        out_dir, "img-%i.%s" % (xref, pix["ext"])
                    )
                    with open(outname, "wb") as outfile:
                        outfile.write(pix["image"])
                    continue
                # a pixmap: save as PNG, converting to RGB where needed
                outname = os.path.join(out_dir, "img-%i.png" % xref)
                if pix.colorspace.n < 4:
                    pix2 = pix
                else:
                    pix2 = pymupdf.Pixmap(pymupdf.csRGB, pix)
                pix2.save(outname)

    if args.fonts:
        pymupdf.message("saved %i fonts to '%s'" % (len(font_xrefs), out_dir))
    if args.images:
        pymupdf.message("saved %i images to '%s'" % (len(image_xrefs), out_dir))

    doc.close()
def page_simple(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    """Write the plain text of a page, followed by an end-of-page marker.

    The marker is a line feed if *noformfeed* is set, else a form feed.
    A page without text writes the marker only, or nothing at all if
    *skip_empty* is set. GRID and fontsize are not used by this mode.
    """
    end_of_page = b"\n" if noformfeed else b"\x0c"
    content = page.get_text("text", flags=flags)
    if content:
        textout.write(content.encode("utf8", errors="surrogatepass"))
        textout.write(end_of_page)
    elif not skip_empty:
        textout.write(end_of_page)
def page_blocksort(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    """Write the text blocks of a page, sorted by block items 3 and 0.

    The blocks are followed by an end-of-page marker: a line feed if
    *noformfeed* is set, else a form feed. A page without blocks writes the
    marker only, or nothing at all if *skip_empty* is set.
    GRID and fontsize are not used by this mode.
    """

    def sort_key(block):
        # item 3, then item 0 - presumably bottom and left coordinate
        return block[3], block[0]

    end_of_page = b"\n" if noformfeed else b"\x0c"
    blocks = page.get_text("blocks", flags=flags)
    if blocks == []:
        if not skip_empty:
            textout.write(end_of_page)
        return

    blocks.sort(key=sort_key)
    for block in blocks:
        textout.write(block[4].encode("utf8", errors="surrogatepass"))
    textout.write(end_of_page)
def page_layout(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    """Write the text of a page in a way that mimics its original layout.

    Characters are assigned to output lines via their (rounded) origin
    y-coordinate and to output columns via their x-coordinate.

    Args:
        page: the page to extract text from.
        textout: binary file object that receives the UTF-8 encoded text.
        GRID: rows whose y-coordinates differ by less than this value are
            merged into one output line.
        fontsize: spans with a font size less than or equal to this value
            are ignored.
        noformfeed: end the page with a line feed instead of a form feed.
        skip_empty: write nothing at all for a page without characters.
        flags: text extraction flags, passed on to ``page.get_text``.
    """
    # end-of-page marker: line feed or form feed (character 12)
    eop = b"\n" if noformfeed else bytes([12])

    # --------------------------------------------------------------------
    def find_line_index(values: List[int], value: int) -> int:
        """Find the right row coordinate.

        Args:
            values: (list) y-coordinates of rows, in ascending order.
            value: (int) lookup for this value (y-origin of char).
        Returns:
            y-coordinate of appropriate line for value, i.e. the largest
            entry of values that does not exceed value.
        Raises:
            RuntimeError if value is smaller than every entry of values.
        """
        i = bisect.bisect_right(values, value)
        if i:
            return values[i - 1]
        raise RuntimeError("Line for %g not found in %s" % (value, values))

    # --------------------------------------------------------------------
    def curate_rows(rows: Set[int], GRID) -> List:
        """Return the sorted row coordinates without insignificant ones.

        A coordinate is only kept if it exceeds the previously kept one by
        at least GRID. Expects a non-empty set (the first item is always
        kept).
        """
        rows = list(rows)
        rows.sort()  # sort ascending
        nrows = [rows[0]]
        for h in rows[1:]:
            if h >= nrows[-1] + GRID:  # only keep significant differences
                nrows.append(h)
        return nrows  # curated list of line bottom coordinates

    def process_blocks(blocks: List[Dict], page: pymupdf.Page):
        """Collect the characters of the page and some layout measures.

        Returns:
            chars: list of tuples (character, x-origin, rounded y-origin,
                width), one per character.
            rows: set of the rounded y-origins of all characters.
            left: smallest x-origin of a non-space character.
            right: largest right bbox coordinate of a character.
            rowheight: smallest height of a considered line.
        """
        rows = set()
        page_width = page.rect.width
        page_height = page.rect.height
        rowheight = page_height
        left = page_width
        right = 0
        chars = []
        for block in blocks:
            for line in block["lines"]:
                if line["dir"] != (1, 0):  # ignore non-horizontal text
                    continue
                x0, y0, x1, y1 = line["bbox"]
                if y1 < 0 or y0 > page.rect.height:  # ignore if outside CropBox
                    continue
                # upd row height
                height = y1 - y0

                if rowheight > height:
                    rowheight = height
                for span in line["spans"]:
                    # ignore text that is not larger than the given font size
                    if span["size"] <= fontsize:
                        continue
                    for c in span["chars"]:
                        x0, _, x1, _ = c["bbox"]
                        cwidth = x1 - x0
                        ox, oy = c["origin"]
                        oy = int(round(oy))
                        rows.add(oy)
                        ch = c["c"]
                        if left > ox and ch != " ":
                            left = ox  # update left coordinate
                        if right < x1:
                            right = x1  # update right coordinate

                        # handle ligatures:
                        # a zero-width character on the same row as its
                        # predecessor is merged into the predecessor
                        if cwidth == 0 and chars != []:  # potential ligature
                            old_ch, old_ox, old_oy, old_cwidth = chars[-1]
                            if old_oy == oy:  # ligature
                                if old_ch != chr(0xFB00):  # previous "ff" char lig?
                                    lig = joinligature(old_ch + ch)  # no
                                # convert to one of the 3-char ligatures:
                                elif ch == "i":
                                    lig = chr(0xFB03)  # "ffi"
                                elif ch == "l":
                                    lig = chr(0xFB04)  # "ffl"
                                else:  # something wrong, leave old char in place
                                    lig = old_ch
                                chars[-1] = (lig, old_ox, old_oy, old_cwidth)
                                continue
                        chars.append((ch, ox, oy, cwidth))  # all chars on page
        return chars, rows, left, right, rowheight

    def joinligature(lig: str) -> str:
        """Return ligature character for a given pair / triple of characters.

        Args:
            lig: (str) 2/3 characters, e.g. "ff"
        Returns:
            Ligature, e.g. "ff" -> chr(0xFB00). Any other string is
            returned unchanged.
        """

        if lig == "ff":
            return chr(0xFB00)
        elif lig == "fi":
            return chr(0xFB01)
        elif lig == "fl":
            return chr(0xFB02)
        elif lig == "ffi":
            return chr(0xFB03)
        elif lig == "ffl":
            return chr(0xFB04)
        elif lig == "ft":
            return chr(0xFB05)
        elif lig == "st":
            return chr(0xFB06)
        return lig

    # --------------------------------------------------------------------
    def make_textline(left, slot, minslot, lchars):
        """Produce the text of one output line.

        Args:
            left: (float) left most coordinate used on page
            slot: (float) avg width of one character in any font in use.
            minslot: (float) min width for the characters in this line.
            lchars: (list[tuple]) characters of this line.
        Returns:
            text: (str) text string for this line
        Raises:
            RuntimeError if minslot is not larger than pymupdf.EPSILON.
        """
        text = ""  # we output this
        old_char = ""
        old_x1 = 0  # end coordinate of last char
        old_ox = 0  # x-origin of last char
        if minslot <= pymupdf.EPSILON:
            raise RuntimeError("program error: minslot too small = %g" % minslot)

        for c in lchars:  # loop over characters
            char, ox, _, cwidth = c
            ox = ox - left  # its (relative) start coordinate
            x1 = ox + cwidth  # ending coordinate

            # eliminate overprint effect
            if old_char == char and ox - old_ox <= cwidth * 0.2:
                continue

            # omit spaces overlapping previous char
            # NOTE(review): a space with cwidth == 0 would raise
            # ZeroDivisionError here - confirm this cannot occur
            if char == " " and (old_x1 - ox) / cwidth > 0.8:
                continue

            old_char = char
            # close enough to previous?
            if ox < old_x1 + minslot:  # assume char adjacent to previous
                text += char  # append to output
                old_x1 = x1  # new end coord
                old_ox = ox  # new origin.x
                continue

            # else next char starts after some gap:
            # fill in right number of spaces, so char is positioned
            # in the right slot of the line
            if char == " ":  # rest relevant for non-space only
                continue
            # NOTE(review): slot == 0 would raise ZeroDivisionError here -
            # confirm that a median char width of 0 cannot occur
            delta = int(ox / slot) - len(text)
            if ox > old_x1 and delta > 1:
                text += " " * delta
            # now append char
            text += char
            old_x1 = x1  # new end coordinate
            old_ox = ox  # new origin
        return text.rstrip()

    # extract page text by single characters ("rawdict")
    blocks = page.get_text("rawdict", flags=flags)["blocks"]
    chars, rows, left, right, rowheight = process_blocks(blocks, page)

    if chars == []:
        if not skip_empty:
            textout.write(eop)  # write formfeed
        return
    # compute list of line coordinates - ignoring small (GRID) differences
    rows = curate_rows(rows, GRID)

    # sort all chars by x-coordinates, so every line will receive char info,
    # sorted from left to right.
    chars.sort(key=lambda c: c[1])

    # populate the lines with their char info
    lines = {}  # key: y1-coordinate, value: char list
    for c in chars:
        _, _, oy, _ = c
        y = find_line_index(rows, oy)  # y-coord of the right line
        lchars = lines.get(y, [])  # read line chars so far
        lchars.append(c)  # append this char
        lines[y] = lchars  # write back to line

    # ensure line coordinates are ascending
    keys = list(lines.keys())
    keys.sort()

    # -------------------------------------------------------------------------
    # Compute "char resolution" for the page: the char width corresponding to
    # 1 text char position on output - call it 'slot'.
    # For each line, compute median of its char widths. The minimum across all
    # lines is 'slot'.
    # The minimum char width of each line is used to determine if spaces must
    # be inserted in between two characters.
    # Lines with less than 2 characters do not take part in computing 'slot'
    # and get a fixed minimum char width of 1.
    # -------------------------------------------------------------------------
    slot = right - left
    minslots = {}
    for k in keys:
        lchars = lines[k]
        ccount = len(lchars)
        if ccount < 2:
            minslots[k] = 1
            continue
        widths = [c[3] for c in lchars]
        widths.sort()
        this_slot = statistics.median(widths)  # take median value
        if this_slot < slot:
            slot = this_slot
        minslots[k] = widths[0]

    # compute line advance in text output
    # NOTE(review): rowheight occurs in numerator and denominator, so this
    # looks like (rows[-1] - rows[0]) / len(rows) * 1.2, except that a
    # rowheight of 0 raises ZeroDivisionError - confirm the intention
    rowheight = rowheight * (rows[-1] - rows[0]) / (rowheight * len(rows)) * 1.2
    rowpos = rows[0]  # first line positioned here
    textout.write(b"\n")
    for k in keys:  # walk through the lines
        while rowpos < k:  # honor distance between lines
            textout.write(b"\n")
            rowpos += rowheight
        text = make_textline(left, slot, minslots[k], lines[k])
        textout.write((text + "\n").encode("utf8", errors="surrogatepass"))
        rowpos = k + rowheight

    textout.write(eop)  # write formfeed
def gettext(args):
    """Extract the text of selected pages to a file.

    The output file defaults to the document name with its extension
    replaced by ".txt". Each page is handed to the extraction function that
    belongs to ``args.mode`` ("simple", "blocks" or "layout").
    """
    doc = open_file(args.input, args.password, pdf=False)
    page_numbers = get_list(args.pages, doc.page_count + 1)

    output = args.output
    if output is None:
        output = os.path.splitext(doc.name)[0] + ".txt"

    with open(output, "wb") as textout:
        # start with both "preserve" flags, then toggle as requested
        flags = pymupdf.TEXT_PRESERVE_LIGATURES | pymupdf.TEXT_PRESERVE_WHITESPACE
        toggles = (
            (args.convert_white, pymupdf.TEXT_PRESERVE_WHITESPACE),
            (args.noligatures, pymupdf.TEXT_PRESERVE_LIGATURES),
            (args.extra_spaces, pymupdf.TEXT_INHIBIT_SPACES),
        )
        for requested, bit in toggles:
            if requested:
                flags ^= bit

        extractors = {
            "simple": page_simple,
            "blocks": page_blocksort,
            "layout": page_layout,
        }
        for pno in page_numbers:
            page = doc[pno - 1]  # page numbers are 1-based here
            extract = extractors[args.mode]
            extract(
                page,
                textout,
                args.grid,
                args.fontsize,
                args.noformfeed,
                args.skip_empty,
                flags=flags,
            )
def _internal(args):
    """Emit one fixed text via pymupdf.message() and one via pymupdf.log().

    *args* is not used.
    """
    message_text = 'This is from PyMuPDF message().'
    log_text = 'This is from PyMuPDF log().'
    pymupdf.message(message_text)
    pymupdf.log(log_text)
- def main():
- """Define command configurations."""
- parser = argparse.ArgumentParser(
- prog="pymupdf",
- description=mycenter("Basic PyMuPDF Functions"),
- )
- subps = parser.add_subparsers(
- title="Subcommands", help="Enter 'command -h' for subcommand specific help"
- )
- # -------------------------------------------------------------------------
- # 'show' command
- # -------------------------------------------------------------------------
- ps_show = subps.add_parser("show", description=mycenter("display PDF information"))
- ps_show.add_argument("input", type=str, help="PDF filename")
- ps_show.add_argument("-password", help="password")
- ps_show.add_argument("-catalog", action="store_true", help="show PDF catalog")
- ps_show.add_argument("-trailer", action="store_true", help="show PDF trailer")
- ps_show.add_argument("-metadata", action="store_true", help="show PDF metadata")
- ps_show.add_argument(
- "-xrefs", type=str, help="show selected objects, format: 1,5-7,N"
- )
- ps_show.add_argument(
- "-pages", type=str, help="show selected pages, format: 1,5-7,50-N"
- )
- ps_show.set_defaults(func=show)
- # -------------------------------------------------------------------------
- # 'clean' command
- # -------------------------------------------------------------------------
- ps_clean = subps.add_parser(
- "clean", description=mycenter("optimize PDF, or create sub-PDF if pages given")
- )
- ps_clean.add_argument("input", type=str, help="PDF filename")
- ps_clean.add_argument("output", type=str, help="output PDF filename")
- ps_clean.add_argument("-password", help="password")
- ps_clean.add_argument(
- "-encryption",
- help="encryption method",
- choices=("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256"),
- default="none",
- )
- ps_clean.add_argument("-owner", type=str, help="owner password")
- ps_clean.add_argument("-user", type=str, help="user password")
- ps_clean.add_argument(
- "-garbage",
- type=int,
- help="garbage collection level",
- choices=range(5),
- default=0,
- )
- ps_clean.add_argument(
- "-compress",
- action="store_true",
- default=False,
- help="compress (deflate) output",
- )
- ps_clean.add_argument(
- "-ascii", action="store_true", default=False, help="ASCII encode binary data"
- )
- ps_clean.add_argument(
- "-linear",
- action="store_true",
- default=False,
- help="format for fast web display",
- )
- ps_clean.add_argument(
- "-permission", type=int, default=-1, help="integer with permission levels"
- )
- ps_clean.add_argument(
- "-sanitize",
- action="store_true",
- default=False,
- help="sanitize / clean contents",
- )
- ps_clean.add_argument(
- "-pretty", action="store_true", default=False, help="prettify PDF structure"
- )
- ps_clean.add_argument(
- "-pages", help="output selected pages pages, format: 1,5-7,50-N"
- )
- ps_clean.set_defaults(func=clean)
- # -------------------------------------------------------------------------
- # 'join' command
- # -------------------------------------------------------------------------
- ps_join = subps.add_parser(
- "join",
- description=mycenter("join PDF documents"),
- epilog="specify each input as 'filename[,password[,pages]]'",
- )
- ps_join.add_argument("input", nargs="*", help="input filenames")
- ps_join.add_argument("-output", required=True, help="output filename")
- ps_join.set_defaults(func=doc_join)
- # -------------------------------------------------------------------------
- # 'extract' command
- # -------------------------------------------------------------------------
- ps_extract = subps.add_parser(
- "extract", description=mycenter("extract images and fonts to disk")
- )
- ps_extract.add_argument("input", type=str, help="PDF filename")
- ps_extract.add_argument("-images", action="store_true", help="extract images")
- ps_extract.add_argument("-fonts", action="store_true", help="extract fonts")
- ps_extract.add_argument(
- "-output", help="folder to receive output, defaults to current"
- )
- ps_extract.add_argument("-password", help="password")
- ps_extract.add_argument(
- "-pages", type=str, help="consider these pages only, format: 1,5-7,50-N"
- )
- ps_extract.set_defaults(func=extract_objects)
- # -------------------------------------------------------------------------
- # 'embed-info'
- # -------------------------------------------------------------------------
- ps_show = subps.add_parser(
- "embed-info", description=mycenter("list embedded files")
- )
- ps_show.add_argument("input", help="PDF filename")
- ps_show.add_argument("-name", help="if given, report only this one")
- ps_show.add_argument("-detail", action="store_true", help="detail information")
- ps_show.add_argument("-password", help="password")
- ps_show.set_defaults(func=embedded_list)
- # -------------------------------------------------------------------------
- # 'embed-add' command
- # -------------------------------------------------------------------------
- ps_embed_add = subps.add_parser(
- "embed-add", description=mycenter("add embedded file")
- )
- ps_embed_add.add_argument("input", help="PDF filename")
- ps_embed_add.add_argument("-password", help="password")
- ps_embed_add.add_argument(
- "-output", help="output PDF filename, incremental save if none"
- )
- ps_embed_add.add_argument("-name", required=True, help="name of new entry")
- ps_embed_add.add_argument("-path", required=True, help="path to data for new entry")
- ps_embed_add.add_argument("-desc", help="description of new entry")
- ps_embed_add.set_defaults(func=embedded_add)
- # -------------------------------------------------------------------------
- # 'embed-del' command
- # -------------------------------------------------------------------------
- ps_embed_del = subps.add_parser(
- "embed-del", description=mycenter("delete embedded file")
- )
- ps_embed_del.add_argument("input", help="PDF filename")
- ps_embed_del.add_argument("-password", help="password")
- ps_embed_del.add_argument(
- "-output", help="output PDF filename, incremental save if none"
- )
- ps_embed_del.add_argument("-name", required=True, help="name of entry to delete")
- ps_embed_del.set_defaults(func=embedded_del)
- # -------------------------------------------------------------------------
- # 'embed-upd' command
- # -------------------------------------------------------------------------
- ps_embed_upd = subps.add_parser(
- "embed-upd",
- description=mycenter("update embedded file"),
- epilog="except '-name' all parameters are optional",
- )
- ps_embed_upd.add_argument("input", help="PDF filename")
- ps_embed_upd.add_argument("-name", required=True, help="name of entry")
- ps_embed_upd.add_argument("-password", help="password")
- ps_embed_upd.add_argument(
- "-output", help="Output PDF filename, incremental save if none"
- )
- ps_embed_upd.add_argument("-path", help="path to new data for entry")
- ps_embed_upd.add_argument("-filename", help="new filename to store in entry")
- ps_embed_upd.add_argument(
- "-ufilename", help="new unicode filename to store in entry"
- )
- ps_embed_upd.add_argument("-desc", help="new description to store in entry")
- ps_embed_upd.set_defaults(func=embedded_upd)
- # -------------------------------------------------------------------------
- # 'embed-extract' command
- # -------------------------------------------------------------------------
- ps_embed_extract = subps.add_parser(
- "embed-extract", description=mycenter("extract embedded file to disk")
- )
- ps_embed_extract.add_argument("input", type=str, help="PDF filename")
- ps_embed_extract.add_argument("-name", required=True, help="name of entry")
- ps_embed_extract.add_argument("-password", help="password")
- ps_embed_extract.add_argument("-unsafe", default=False, action="store_true",
- help="allow write to stored name even if an existing file or outside current directory"
- )
- ps_embed_extract.add_argument(
- "-output", help="output filename, default is stored name"
- )
- ps_embed_extract.set_defaults(func=embedded_get)
- # -------------------------------------------------------------------------
- # 'embed-copy' command
- # -------------------------------------------------------------------------
- ps_embed_copy = subps.add_parser(
- "embed-copy", description=mycenter("copy embedded files between PDFs")
- )
- ps_embed_copy.add_argument("input", type=str, help="PDF to receive embedded files")
- ps_embed_copy.add_argument("-password", help="password of input")
- ps_embed_copy.add_argument(
- "-output", help="output PDF, incremental save to 'input' if omitted"
- )
- ps_embed_copy.add_argument(
- "-source", required=True, help="copy embedded files from here"
- )
- ps_embed_copy.add_argument("-pwdsource", help="password of 'source' PDF")
- ps_embed_copy.add_argument(
- "-name", nargs="*", help="restrict copy to these entries"
- )
- ps_embed_copy.set_defaults(func=embedded_copy)
- # -------------------------------------------------------------------------
- # 'gettext' command
- # -------------------------------------------------------------------------
- ps_gettext = subps.add_parser(
- "gettext", description=mycenter("extract text in various formatting modes")
- )
- ps_gettext.add_argument("input", type=str, help="input document filename")
- ps_gettext.add_argument("-password", help="password for input document")
- ps_gettext.add_argument(
- "-mode",
- type=str,
- help="mode: simple, block sort, or layout (default)",
- choices=("simple", "blocks", "layout"),
- default="layout",
- )
- ps_gettext.add_argument(
- "-pages",
- type=str,
- help="select pages, format: 1,5-7,50-N",
- default="1-N",
- )
- ps_gettext.add_argument(
- "-noligatures",
- action="store_true",
- help="expand ligature characters (default False)",
- default=False,
- )
- ps_gettext.add_argument(
- "-convert-white",
- action="store_true",
- help="convert whitespace characters to white (default False)",
- default=False,
- )
- ps_gettext.add_argument(
- "-extra-spaces",
- action="store_true",
- help="fill gaps with spaces (default False)",
- default=False,
- )
- ps_gettext.add_argument(
- "-noformfeed",
- action="store_true",
- help="write linefeeds, no formfeeds (default False)",
- default=False,
- )
- ps_gettext.add_argument(
- "-skip-empty",
- action="store_true",
- help="suppress pages with no text (default False)",
- default=False,
- )
- ps_gettext.add_argument(
- "-output",
- help="store text in this file (default inputfilename.txt)",
- )
- ps_gettext.add_argument(
- "-grid",
- type=float,
- help="merge lines if closer than this (default 2)",
- default=2,
- )
- ps_gettext.add_argument(
- "-fontsize",
- type=float,
- help="only include text with a larger fontsize (default 3)",
- default=3,
- )
- ps_gettext.set_defaults(func=gettext)
- # -------------------------------------------------------------------------
- # 'internal' command (handled by the _internal function)
- # -------------------------------------------------------------------------
- ps_internal = subps.add_parser(
- "internal", description=mycenter("internal testing")
- )
- ps_internal.set_defaults(func=_internal)
- # -------------------------------------------------------------------------
- # start program
- # -------------------------------------------------------------------------
- args = parser.parse_args() # create parameter arguments class
- if not hasattr(args, "func"): # no function selected
- parser.print_help() # so print top level help
- else:
- args.func(args) # execute requested command
- if __name__ == "__main__":  # true only when this module runs as the main program, not when it is imported
- main()  # parse the command line and dispatch to the selected sub-command's handler
|