__main__.py 40 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149
  1. # -----------------------------------------------------------------------------
  2. # Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com
  3. # License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html
  4. # Part of "PyMuPDF", Python bindings for "MuPDF" (http://mupdf.com), a
  5. # lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is
  6. # maintained and developed by Artifex Software, Inc. https://artifex.com.
  7. # -----------------------------------------------------------------------------
  8. import argparse
  9. import bisect
  10. import os
  11. import sys
  12. import statistics
  13. from typing import Dict, List, Set
  14. from . import pymupdf
  15. def mycenter(x):
  16. return (" %s " % x).center(75, "-")
  17. def recoverpix(doc, item):
  18. """Return image for a given XREF."""
  19. x = item[0] # xref of PDF image
  20. s = item[1] # xref of its /SMask
  21. if s == 0: # no smask: use direct image output
  22. return doc.extract_image(x)
  23. def getimage(pix):
  24. if pix.colorspace.n != 4:
  25. return pix
  26. tpix = pymupdf.Pixmap(pymupdf.csRGB, pix)
  27. return tpix
  28. # we need to reconstruct the alpha channel with the smask
  29. pix1 = pymupdf.Pixmap(doc, x)
  30. pix2 = pymupdf.Pixmap(doc, s) # create pixmap of the /SMask entry
  31. """Sanity check:
  32. - both pixmaps must have the same rectangle
  33. - both pixmaps must have alpha=0
  34. - pix2 must consist of 1 byte per pixel
  35. """
  36. if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
  37. pymupdf.message("Warning: unsupported /SMask %i for %i:" % (s, x))
  38. pymupdf.message(pix2)
  39. pix2 = None
  40. return getimage(pix1) # return the pixmap as is
  41. pix = pymupdf.Pixmap(pix1) # copy of pix1, with an alpha channel added
  42. pix.set_alpha(pix2.samples) # treat pix2.samples as the alpha values
  43. pix1 = pix2 = None # free temp pixmaps
  44. # we may need to adjust something for CMYK pixmaps here:
  45. return getimage(pix)
  46. def open_file(filename, password, show=False, pdf=True):
  47. """Open and authenticate a document."""
  48. doc = pymupdf.open(filename)
  49. if not doc.is_pdf and pdf is True:
  50. sys.exit("this command supports PDF files only")
  51. rc = -1
  52. if not doc.needs_pass:
  53. return doc
  54. if password:
  55. rc = doc.authenticate(password)
  56. if not rc:
  57. sys.exit("authentication unsuccessful")
  58. if show is True:
  59. pymupdf.message("authenticated as %s" % "owner" if rc > 2 else "user")
  60. else:
  61. sys.exit("'%s' requires a password" % doc.name)
  62. return doc
  63. def print_dict(item):
  64. """Print a Python dictionary."""
  65. l = max([len(k) for k in item.keys()]) + 1
  66. for k, v in item.items():
  67. msg = "%s: %s" % (k.rjust(l), v)
  68. pymupdf.message(msg)
  69. def print_xref(doc, xref):
  70. """Print an object given by XREF number.
  71. Simulate the PDF source in "pretty" format.
  72. For a stream also print its size.
  73. """
  74. pymupdf.message("%i 0 obj" % xref)
  75. xref_str = doc.xref_object(xref)
  76. pymupdf.message(xref_str)
  77. if doc.xref_is_stream(xref):
  78. temp = xref_str.split()
  79. try:
  80. idx = temp.index("/Length") + 1
  81. size = temp[idx]
  82. if size.endswith("0 R"):
  83. size = "unknown"
  84. except Exception:
  85. size = "unknown"
  86. pymupdf.message("stream\n...%s bytes" % size)
  87. pymupdf.message("endstream")
  88. pymupdf.message("endobj")
  89. def get_list(rlist, limit, what="page"):
  90. """Transform a page / xref specification into a list of integers.
  91. Args
  92. ----
  93. rlist: (str) the specification
  94. limit: maximum number, i.e. number of pages, number of objects
  95. what: a string to be used in error messages
  96. Returns
  97. -------
  98. A list of integers representing the specification.
  99. """
  100. N = str(limit - 1)
  101. rlist = rlist.replace("N", N).replace(" ", "")
  102. rlist_arr = rlist.split(",")
  103. out_list = []
  104. for seq, item in enumerate(rlist_arr):
  105. n = seq + 1
  106. if item.isdecimal(): # a single integer
  107. i = int(item)
  108. if 1 <= i < limit:
  109. out_list.append(int(item))
  110. else:
  111. sys.exit("bad %s specification at item %i" % (what, n))
  112. continue
  113. try: # this must be a range now, and all of the following must work:
  114. i1, i2 = item.split("-") # will fail if not 2 items produced
  115. i1 = int(i1) # will fail on non-integers
  116. i2 = int(i2)
  117. except Exception:
  118. sys.exit("bad %s range specification at item %i" % (what, n))
  119. if not (1 <= i1 < limit and 1 <= i2 < limit):
  120. sys.exit("bad %s range specification at item %i" % (what, n))
  121. if i1 == i2: # just in case: a range of equal numbers
  122. out_list.append(i1)
  123. continue
  124. if i1 < i2: # first less than second
  125. out_list += list(range(i1, i2 + 1))
  126. else: # first larger than second
  127. out_list += list(range(i1, i2 - 1, -1))
  128. return out_list
  129. def show(args):
  130. doc = open_file(args.input, args.password, True)
  131. size = os.path.getsize(args.input) / 1024
  132. flag = "KB"
  133. if size > 1000:
  134. size /= 1024
  135. flag = "MB"
  136. size = round(size, 1)
  137. meta = doc.metadata # pylint: disable=no-member
  138. pymupdf.message(
  139. "'%s', pages: %i, objects: %i, %g %s, %s, encryption: %s"
  140. % (
  141. args.input,
  142. doc.page_count,
  143. doc.xref_length() - 1,
  144. size,
  145. flag,
  146. meta["format"],
  147. meta["encryption"],
  148. )
  149. )
  150. n = doc.is_form_pdf
  151. if n > 0:
  152. s = doc.get_sigflags()
  153. pymupdf.message(
  154. "document contains %i root form fields and is %ssigned"
  155. % (n, "not " if s != 3 else "")
  156. )
  157. n = doc.embfile_count()
  158. if n > 0:
  159. pymupdf.message("document contains %i embedded files" % n)
  160. pymupdf.message()
  161. if args.catalog:
  162. pymupdf.message(mycenter("PDF catalog"))
  163. xref = doc.pdf_catalog()
  164. print_xref(doc, xref)
  165. pymupdf.message()
  166. if args.metadata:
  167. pymupdf.message(mycenter("PDF metadata"))
  168. print_dict(doc.metadata) # pylint: disable=no-member
  169. pymupdf.message()
  170. if args.xrefs:
  171. pymupdf.message(mycenter("object information"))
  172. xrefl = get_list(args.xrefs, doc.xref_length(), what="xref")
  173. for xref in xrefl:
  174. print_xref(doc, xref)
  175. pymupdf.message()
  176. if args.pages:
  177. pymupdf.message(mycenter("page information"))
  178. pagel = get_list(args.pages, doc.page_count + 1)
  179. for pno in pagel:
  180. n = pno - 1
  181. xref = doc.page_xref(n)
  182. pymupdf.message("Page %i:" % pno)
  183. print_xref(doc, xref)
  184. pymupdf.message()
  185. if args.trailer:
  186. pymupdf.message(mycenter("PDF trailer"))
  187. pymupdf.message(doc.pdf_trailer())
  188. pymupdf.message()
  189. doc.close()
  190. def clean(args):
  191. doc = open_file(args.input, args.password, pdf=True)
  192. encryption = args.encryption
  193. encrypt = ("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256").index(
  194. encryption
  195. )
  196. if not args.pages: # simple cleaning
  197. doc.save(
  198. args.output,
  199. garbage=args.garbage,
  200. deflate=args.compress,
  201. pretty=args.pretty,
  202. clean=args.sanitize,
  203. ascii=args.ascii,
  204. linear=args.linear,
  205. encryption=encrypt,
  206. owner_pw=args.owner,
  207. user_pw=args.user,
  208. permissions=args.permission,
  209. )
  210. return
  211. # create sub document from page numbers
  212. pages = get_list(args.pages, doc.page_count + 1)
  213. outdoc = pymupdf.open()
  214. for pno in pages:
  215. n = pno - 1
  216. outdoc.insert_pdf(doc, from_page=n, to_page=n)
  217. outdoc.save(
  218. args.output,
  219. garbage=args.garbage,
  220. deflate=args.compress,
  221. pretty=args.pretty,
  222. clean=args.sanitize,
  223. ascii=args.ascii,
  224. linear=args.linear,
  225. encryption=encrypt,
  226. owner_pw=args.owner,
  227. user_pw=args.user,
  228. permissions=args.permission,
  229. )
  230. doc.close()
  231. outdoc.close()
  232. return
  233. def doc_join(args):
  234. """Join pages from several PDF documents."""
  235. doc_list = args.input # a list of input PDFs
  236. doc = pymupdf.open() # output PDF
  237. for src_item in doc_list: # process one input PDF
  238. src_list = src_item.split(",")
  239. password = src_list[1] if len(src_list) > 1 else None
  240. src = open_file(src_list[0], password, pdf=True)
  241. pages = ",".join(src_list[2:]) # get 'pages' specifications
  242. if pages: # if anything there, retrieve a list of desired pages
  243. page_list = get_list(",".join(src_list[2:]), src.page_count + 1)
  244. else: # take all pages
  245. page_list = range(1, src.page_count + 1)
  246. for i in page_list:
  247. doc.insert_pdf(src, from_page=i - 1, to_page=i - 1) # copy each source page
  248. src.close()
  249. doc.save(args.output, garbage=4, deflate=True)
  250. doc.close()
  251. def embedded_copy(args):
  252. """Copy embedded files between PDFs."""
  253. doc = open_file(args.input, args.password, pdf=True)
  254. if not doc.can_save_incrementally() and (
  255. not args.output or args.output == args.input
  256. ):
  257. sys.exit("cannot save PDF incrementally")
  258. src = open_file(args.source, args.pwdsource)
  259. names = set(args.name) if args.name else set()
  260. src_names = set(src.embfile_names())
  261. if names:
  262. if not names <= src_names:
  263. sys.exit("not all names are contained in source")
  264. else:
  265. names = src_names
  266. if not names:
  267. sys.exit("nothing to copy")
  268. intersect = names & set(doc.embfile_names()) # any equal name already in target?
  269. if intersect:
  270. sys.exit("following names already exist in receiving PDF: %s" % str(intersect))
  271. for item in names:
  272. info = src.embfile_info(item)
  273. buff = src.embfile_get(item)
  274. doc.embfile_add(
  275. item,
  276. buff,
  277. filename=info["filename"],
  278. ufilename=info["ufilename"],
  279. desc=info["desc"],
  280. )
  281. pymupdf.message("copied entry '%s' from '%s'" % (item, src.name))
  282. src.close()
  283. if args.output and args.output != args.input:
  284. doc.save(args.output, garbage=3)
  285. else:
  286. doc.saveIncr()
  287. doc.close()
  288. def embedded_del(args):
  289. """Delete an embedded file entry."""
  290. doc = open_file(args.input, args.password, pdf=True)
  291. if not doc.can_save_incrementally() and (
  292. not args.output or args.output == args.input
  293. ):
  294. sys.exit("cannot save PDF incrementally")
  295. try:
  296. doc.embfile_del(args.name)
  297. except (ValueError, pymupdf.mupdf.FzErrorBase) as e:
  298. sys.exit(f'no such embedded file {args.name!r}: {e}')
  299. if not args.output or args.output == args.input:
  300. doc.saveIncr()
  301. else:
  302. doc.save(args.output, garbage=1)
  303. doc.close()
  304. def embedded_get(args):
  305. """Retrieve contents of an embedded file."""
  306. doc = open_file(args.input, args.password, pdf=True)
  307. try:
  308. stream = doc.embfile_get(args.name)
  309. d = doc.embfile_info(args.name)
  310. except (ValueError, pymupdf.mupdf.FzErrorBase) as e:
  311. sys.exit(f'no such embedded file {args.name!r}: {e}')
  312. filename = args.output if args.output else d["filename"]
  313. if not args.unsafe and not args.output:
  314. if os.path.exists(filename):
  315. sys.exit(f'refusing to overwrite existing file with stored name: {filename}')
  316. filename_abs = os.path.abspath(filename)
  317. if not filename_abs.startswith(os.getcwd() + os.sep):
  318. sys.exit(f'refusing to write stored name outside current directory: {filename}')
  319. with open(filename, "wb") as output:
  320. output.write(stream)
  321. pymupdf.message("saved entry '%s' as '%s'" % (args.name, filename))
  322. doc.close()
  323. def embedded_add(args):
  324. """Insert a new embedded file."""
  325. doc = open_file(args.input, args.password, pdf=True)
  326. if not doc.can_save_incrementally() and (
  327. args.output is None or args.output == args.input
  328. ):
  329. sys.exit("cannot save PDF incrementally")
  330. try:
  331. doc.embfile_del(args.name)
  332. sys.exit("entry '%s' already exists" % args.name)
  333. except Exception:
  334. pass
  335. if not os.path.exists(args.path) or not os.path.isfile(args.path):
  336. sys.exit("no such file '%s'" % args.path)
  337. with open(args.path, "rb") as f:
  338. stream = f.read()
  339. filename = args.path
  340. ufilename = filename
  341. if not args.desc:
  342. desc = filename
  343. else:
  344. desc = args.desc
  345. doc.embfile_add(
  346. args.name, stream, filename=filename, ufilename=ufilename, desc=desc
  347. )
  348. if not args.output or args.output == args.input:
  349. doc.saveIncr()
  350. else:
  351. doc.save(args.output, garbage=3)
  352. doc.close()
  353. def embedded_upd(args):
  354. """Update contents or metadata of an embedded file."""
  355. doc = open_file(args.input, args.password, pdf=True)
  356. if not doc.can_save_incrementally() and (
  357. args.output is None or args.output == args.input
  358. ):
  359. sys.exit("cannot save PDF incrementally")
  360. try:
  361. doc.embfile_info(args.name)
  362. except Exception:
  363. sys.exit("no such embedded file '%s'" % args.name)
  364. if (
  365. args.path is not None
  366. and os.path.exists(args.path)
  367. and os.path.isfile(args.path)
  368. ):
  369. with open(args.path, "rb") as f:
  370. stream = f.read()
  371. else:
  372. stream = None
  373. if args.filename:
  374. filename = args.filename
  375. else:
  376. filename = None
  377. if args.ufilename:
  378. ufilename = args.ufilename
  379. elif args.filename:
  380. ufilename = args.filename
  381. else:
  382. ufilename = None
  383. if args.desc:
  384. desc = args.desc
  385. else:
  386. desc = None
  387. doc.embfile_upd(
  388. args.name, stream, filename=filename, ufilename=ufilename, desc=desc
  389. )
  390. if args.output is None or args.output == args.input:
  391. doc.saveIncr()
  392. else:
  393. doc.save(args.output, garbage=3)
  394. doc.close()
  395. def embedded_list(args):
  396. """List embedded files."""
  397. doc = open_file(args.input, args.password, pdf=True)
  398. names = doc.embfile_names()
  399. if args.name is not None:
  400. if args.name not in names:
  401. sys.exit("no such embedded file '%s'" % args.name)
  402. else:
  403. pymupdf.message()
  404. pymupdf.message(
  405. "printing 1 of %i embedded file%s:"
  406. % (len(names), "s" if len(names) > 1 else "")
  407. )
  408. pymupdf.message()
  409. print_dict(doc.embfile_info(args.name))
  410. pymupdf.message()
  411. return
  412. if not names:
  413. pymupdf.message("'%s' contains no embedded files" % doc.name)
  414. return
  415. if len(names) > 1:
  416. msg = "'%s' contains the following %i embedded files" % (doc.name, len(names))
  417. else:
  418. msg = "'%s' contains the following embedded file" % doc.name
  419. pymupdf.message(msg)
  420. pymupdf.message()
  421. for name in names:
  422. if not args.detail:
  423. pymupdf.message(name)
  424. continue
  425. _ = doc.embfile_info(name)
  426. print_dict(doc.embfile_info(name))
  427. pymupdf.message()
  428. doc.close()
  429. def extract_objects(args):
  430. """Extract images and / or fonts from a PDF."""
  431. if not args.fonts and not args.images:
  432. sys.exit("neither fonts nor images requested")
  433. doc = open_file(args.input, args.password, pdf=True)
  434. if args.pages:
  435. pages = get_list(args.pages, doc.page_count + 1)
  436. else:
  437. pages = range(1, doc.page_count + 1)
  438. if not args.output:
  439. out_dir = os.path.abspath(os.curdir)
  440. else:
  441. out_dir = args.output
  442. if not (os.path.exists(out_dir) and os.path.isdir(out_dir)):
  443. sys.exit("output directory %s does not exist" % out_dir)
  444. font_xrefs = set() # already saved fonts
  445. image_xrefs = set() # already saved images
  446. for pno in pages:
  447. if args.fonts:
  448. itemlist = doc.get_page_fonts(pno - 1)
  449. for item in itemlist:
  450. xref = item[0]
  451. if xref not in font_xrefs:
  452. font_xrefs.add(xref)
  453. fontname, ext, _, buffer = doc.extract_font(xref)
  454. if ext == "n/a" or not buffer:
  455. continue
  456. outname = os.path.join(
  457. out_dir, f"{fontname.replace(' ', '-')}-{xref}.{ext}"
  458. )
  459. with open(outname, "wb") as outfile:
  460. outfile.write(buffer)
  461. buffer = None
  462. if args.images:
  463. itemlist = doc.get_page_images(pno - 1)
  464. for item in itemlist:
  465. xref = item[0]
  466. if xref not in image_xrefs:
  467. image_xrefs.add(xref)
  468. pix = recoverpix(doc, item)
  469. if type(pix) is dict:
  470. ext = pix["ext"]
  471. imgdata = pix["image"]
  472. outname = os.path.join(out_dir, "img-%i.%s" % (xref, ext))
  473. with open(outname, "wb") as outfile:
  474. outfile.write(imgdata)
  475. else:
  476. outname = os.path.join(out_dir, "img-%i.png" % xref)
  477. pix2 = (
  478. pix
  479. if pix.colorspace.n < 4
  480. else pymupdf.Pixmap(pymupdf.csRGB, pix)
  481. )
  482. pix2.save(outname)
  483. if args.fonts:
  484. pymupdf.message("saved %i fonts to '%s'" % (len(font_xrefs), out_dir))
  485. if args.images:
  486. pymupdf.message("saved %i images to '%s'" % (len(image_xrefs), out_dir))
  487. doc.close()
  488. def page_simple(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
  489. eop = b"\n" if noformfeed else bytes([12])
  490. text = page.get_text("text", flags=flags)
  491. if not text:
  492. if not skip_empty:
  493. textout.write(eop) # write formfeed
  494. return
  495. textout.write(text.encode("utf8", errors="surrogatepass"))
  496. textout.write(eop)
  497. return
  498. def page_blocksort(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
  499. eop = b"\n" if noformfeed else bytes([12])
  500. blocks = page.get_text("blocks", flags=flags)
  501. if blocks == []:
  502. if not skip_empty:
  503. textout.write(eop) # write formfeed
  504. return
  505. blocks.sort(key=lambda b: (b[3], b[0]))
  506. for b in blocks:
  507. textout.write(b[4].encode("utf8", errors="surrogatepass"))
  508. textout.write(eop)
  509. return
  510. def page_layout(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
  511. eop = b"\n" if noformfeed else bytes([12])
  512. # --------------------------------------------------------------------
  513. def find_line_index(values: List[int], value: int) -> int:
  514. """Find the right row coordinate.
  515. Args:
  516. values: (list) y-coordinates of rows.
  517. value: (int) lookup for this value (y-origin of char).
  518. Returns:
  519. y-ccordinate of appropriate line for value.
  520. """
  521. i = bisect.bisect_right(values, value)
  522. if i:
  523. return values[i - 1]
  524. raise RuntimeError("Line for %g not found in %s" % (value, values))
  525. # --------------------------------------------------------------------
  526. def curate_rows(rows: Set[int], GRID) -> List:
  527. rows = list(rows)
  528. rows.sort() # sort ascending
  529. nrows = [rows[0]]
  530. for h in rows[1:]:
  531. if h >= nrows[-1] + GRID: # only keep significant differences
  532. nrows.append(h)
  533. return nrows # curated list of line bottom coordinates
  534. def process_blocks(blocks: List[Dict], page: pymupdf.Page):
  535. rows = set()
  536. page_width = page.rect.width
  537. page_height = page.rect.height
  538. rowheight = page_height
  539. left = page_width
  540. right = 0
  541. chars = []
  542. for block in blocks:
  543. for line in block["lines"]:
  544. if line["dir"] != (1, 0): # ignore non-horizontal text
  545. continue
  546. x0, y0, x1, y1 = line["bbox"]
  547. if y1 < 0 or y0 > page.rect.height: # ignore if outside CropBox
  548. continue
  549. # upd row height
  550. height = y1 - y0
  551. if rowheight > height:
  552. rowheight = height
  553. for span in line["spans"]:
  554. if span["size"] <= fontsize:
  555. continue
  556. for c in span["chars"]:
  557. x0, _, x1, _ = c["bbox"]
  558. cwidth = x1 - x0
  559. ox, oy = c["origin"]
  560. oy = int(round(oy))
  561. rows.add(oy)
  562. ch = c["c"]
  563. if left > ox and ch != " ":
  564. left = ox # update left coordinate
  565. if right < x1:
  566. right = x1 # update right coordinate
  567. # handle ligatures:
  568. if cwidth == 0 and chars != []: # potential ligature
  569. old_ch, old_ox, old_oy, old_cwidth = chars[-1]
  570. if old_oy == oy: # ligature
  571. if old_ch != chr(0xFB00): # previous "ff" char lig?
  572. lig = joinligature(old_ch + ch) # no
  573. # convert to one of the 3-char ligatures:
  574. elif ch == "i":
  575. lig = chr(0xFB03) # "ffi"
  576. elif ch == "l":
  577. lig = chr(0xFB04) # "ffl"
  578. else: # something wrong, leave old char in place
  579. lig = old_ch
  580. chars[-1] = (lig, old_ox, old_oy, old_cwidth)
  581. continue
  582. chars.append((ch, ox, oy, cwidth)) # all chars on page
  583. return chars, rows, left, right, rowheight
  584. def joinligature(lig: str) -> str:
  585. """Return ligature character for a given pair / triple of characters.
  586. Args:
  587. lig: (str) 2/3 characters, e.g. "ff"
  588. Returns:
  589. Ligature, e.g. "ff" -> chr(0xFB00)
  590. """
  591. if lig == "ff":
  592. return chr(0xFB00)
  593. elif lig == "fi":
  594. return chr(0xFB01)
  595. elif lig == "fl":
  596. return chr(0xFB02)
  597. elif lig == "ffi":
  598. return chr(0xFB03)
  599. elif lig == "ffl":
  600. return chr(0xFB04)
  601. elif lig == "ft":
  602. return chr(0xFB05)
  603. elif lig == "st":
  604. return chr(0xFB06)
  605. return lig
  606. # --------------------------------------------------------------------
  607. def make_textline(left, slot, minslot, lchars):
  608. """Produce the text of one output line.
  609. Args:
  610. left: (float) left most coordinate used on page
  611. slot: (float) avg width of one character in any font in use.
  612. minslot: (float) min width for the characters in this line.
  613. chars: (list[tuple]) characters of this line.
  614. Returns:
  615. text: (str) text string for this line
  616. """
  617. text = "" # we output this
  618. old_char = ""
  619. old_x1 = 0 # end coordinate of last char
  620. old_ox = 0 # x-origin of last char
  621. if minslot <= pymupdf.EPSILON:
  622. raise RuntimeError("program error: minslot too small = %g" % minslot)
  623. for c in lchars: # loop over characters
  624. char, ox, _, cwidth = c
  625. ox = ox - left # its (relative) start coordinate
  626. x1 = ox + cwidth # ending coordinate
  627. # eliminate overprint effect
  628. if old_char == char and ox - old_ox <= cwidth * 0.2:
  629. continue
  630. # omit spaces overlapping previous char
  631. if char == " " and (old_x1 - ox) / cwidth > 0.8:
  632. continue
  633. old_char = char
  634. # close enough to previous?
  635. if ox < old_x1 + minslot: # assume char adjacent to previous
  636. text += char # append to output
  637. old_x1 = x1 # new end coord
  638. old_ox = ox # new origin.x
  639. continue
  640. # else next char starts after some gap:
  641. # fill in right number of spaces, so char is positioned
  642. # in the right slot of the line
  643. if char == " ": # rest relevant for non-space only
  644. continue
  645. delta = int(ox / slot) - len(text)
  646. if ox > old_x1 and delta > 1:
  647. text += " " * delta
  648. # now append char
  649. text += char
  650. old_x1 = x1 # new end coordinate
  651. old_ox = ox # new origin
  652. return text.rstrip()
  653. # extract page text by single characters ("rawdict")
  654. blocks = page.get_text("rawdict", flags=flags)["blocks"]
  655. chars, rows, left, right, rowheight = process_blocks(blocks, page)
  656. if chars == []:
  657. if not skip_empty:
  658. textout.write(eop) # write formfeed
  659. return
  660. # compute list of line coordinates - ignoring small (GRID) differences
  661. rows = curate_rows(rows, GRID)
  662. # sort all chars by x-coordinates, so every line will receive char info,
  663. # sorted from left to right.
  664. chars.sort(key=lambda c: c[1])
  665. # populate the lines with their char info
  666. lines = {} # key: y1-ccordinate, value: char list
  667. for c in chars:
  668. _, _, oy, _ = c
  669. y = find_line_index(rows, oy) # y-coord of the right line
  670. lchars = lines.get(y, []) # read line chars so far
  671. lchars.append(c) # append this char
  672. lines[y] = lchars # write back to line
  673. # ensure line coordinates are ascending
  674. keys = list(lines.keys())
  675. keys.sort()
  676. # -------------------------------------------------------------------------
  677. # Compute "char resolution" for the page: the char width corresponding to
  678. # 1 text char position on output - call it 'slot'.
  679. # For each line, compute median of its char widths. The minimum across all
  680. # lines is 'slot'.
  681. # The minimum char width of each line is used to determine if spaces must
  682. # be inserted in between two characters.
  683. # -------------------------------------------------------------------------
  684. slot = right - left
  685. minslots = {}
  686. for k in keys:
  687. lchars = lines[k]
  688. ccount = len(lchars)
  689. if ccount < 2:
  690. minslots[k] = 1
  691. continue
  692. widths = [c[3] for c in lchars]
  693. widths.sort()
  694. this_slot = statistics.median(widths) # take median value
  695. if this_slot < slot:
  696. slot = this_slot
  697. minslots[k] = widths[0]
  698. # compute line advance in text output
  699. rowheight = rowheight * (rows[-1] - rows[0]) / (rowheight * len(rows)) * 1.2
  700. rowpos = rows[0] # first line positioned here
  701. textout.write(b"\n")
  702. for k in keys: # walk through the lines
  703. while rowpos < k: # honor distance between lines
  704. textout.write(b"\n")
  705. rowpos += rowheight
  706. text = make_textline(left, slot, minslots[k], lines[k])
  707. textout.write((text + "\n").encode("utf8", errors="surrogatepass"))
  708. rowpos = k + rowheight
  709. textout.write(eop) # write formfeed
  710. def gettext(args):
  711. doc = open_file(args.input, args.password, pdf=False)
  712. pagel = get_list(args.pages, doc.page_count + 1)
  713. output = args.output
  714. if output is None:
  715. filename, _ = os.path.splitext(doc.name)
  716. output = filename + ".txt"
  717. with open(output, "wb") as textout:
  718. flags = pymupdf.TEXT_PRESERVE_LIGATURES | pymupdf.TEXT_PRESERVE_WHITESPACE
  719. if args.convert_white:
  720. flags ^= pymupdf.TEXT_PRESERVE_WHITESPACE
  721. if args.noligatures:
  722. flags ^= pymupdf.TEXT_PRESERVE_LIGATURES
  723. if args.extra_spaces:
  724. flags ^= pymupdf.TEXT_INHIBIT_SPACES
  725. func = {
  726. "simple": page_simple,
  727. "blocks": page_blocksort,
  728. "layout": page_layout,
  729. }
  730. for pno in pagel:
  731. page = doc[pno - 1]
  732. func[args.mode](
  733. page,
  734. textout,
  735. args.grid,
  736. args.fontsize,
  737. args.noformfeed,
  738. args.skip_empty,
  739. flags=flags,
  740. )
  741. def _internal(args):
  742. pymupdf.message('This is from PyMuPDF message().')
  743. pymupdf.log('This is from PyMuPDF log().')
  744. def main():
  745. """Define command configurations."""
  746. parser = argparse.ArgumentParser(
  747. prog="pymupdf",
  748. description=mycenter("Basic PyMuPDF Functions"),
  749. )
  750. subps = parser.add_subparsers(
  751. title="Subcommands", help="Enter 'command -h' for subcommand specific help"
  752. )
  753. # -------------------------------------------------------------------------
  754. # 'show' command
  755. # -------------------------------------------------------------------------
  756. ps_show = subps.add_parser("show", description=mycenter("display PDF information"))
  757. ps_show.add_argument("input", type=str, help="PDF filename")
  758. ps_show.add_argument("-password", help="password")
  759. ps_show.add_argument("-catalog", action="store_true", help="show PDF catalog")
  760. ps_show.add_argument("-trailer", action="store_true", help="show PDF trailer")
  761. ps_show.add_argument("-metadata", action="store_true", help="show PDF metadata")
  762. ps_show.add_argument(
  763. "-xrefs", type=str, help="show selected objects, format: 1,5-7,N"
  764. )
  765. ps_show.add_argument(
  766. "-pages", type=str, help="show selected pages, format: 1,5-7,50-N"
  767. )
  768. ps_show.set_defaults(func=show)
  769. # -------------------------------------------------------------------------
  770. # 'clean' command
  771. # -------------------------------------------------------------------------
  772. ps_clean = subps.add_parser(
  773. "clean", description=mycenter("optimize PDF, or create sub-PDF if pages given")
  774. )
  775. ps_clean.add_argument("input", type=str, help="PDF filename")
  776. ps_clean.add_argument("output", type=str, help="output PDF filename")
  777. ps_clean.add_argument("-password", help="password")
  778. ps_clean.add_argument(
  779. "-encryption",
  780. help="encryption method",
  781. choices=("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256"),
  782. default="none",
  783. )
  784. ps_clean.add_argument("-owner", type=str, help="owner password")
  785. ps_clean.add_argument("-user", type=str, help="user password")
  786. ps_clean.add_argument(
  787. "-garbage",
  788. type=int,
  789. help="garbage collection level",
  790. choices=range(5),
  791. default=0,
  792. )
  793. ps_clean.add_argument(
  794. "-compress",
  795. action="store_true",
  796. default=False,
  797. help="compress (deflate) output",
  798. )
  799. ps_clean.add_argument(
  800. "-ascii", action="store_true", default=False, help="ASCII encode binary data"
  801. )
  802. ps_clean.add_argument(
  803. "-linear",
  804. action="store_true",
  805. default=False,
  806. help="format for fast web display",
  807. )
  808. ps_clean.add_argument(
  809. "-permission", type=int, default=-1, help="integer with permission levels"
  810. )
  811. ps_clean.add_argument(
  812. "-sanitize",
  813. action="store_true",
  814. default=False,
  815. help="sanitize / clean contents",
  816. )
  817. ps_clean.add_argument(
  818. "-pretty", action="store_true", default=False, help="prettify PDF structure"
  819. )
  820. ps_clean.add_argument(
  821. "-pages", help="output selected pages pages, format: 1,5-7,50-N"
  822. )
  823. ps_clean.set_defaults(func=clean)
  824. # -------------------------------------------------------------------------
  825. # 'join' command
  826. # -------------------------------------------------------------------------
  827. ps_join = subps.add_parser(
  828. "join",
  829. description=mycenter("join PDF documents"),
  830. epilog="specify each input as 'filename[,password[,pages]]'",
  831. )
  832. ps_join.add_argument("input", nargs="*", help="input filenames")
  833. ps_join.add_argument("-output", required=True, help="output filename")
  834. ps_join.set_defaults(func=doc_join)
  835. # -------------------------------------------------------------------------
  836. # 'extract' command
  837. # -------------------------------------------------------------------------
  838. ps_extract = subps.add_parser(
  839. "extract", description=mycenter("extract images and fonts to disk")
  840. )
  841. ps_extract.add_argument("input", type=str, help="PDF filename")
  842. ps_extract.add_argument("-images", action="store_true", help="extract images")
  843. ps_extract.add_argument("-fonts", action="store_true", help="extract fonts")
  844. ps_extract.add_argument(
  845. "-output", help="folder to receive output, defaults to current"
  846. )
  847. ps_extract.add_argument("-password", help="password")
  848. ps_extract.add_argument(
  849. "-pages", type=str, help="consider these pages only, format: 1,5-7,50-N"
  850. )
  851. ps_extract.set_defaults(func=extract_objects)
  852. # -------------------------------------------------------------------------
  853. # 'embed-info'
  854. # -------------------------------------------------------------------------
  855. ps_show = subps.add_parser(
  856. "embed-info", description=mycenter("list embedded files")
  857. )
  858. ps_show.add_argument("input", help="PDF filename")
  859. ps_show.add_argument("-name", help="if given, report only this one")
  860. ps_show.add_argument("-detail", action="store_true", help="detail information")
  861. ps_show.add_argument("-password", help="password")
  862. ps_show.set_defaults(func=embedded_list)
  863. # -------------------------------------------------------------------------
  864. # 'embed-add' command
  865. # -------------------------------------------------------------------------
  866. ps_embed_add = subps.add_parser(
  867. "embed-add", description=mycenter("add embedded file")
  868. )
  869. ps_embed_add.add_argument("input", help="PDF filename")
  870. ps_embed_add.add_argument("-password", help="password")
  871. ps_embed_add.add_argument(
  872. "-output", help="output PDF filename, incremental save if none"
  873. )
  874. ps_embed_add.add_argument("-name", required=True, help="name of new entry")
  875. ps_embed_add.add_argument("-path", required=True, help="path to data for new entry")
  876. ps_embed_add.add_argument("-desc", help="description of new entry")
  877. ps_embed_add.set_defaults(func=embedded_add)
  878. # -------------------------------------------------------------------------
  879. # 'embed-del' command
  880. # -------------------------------------------------------------------------
  881. ps_embed_del = subps.add_parser(
  882. "embed-del", description=mycenter("delete embedded file")
  883. )
  884. ps_embed_del.add_argument("input", help="PDF filename")
  885. ps_embed_del.add_argument("-password", help="password")
  886. ps_embed_del.add_argument(
  887. "-output", help="output PDF filename, incremental save if none"
  888. )
  889. ps_embed_del.add_argument("-name", required=True, help="name of entry to delete")
  890. ps_embed_del.set_defaults(func=embedded_del)
  891. # -------------------------------------------------------------------------
  892. # 'embed-upd' command
  893. # -------------------------------------------------------------------------
  894. ps_embed_upd = subps.add_parser(
  895. "embed-upd",
  896. description=mycenter("update embedded file"),
  897. epilog="except '-name' all parameters are optional",
  898. )
  899. ps_embed_upd.add_argument("input", help="PDF filename")
  900. ps_embed_upd.add_argument("-name", required=True, help="name of entry")
  901. ps_embed_upd.add_argument("-password", help="password")
  902. ps_embed_upd.add_argument(
  903. "-output", help="Output PDF filename, incremental save if none"
  904. )
  905. ps_embed_upd.add_argument("-path", help="path to new data for entry")
  906. ps_embed_upd.add_argument("-filename", help="new filename to store in entry")
  907. ps_embed_upd.add_argument(
  908. "-ufilename", help="new unicode filename to store in entry"
  909. )
  910. ps_embed_upd.add_argument("-desc", help="new description to store in entry")
  911. ps_embed_upd.set_defaults(func=embedded_upd)
  912. # -------------------------------------------------------------------------
  913. # 'embed-extract' command
  914. # -------------------------------------------------------------------------
  915. ps_embed_extract = subps.add_parser(
  916. "embed-extract", description=mycenter("extract embedded file to disk")
  917. )
  918. ps_embed_extract.add_argument("input", type=str, help="PDF filename")
  919. ps_embed_extract.add_argument("-name", required=True, help="name of entry")
  920. ps_embed_extract.add_argument("-password", help="password")
  921. ps_embed_extract.add_argument("-unsafe", default=False, action="store_true",
  922. help="allow write to stored name even if an existing file or outside current directory"
  923. )
  924. ps_embed_extract.add_argument(
  925. "-output", help="output filename, default is stored name"
  926. )
  927. ps_embed_extract.set_defaults(func=embedded_get)
  928. # -------------------------------------------------------------------------
  929. # 'embed-copy' command
  930. # -------------------------------------------------------------------------
  931. ps_embed_copy = subps.add_parser(
  932. "embed-copy", description=mycenter("copy embedded files between PDFs")
  933. )
  934. ps_embed_copy.add_argument("input", type=str, help="PDF to receive embedded files")
  935. ps_embed_copy.add_argument("-password", help="password of input")
  936. ps_embed_copy.add_argument(
  937. "-output", help="output PDF, incremental save to 'input' if omitted"
  938. )
  939. ps_embed_copy.add_argument(
  940. "-source", required=True, help="copy embedded files from here"
  941. )
  942. ps_embed_copy.add_argument("-pwdsource", help="password of 'source' PDF")
  943. ps_embed_copy.add_argument(
  944. "-name", nargs="*", help="restrict copy to these entries"
  945. )
  946. ps_embed_copy.set_defaults(func=embedded_copy)
  947. # -------------------------------------------------------------------------
  948. # 'textlayout' command
  949. # -------------------------------------------------------------------------
  950. ps_gettext = subps.add_parser(
  951. "gettext", description=mycenter("extract text in various formatting modes")
  952. )
  953. ps_gettext.add_argument("input", type=str, help="input document filename")
  954. ps_gettext.add_argument("-password", help="password for input document")
  955. ps_gettext.add_argument(
  956. "-mode",
  957. type=str,
  958. help="mode: simple, block sort, or layout (default)",
  959. choices=("simple", "blocks", "layout"),
  960. default="layout",
  961. )
  962. ps_gettext.add_argument(
  963. "-pages",
  964. type=str,
  965. help="select pages, format: 1,5-7,50-N",
  966. default="1-N",
  967. )
  968. ps_gettext.add_argument(
  969. "-noligatures",
  970. action="store_true",
  971. help="expand ligature characters (default False)",
  972. default=False,
  973. )
  974. ps_gettext.add_argument(
  975. "-convert-white",
  976. action="store_true",
  977. help="convert whitespace characters to white (default False)",
  978. default=False,
  979. )
  980. ps_gettext.add_argument(
  981. "-extra-spaces",
  982. action="store_true",
  983. help="fill gaps with spaces (default False)",
  984. default=False,
  985. )
  986. ps_gettext.add_argument(
  987. "-noformfeed",
  988. action="store_true",
  989. help="write linefeeds, no formfeeds (default False)",
  990. default=False,
  991. )
  992. ps_gettext.add_argument(
  993. "-skip-empty",
  994. action="store_true",
  995. help="suppress pages with no text (default False)",
  996. default=False,
  997. )
  998. ps_gettext.add_argument(
  999. "-output",
  1000. help="store text in this file (default inputfilename.txt)",
  1001. )
  1002. ps_gettext.add_argument(
  1003. "-grid",
  1004. type=float,
  1005. help="merge lines if closer than this (default 2)",
  1006. default=2,
  1007. )
  1008. ps_gettext.add_argument(
  1009. "-fontsize",
  1010. type=float,
  1011. help="only include text with a larger fontsize (default 3)",
  1012. default=3,
  1013. )
  1014. ps_gettext.set_defaults(func=gettext)
  1015. # -------------------------------------------------------------------------
  1016. # '_internal' command
  1017. # -------------------------------------------------------------------------
  1018. ps_internal = subps.add_parser(
  1019. "internal", description=mycenter("internal testing")
  1020. )
  1021. ps_internal.set_defaults(func=_internal)
  1022. # -------------------------------------------------------------------------
  1023. # start program
  1024. # -------------------------------------------------------------------------
  1025. args = parser.parse_args() # create parameter arguments class
  1026. if not hasattr(args, "func"): # no function selected
  1027. parser.print_help() # so print top level help
  1028. else:
  1029. args.func(args) # execute requested command
  1030. if __name__ == "__main__":
  1031. main()