table.py 92 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697
  1. """
  2. Copyright (C) 2023 Artifex Software, Inc.
  3. This file is part of PyMuPDF.
  4. PyMuPDF is free software: you can redistribute it and/or modify it under the
  5. terms of the GNU Affero General Public License as published by the Free
  6. Software Foundation, either version 3 of the License, or (at your option)
  7. any later version.
  8. PyMuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
  9. WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  10. FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
  11. details.
  12. You should have received a copy of the GNU Affero General Public License
  13. along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
  14. Alternative licensing terms are available from the licensor.
  15. For commercial licensing, see <https://www.artifex.com/> or contact
  16. Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
  17. CA 94129, USA, for further information.
  18. ---------------------------------------------------------------------
  19. Portions of this code have been ported from pdfplumber, see
  20. https://pypi.org/project/pdfplumber/.
  21. The ported code is under the following MIT license:
  22. ---------------------------------------------------------------------
  23. The MIT License (MIT)
  24. Copyright (c) 2015, Jeremy Singer-Vine
  25. Permission is hereby granted, free of charge, to any person obtaining a copy
  26. of this software and associated documentation files (the "Software"), to deal
  27. in the Software without restriction, including without limitation the rights
  28. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  29. copies of the Software, and to permit persons to whom the Software is
  30. furnished to do so, subject to the following conditions:
  31. The above copyright notice and this permission notice shall be included in all
  32. copies or substantial portions of the Software.
  33. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  34. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  35. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  36. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  37. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  38. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  39. SOFTWARE.
  40. ---------------------------------------------------------------------
  41. Also see here: https://github.com/jsvine/pdfplumber/blob/stable/LICENSE.txt
  42. ---------------------------------------------------------------------
  43. The porting mainly pertains to files "table.py" and relevant parts of
  44. "utils/text.py" within pdfplumber's repository on Github.
  45. With respect to "text.py", we have removed functions or features that are not
  46. used by table processing. Examples are:
  47. * the text search function
  48. * simple text extraction
  49. * text extraction by lines
  50. Original pdfplumber code neither detects nor identifies table headers.
  51. This PyMuPDF port adds respective code to the 'Table' class as method '_get_header'.
  52. This is implemented as new class TableHeader with the properties:
  53. * bbox: A tuple for the header's bbox
  54. * cells: A tuple for each bbox of a column header
  55. * names: A list of strings with column header text
  56. * external: A bool indicating whether the header is outside the table cells.
  57. """
  58. import inspect
  59. import itertools
  60. import string
  61. import html
  62. from collections.abc import Sequence
  63. from dataclasses import dataclass
  64. from operator import itemgetter
  65. import weakref
  66. import pymupdf
  67. from pymupdf import mupdf
  68. # -------------------------------------------------------------------
  69. # Start of PyMuPDF interface code
  70. # -------------------------------------------------------------------
# Module-level containers shared by the PyMuPDF interface code.
# NOTE(review): the code that fills them is not in this part of the file -
# presumably the table finding entry point does; confirm there.
EDGES = []  # vector graphics from PyMuPDF
CHARS = []  # text characters from PyMuPDF
TEXTPAGE = None  # NOTE(review): presumably the current TextPage - confirm

# MuPDF character style bits. extract_cells() tests span["char_flags"]
# against them to emit Markdown bold / strikeout markers.
TEXT_BOLD = mupdf.FZ_STEXT_BOLD
TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT

# Text extraction flags. extract_cells() documents that its textpage must be
# created with TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES, both of which are
# included here. The leading 0 only keeps every flag on its own "|" line.
FLAGS = (
    0
    | pymupdf.TEXTFLAGS_TEXT
    | pymupdf.TEXT_COLLECT_STYLES
    | pymupdf.TEXT_ACCURATE_BBOXES
    | pymupdf.TEXT_MEDIABOX_CLIP
)

# needed by mupdf function fz_find_table_within_bounds().
TABLE_DETECTOR_FLAGS = (
    0
    | pymupdf.TEXT_ACCURATE_BBOXES
    | pymupdf.TEXT_SEGMENT
    | pymupdf.TEXT_COLLECT_VECTORS
    | pymupdf.TEXT_MEDIABOX_CLIP
)

# Set (not string) so the per-character membership test in extract_cells()
# is a hash lookup.
white_spaces = set(string.whitespace)  # for checking white space only cells
  92. def _iou(r1, r2):
  93. """Compute intersection over union of two rectangles."""
  94. ix = max(0, min(r1[2], r2[2]) - max(r1[0], r2[0]))
  95. iy = max(0, min(r1[3], r2[3]) - max(r1[1], r2[1]))
  96. intersection = ix * iy # intersection area
  97. if not intersection:
  98. return 0
  99. area1 = (r1[2] - r1[0]) * (r1[3] - r1[1])
  100. area2 = (r2[2] - r2[0]) * (r2[3] - r2[1])
  101. return intersection / (area1 + area2 - intersection)
  102. def intersects_words_h(bbox, y, word_rects) -> bool:
  103. """Check whether any of the words in bbox are cut through by
  104. horizontal line y.
  105. """
  106. return any(r.y0 < y < r.y1 for r in word_rects if r in bbox)
  107. def get_table_dict_from_rect(textpage, rect):
  108. """Extract MuPDF table structure information from a given rectangle."""
  109. table_dict = {}
  110. pymupdf.extra.make_table_dict(textpage.this.m_internal, table_dict, rect)
  111. return table_dict
  112. def make_table_from_bbox(textpage, word_rects, rect):
  113. """Detect table structure within a given rectangle."""
  114. cells = [] # table cells as (x0,y0,x1,y1) tuples
  115. # calls fz_find_table_within_bounds
  116. block = get_table_dict_from_rect(textpage, rect)
  117. # No table structure found if not a grid block
  118. if block.get("type") != mupdf.FZ_STEXT_BLOCK_GRID:
  119. return cells
  120. bbox = pymupdf.Rect(block["bbox"]) # resulting table bbox
  121. # lists of (pos,uncertainty) tuples
  122. xpos = sorted(block["xpos"], key=lambda x: x[0])
  123. ypos = sorted(block["ypos"], key=lambda y: y[0])
  124. # maximum uncertainties in x and y directions
  125. xmaxu, ymaxu = block["max_uncertain"]
  126. # Modify ypos to remove uncertain positions, and y positions
  127. # that cut through words.
  128. nypos = []
  129. for y, yunc in ypos:
  130. if yunc > 0: # allow no uncertain y values
  131. continue
  132. if intersects_words_h(bbox, y, word_rects):
  133. continue # allow no y that cuts through words
  134. if nypos and (y - nypos[-1] < 3):
  135. nypos[-1] = y # snap close positions
  136. else:
  137. nypos.append(y)
  138. # New max y uncertainty: 35% of remaining y positions.
  139. # Omit x positions that intersect too many words, otherwise
  140. # only remove x for the affected cells.
  141. ymaxu = max(0, round((len(nypos) - 2) * 0.35))
  142. # Exclude x positions with too high uncertainty
  143. # (we allow more uncertainty in x direction)
  144. nxpos = [x[0] for x in xpos if x[1] <= ymaxu]
  145. if bbox.x1 > nxpos[-1] + 3:
  146. nxpos.append(bbox.x1) # ensure right table border
  147. # Compose cells from the remaining x and y positions.
  148. for i in range(len(nypos) - 1):
  149. row_box = pymupdf.Rect(bbox.x0, nypos[i], bbox.x1, nypos[i + 1])
  150. # Sub-select words in this row and sort them by left coordinate
  151. row_words = sorted([r for r in word_rects if r in row_box], key=lambda r: r.x0)
  152. # Sub-select x values that do not cut through words
  153. this_xpos = [x for x in nxpos if not any(r.x0 < x < r.x1 for r in row_words)]
  154. for j in range(len(this_xpos) - 1):
  155. cell = pymupdf.Rect(this_xpos[j], nypos[i], this_xpos[j + 1], nypos[i + 1])
  156. if not cell.is_empty: # valid cell
  157. cells.append(tuple(cell))
  158. # Add new table to TableFinder tables
  159. return cells
def extract_cells(textpage, cell, markdown=False) -> str:
    """Extract text from a rect-like 'cell' as plain or MD styled text.

    This function should ultimately be used to extract text from a table cell.
    Markdown output will only work correctly if extraction flag bit
    TEXT_COLLECT_STYLES is set.

    Blocks, lines and spans whose bbox lies completely outside the cell are
    skipped. From the remaining spans, a character is taken if more than half
    of its bbox area lies inside the cell. Lines are separated by a line
    break ("<br>" for Markdown, "\\n" otherwise).

    Args:
        textpage: A PyMuPDF TextPage object. Must have been created with
            TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES.
        cell: A tuple (x0, y0, x1, y1) defining the cell's bbox.
        markdown: If True, return text formatted for Markdown.

    Returns:
        A string with the text extracted from the cell, stripped of leading
        and trailing white space.
    """
    text = ""
    for block in textpage.extractRAWDICT()["blocks"]:
        if block["type"] != 0:  # only type 0 blocks are processed
            continue
        block_bbox = block["bbox"]
        # The leading 0 is a no-op that lets every real condition start
        # with "or".
        if (
            0
            or block_bbox[0] > cell[2]
            or block_bbox[2] < cell[0]
            or block_bbox[1] > cell[3]
            or block_bbox[3] < cell[1]
        ):
            continue  # skip block outside cell
        for line in block["lines"]:
            lbbox = line["bbox"]
            if (
                0
                or lbbox[0] > cell[2]
                or lbbox[2] < cell[0]
                or lbbox[1] > cell[3]
                or lbbox[3] < cell[1]
            ):
                continue  # skip line outside cell

            if text:  # must be a new line in the cell
                text += "<br>" if markdown else "\n"

            # strikeout detection only works with horizontal text
            horizontal = line["dir"] == (0, 1) or line["dir"] == (1, 0)

            for span in line["spans"]:
                sbbox = span["bbox"]
                if (
                    0
                    or sbbox[0] > cell[2]
                    or sbbox[2] < cell[0]
                    or sbbox[1] > cell[3]
                    or sbbox[3] < cell[1]
                ):
                    continue  # skip spans outside cell

                # only include chars with more than 50% bbox overlap
                span_text = ""
                for char in span["chars"]:
                    this_char = char["c"]
                    bbox = pymupdf.Rect(char["bbox"])
                    if abs(bbox & cell) > 0.5 * abs(bbox):
                        span_text += this_char
                    elif this_char in white_spaces:
                        # white space failing the overlap test still
                        # contributes a single blank
                        span_text += " "

                if not span_text:
                    continue  # skip empty span

                if not markdown:  # no MD styling
                    text += span_text
                    continue

                # Build the Markdown markers. Each style appends to the
                # prefix and prepends to the suffix, so the suffix closes the
                # markers in reverse order of opening.
                prefix = ""
                suffix = ""
                if horizontal and span["char_flags"] & TEXT_STRIKEOUT:
                    prefix += "~~"
                    suffix = "~~" + suffix
                if span["char_flags"] & TEXT_BOLD:
                    prefix += "**"
                    suffix = "**" + suffix
                if span["flags"] & pymupdf.TEXT_FONT_ITALIC:
                    prefix += "_"
                    suffix = "_" + suffix
                if span["flags"] & pymupdf.TEXT_FONT_MONOSPACED:
                    prefix += "`"
                    suffix = "`" + suffix

                # Trailing white space is removed for spans with more than
                # two characters (counting all characters of the span, not
                # only the accepted ones).
                if len(span["chars"]) > 2:
                    span_text = span_text.rstrip()

                # if span continues previous styling: extend cell text
                # (never taken for unstyled spans, because ls is 0 then)
                if (ls := len(suffix)) and text.endswith(suffix):
                    text = text[:-ls] + span_text + suffix
                else:  # append the span with new styling
                    if not span_text.strip():
                        # white space only: add a blank without markers
                        text += " "
                    else:
                        text += prefix + span_text + suffix

    return text.strip()
  249. # -------------------------------------------------------------------
  250. # End of PyMuPDF interface code
  251. # -------------------------------------------------------------------
  252. class UnsetFloat(float):
  253. pass
# Names of settings that must not be negative.
# NOTE(review): the check itself is not in this part of the file - presumably
# done by the table settings code; confirm there.
NON_NEGATIVE_SETTINGS = [
    "snap_tolerance",
    "snap_x_tolerance",
    "snap_y_tolerance",
    "join_tolerance",
    "join_x_tolerance",
    "join_y_tolerance",
    "edge_min_length",
    "min_words_vertical",
    "min_words_horizontal",
    "intersection_tolerance",
    "intersection_x_tolerance",
    "intersection_y_tolerance",
]

# Strategy names. NOTE(review): presumably the accepted values of the
# vertical / horizontal strategy settings - confirm against their users.
TABLE_STRATEGIES = ["lines", "lines_strict", "text", "explicit"]

# Numerically 0, but of type UnsetFloat.
UNSET = UnsetFloat(0)

# Defaults. NOTE(review): the snap / join / min_words values are presumably
# consumed by the table settings code, which is not in this part of the file.
DEFAULT_SNAP_TOLERANCE = 3
DEFAULT_JOIN_TOLERANCE = 3
DEFAULT_MIN_WORDS_VERTICAL = 3
DEFAULT_MIN_WORDS_HORIZONTAL = 1
# Default tolerances of WordExtractor; Y is also used by WordMap.to_textmap().
DEFAULT_X_TOLERANCE = 3
DEFAULT_Y_TOLERANCE = 3
# Default divisors used by WordMap.to_textmap() to convert horizontal /
# vertical distances into numbers of characters / newlines.
DEFAULT_X_DENSITY = 7.25
DEFAULT_Y_DENSITY = 13

# Returns the tuple (x0, top, x1, bottom) of a dict-like object.
bbox_getter = itemgetter("x0", "top", "x1", "bottom")
  279. LIGATURES = {
  280. "ff": "ff",
  281. "ffi": "ffi",
  282. "ffl": "ffl",
  283. "fi": "fi",
  284. "fl": "fl",
  285. "st": "st",
  286. "ſt": "st",
  287. }
  288. def to_list(collection) -> list:
  289. if isinstance(collection, list):
  290. return collection
  291. elif isinstance(collection, Sequence):
  292. return list(collection)
  293. elif hasattr(collection, "to_dict"):
  294. res = collection.to_dict("records") # pragma: nocover
  295. return res
  296. else:
  297. return list(collection)
  298. class TextMap:
  299. """
  300. A TextMap maps each unicode character in the text to an individual `char`
  301. object (or, in the case of layout-implied whitespace, `None`).
  302. """
  303. def __init__(self, tuples=None) -> None:
  304. self.tuples = tuples
  305. self.as_string = "".join(map(itemgetter(0), tuples))
  306. def match_to_dict(
  307. self,
  308. m,
  309. main_group: int = 0,
  310. return_groups: bool = True,
  311. return_chars: bool = True,
  312. ) -> dict:
  313. subset = self.tuples[m.start(main_group) : m.end(main_group)]
  314. chars = [c for (text, c) in subset if c is not None]
  315. x0, top, x1, bottom = objects_to_bbox(chars)
  316. result = {
  317. "text": m.group(main_group),
  318. "x0": x0,
  319. "top": top,
  320. "x1": x1,
  321. "bottom": bottom,
  322. }
  323. if return_groups:
  324. result["groups"] = m.groups()
  325. if return_chars:
  326. result["chars"] = chars
  327. return result
class WordMap:
    """
    A WordMap maps words->chars.
    """

    def __init__(self, tuples) -> None:
        # tuples: list of (word, chars) tuples, see to_textmap()
        self.tuples = tuples

    def to_textmap(
        self,
        layout: bool = False,
        layout_width=0,
        layout_height=0,
        layout_width_chars: int = 0,
        layout_height_chars: int = 0,
        x_density=DEFAULT_X_DENSITY,
        y_density=DEFAULT_Y_DENSITY,
        x_shift=0,
        y_shift=0,
        y_tolerance=DEFAULT_Y_TOLERANCE,
        use_text_flow: bool = False,
        presorted: bool = False,
        expand_ligatures: bool = True,
    ) -> TextMap:
        """
        Given a list of (word, chars) tuples (i.e., a WordMap), return a list of
        (char-text, char) tuples (i.e., a TextMap) that can be used to mimic the
        structural layout of the text on the page(s), using the following approach:

        - Sort the words by (doctop, x0) if not already sorted.

        - Calculate the initial doctop for the starting page.

        - Cluster the words by doctop (taking `y_tolerance` into account), and
          iterate through them.

        - For each cluster, calculate the distance between that doctop and the
          initial doctop, in points, minus `y_shift`. Divide that distance by
          `y_density` to calculate the minimum number of newlines that should come
          before this cluster. Append that number of newlines *minus* the number of
          newlines already appended, with a minimum of one.

        - Then for each cluster, iterate through each word in it. Divide each
          word's x0, minus `x_shift`, by `x_density` to calculate the minimum
          number of characters that should come before this cluster. Append that
          number of spaces *minus* the number of characters and spaces already
          appended, with a minimum of one. Then append the word's text.

        - At the termination of each line, add more spaces if necessary to
          mimic `layout_width`.

        - Finally, add newlines to the end if necessary to mimic to
          `layout_height`.

        Note: This approach currently works best for horizontal, left-to-right
        text, but will display all words regardless of orientation. There is room
        for improvement in better supporting right-to-left text, as well as
        vertical text.

        Raises:
            ValueError: if `layout` is set and both a point-based size and its
                character-based counterpart are given (width or height).
        """
        _textmap = []

        if not len(self.tuples):
            return TextMap(_textmap)

        expansions = LIGATURES if expand_ligatures else {}

        if layout:
            # The layout size is either given in characters or derived from
            # the size in points and the density - never both.
            if layout_width_chars:
                if layout_width:
                    raise ValueError(
                        "`layout_width` and `layout_width_chars` cannot both be set."
                    )
            else:
                layout_width_chars = int(round(layout_width / x_density))

            if layout_height_chars:
                if layout_height:
                    raise ValueError(
                        "`layout_height` and `layout_height_chars` cannot both be set."
                    )
            else:
                layout_height_chars = int(round(layout_height / y_density))

            blank_line = [(" ", None)] * layout_width_chars
        else:
            blank_line = []

        num_newlines = 0  # newlines emitted so far

        words_sorted_doctop = (
            self.tuples
            if presorted or use_text_flow
            else sorted(self.tuples, key=lambda x: float(x[0]["doctop"]))
        )

        first_word = words_sorted_doctop[0][0]
        # difference between document and page coordinates of the first word
        doctop_start = first_word["doctop"] - first_word["top"]

        for i, ws in enumerate(
            cluster_objects(
                words_sorted_doctop, lambda x: float(x[0]["doctop"]), y_tolerance
            )
        ):
            y_dist = (
                (ws[0][0]["doctop"] - (doctop_start + y_shift)) / y_density
                if layout
                else 0
            )
            num_newlines_prepend = max(
                # At least one newline, unless this is the first line
                int(i > 0),
                # ... or as many as needed to get the imputed "distance" from the top
                round(y_dist) - num_newlines,
            )

            # Reusing the name `i` is harmless here: the outer value has
            # already been evaluated and enumerate() rebinds it.
            for i in range(num_newlines_prepend):
                # an empty line is filled with blanks before it is terminated
                if not len(_textmap) or _textmap[-1][0] == "\n":
                    _textmap += blank_line
                _textmap.append(("\n", None))

            num_newlines += num_newlines_prepend

            line_len = 0  # characters emitted so far on the current line

            line_words_sorted_x0 = (
                ws
                if presorted or use_text_flow
                else sorted(ws, key=lambda x: float(x[0]["x0"]))
            )

            for word, chars in line_words_sorted_x0:
                x_dist = (word["x0"] - x_shift) / x_density if layout else 0
                # min(1, line_len): no separating blank before the first word
                num_spaces_prepend = max(min(1, line_len), round(x_dist) - line_len)
                _textmap += [(" ", None)] * num_spaces_prepend
                line_len += num_spaces_prepend

                for c in chars:
                    # every letter of an expanded ligature refers to the
                    # same char object
                    letters = expansions.get(c["text"], c["text"])
                    for letter in letters:
                        _textmap.append((letter, c))
                        line_len += 1

            # Append spaces at end of line
            if layout:
                _textmap += [(" ", None)] * (layout_width_chars - line_len)

        # Append blank lines at end of text
        if layout:
            num_newlines_append = layout_height_chars - (num_newlines + 1)
            for i in range(num_newlines_append):
                if i > 0:
                    _textmap += blank_line
                _textmap.append(("\n", None))

            # Remove terminal newline
            if _textmap[-1] == ("\n", None):
                _textmap = _textmap[:-1]

        return TextMap(_textmap)
  458. class WordExtractor:
  459. def __init__(
  460. self,
  461. x_tolerance=DEFAULT_X_TOLERANCE,
  462. y_tolerance=DEFAULT_Y_TOLERANCE,
  463. keep_blank_chars: bool = False,
  464. use_text_flow=False,
  465. horizontal_ltr=True, # Should words be read left-to-right?
  466. vertical_ttb=False, # Should vertical words be read top-to-bottom?
  467. extra_attrs=None,
  468. split_at_punctuation=False,
  469. expand_ligatures=True,
  470. ):
  471. self.x_tolerance = x_tolerance
  472. self.y_tolerance = y_tolerance
  473. self.keep_blank_chars = keep_blank_chars
  474. self.use_text_flow = use_text_flow
  475. self.horizontal_ltr = horizontal_ltr
  476. self.vertical_ttb = vertical_ttb
  477. self.extra_attrs = [] if extra_attrs is None else extra_attrs
  478. # Note: string.punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
  479. self.split_at_punctuation = (
  480. string.punctuation
  481. if split_at_punctuation is True
  482. else (split_at_punctuation or "")
  483. )
  484. self.expansions = LIGATURES if expand_ligatures else {}
  485. def merge_chars(self, ordered_chars: list):
  486. x0, top, x1, bottom = objects_to_bbox(ordered_chars)
  487. doctop_adj = ordered_chars[0]["doctop"] - ordered_chars[0]["top"]
  488. upright = ordered_chars[0]["upright"]
  489. direction = 1 if (self.horizontal_ltr if upright else self.vertical_ttb) else -1
  490. matrix = ordered_chars[0]["matrix"]
  491. rotation = 0
  492. if not upright and matrix[1] < 0:
  493. ordered_chars = reversed(ordered_chars)
  494. rotation = 270
  495. if matrix[0] < 0 and matrix[3] < 0:
  496. rotation = 180
  497. elif matrix[1] > 0:
  498. rotation = 90
  499. word = {
  500. "text": "".join(
  501. self.expansions.get(c["text"], c["text"]) for c in ordered_chars
  502. ),
  503. "x0": x0,
  504. "x1": x1,
  505. "top": top,
  506. "doctop": top + doctop_adj,
  507. "bottom": bottom,
  508. "upright": upright,
  509. "direction": direction,
  510. "rotation": rotation,
  511. }
  512. for key in self.extra_attrs:
  513. word[key] = ordered_chars[0][key]
  514. return word
  515. def char_begins_new_word(
  516. self,
  517. prev_char,
  518. curr_char,
  519. ) -> bool:
  520. """This method takes several factors into account to determine if
  521. `curr_char` represents the beginning of a new word:
  522. - Whether the text is "upright" (i.e., non-rotated)
  523. - Whether the user has specified that horizontal text runs
  524. left-to-right (default) or right-to-left, as represented by
  525. self.horizontal_ltr
  526. - Whether the user has specified that vertical text the text runs
  527. top-to-bottom (default) or bottom-to-top, as represented by
  528. self.vertical_ttb
  529. - The x0, top, x1, and bottom attributes of prev_char and
  530. curr_char
  531. - The self.x_tolerance and self.y_tolerance settings. Note: In
  532. this case, x/y refer to those directions for non-rotated text.
  533. For vertical text, they are flipped. A more accurate terminology
  534. might be "*intra*line character distance tolerance" and
  535. "*inter*line character distance tolerance"
  536. An important note: The *intra*line distance is measured from the
  537. *end* of the previous character to the *beginning* of the current
  538. character, while the *inter*line distance is measured from the
  539. *top* of the previous character to the *top* of the next
  540. character. The reasons for this are partly repository-historical,
  541. and partly logical, as successive text lines' bounding boxes often
  542. overlap slightly (and we don't want that overlap to be interpreted
  543. as the two lines being the same line).
  544. The upright-ness of the character determines the attributes to
  545. compare, while horizontal_ltr/vertical_ttb determine the direction
  546. of the comparison.
  547. """
  548. # Note: Due to the grouping step earlier in the process,
  549. # curr_char["upright"] will always equal prev_char["upright"].
  550. if curr_char["upright"]:
  551. x = self.x_tolerance
  552. y = self.y_tolerance
  553. ay = prev_char["top"]
  554. cy = curr_char["top"]
  555. if self.horizontal_ltr:
  556. ax = prev_char["x0"]
  557. bx = prev_char["x1"]
  558. cx = curr_char["x0"]
  559. else:
  560. ax = -prev_char["x1"]
  561. bx = -prev_char["x0"]
  562. cx = -curr_char["x1"]
  563. else:
  564. x = self.y_tolerance
  565. y = self.x_tolerance
  566. ay = prev_char["x0"]
  567. cy = curr_char["x0"]
  568. if self.vertical_ttb:
  569. ax = prev_char["top"]
  570. bx = prev_char["bottom"]
  571. cx = curr_char["top"]
  572. else:
  573. ax = -prev_char["bottom"]
  574. bx = -prev_char["top"]
  575. cx = -curr_char["bottom"]
  576. return bool(
  577. # Intraline test
  578. (cx < ax)
  579. or (cx > bx + x)
  580. # Interline test
  581. or (cy > ay + y)
  582. )
  583. def iter_chars_to_words(self, ordered_chars):
  584. current_word: list = []
  585. def start_next_word(new_char=None):
  586. nonlocal current_word
  587. if current_word:
  588. yield current_word
  589. current_word = [] if new_char is None else [new_char]
  590. for char in ordered_chars:
  591. text = char["text"]
  592. if not self.keep_blank_chars and text.isspace():
  593. yield from start_next_word(None)
  594. elif text in self.split_at_punctuation:
  595. yield from start_next_word(char)
  596. yield from start_next_word(None)
  597. elif current_word and self.char_begins_new_word(current_word[-1], char):
  598. yield from start_next_word(char)
  599. else:
  600. current_word.append(char)
  601. # Finally, after all chars processed
  602. if current_word:
  603. yield current_word
  604. def iter_sort_chars(self, chars):
  605. def upright_key(x) -> int:
  606. return -int(x["upright"])
  607. for upright_cluster in cluster_objects(list(chars), upright_key, 0):
  608. upright = upright_cluster[0]["upright"]
  609. cluster_key = "doctop" if upright else "x0"
  610. # Cluster by line
  611. subclusters = cluster_objects(
  612. upright_cluster, itemgetter(cluster_key), self.y_tolerance
  613. )
  614. for sc in subclusters:
  615. # Sort within line
  616. sort_key = "x0" if upright else "doctop"
  617. to_yield = sorted(sc, key=itemgetter(sort_key))
  618. # Reverse order if necessary
  619. if not (self.horizontal_ltr if upright else self.vertical_ttb):
  620. yield from reversed(to_yield)
  621. else:
  622. yield from to_yield
  623. def iter_extract_tuples(self, chars):
  624. ordered_chars = chars if self.use_text_flow else self.iter_sort_chars(chars)
  625. grouping_key = itemgetter("upright", *self.extra_attrs)
  626. grouped_chars = itertools.groupby(ordered_chars, grouping_key)
  627. for keyvals, char_group in grouped_chars:
  628. for word_chars in self.iter_chars_to_words(char_group):
  629. yield (self.merge_chars(word_chars), word_chars)
  630. def extract_wordmap(self, chars) -> WordMap:
  631. return WordMap(list(self.iter_extract_tuples(chars)))
  632. def extract_words(self, chars: list) -> list:
  633. words = list(word for word, word_chars in self.iter_extract_tuples(chars))
  634. return words
  635. def extract_words(chars: list, **kwargs) -> list:
  636. return WordExtractor(**kwargs).extract_words(chars)
  637. TEXTMAP_KWARGS = inspect.signature(WordMap.to_textmap).parameters.keys()
  638. WORD_EXTRACTOR_KWARGS = inspect.signature(WordExtractor).parameters.keys()
  639. def chars_to_textmap(chars: list, **kwargs) -> TextMap:
  640. kwargs.update({"presorted": True})
  641. extractor = WordExtractor(
  642. **{k: kwargs[k] for k in WORD_EXTRACTOR_KWARGS if k in kwargs}
  643. )
  644. wordmap = extractor.extract_wordmap(chars)
  645. textmap = wordmap.to_textmap(
  646. **{k: kwargs[k] for k in TEXTMAP_KWARGS if k in kwargs}
  647. )
  648. return textmap
  649. def extract_text(chars: list, **kwargs) -> str:
  650. chars = to_list(chars)
  651. if len(chars) == 0:
  652. return ""
  653. if kwargs.get("layout"):
  654. return chars_to_textmap(chars, **kwargs).as_string
  655. else:
  656. y_tolerance = kwargs.get("y_tolerance", DEFAULT_Y_TOLERANCE)
  657. extractor = WordExtractor(
  658. **{k: kwargs[k] for k in WORD_EXTRACTOR_KWARGS if k in kwargs}
  659. )
  660. words = extractor.extract_words(chars)
  661. if words:
  662. rotation = words[0]["rotation"] # rotation cannot change within a cell
  663. else:
  664. rotation = 0
  665. if rotation == 90:
  666. words.sort(key=lambda w: (w["x1"], -w["top"]))
  667. lines = " ".join([w["text"] for w in words])
  668. elif rotation == 270:
  669. words.sort(key=lambda w: (-w["x1"], w["top"]))
  670. lines = " ".join([w["text"] for w in words])
  671. else:
  672. lines = cluster_objects(words, itemgetter("doctop"), y_tolerance)
  673. lines = "\n".join(" ".join(word["text"] for word in line) for line in lines)
  674. if rotation == 180: # needs extra treatment
  675. lines = "".join([(c if c != "\n" else " ") for c in reversed(lines)])
  676. return lines
  677. def collate_line(
  678. line_chars: list,
  679. tolerance=DEFAULT_X_TOLERANCE,
  680. ) -> str:
  681. coll = ""
  682. last_x1 = None
  683. for char in sorted(line_chars, key=itemgetter("x0")):
  684. if (last_x1 is not None) and (char["x0"] > (last_x1 + tolerance)):
  685. coll += " "
  686. last_x1 = char["x1"]
  687. coll += char["text"]
  688. return coll
  689. def dedupe_chars(chars: list, tolerance=1) -> list:
  690. """
  691. Removes duplicate chars — those sharing the same text, fontname, size,
  692. and positioning (within `tolerance`) as other characters in the set.
  693. """
  694. key = itemgetter("fontname", "size", "upright", "text")
  695. pos_key = itemgetter("doctop", "x0")
  696. def yield_unique_chars(chars: list):
  697. sorted_chars = sorted(chars, key=key)
  698. for grp, grp_chars in itertools.groupby(sorted_chars, key=key):
  699. for y_cluster in cluster_objects(
  700. list(grp_chars), itemgetter("doctop"), tolerance
  701. ):
  702. for x_cluster in cluster_objects(
  703. y_cluster, itemgetter("x0"), tolerance
  704. ):
  705. yield sorted(x_cluster, key=pos_key)[0]
  706. deduped = yield_unique_chars(chars)
  707. return sorted(deduped, key=chars.index)
  708. def line_to_edge(line):
  709. edge = dict(line)
  710. edge["orientation"] = "h" if (line["top"] == line["bottom"]) else "v"
  711. return edge
  712. def rect_to_edges(rect) -> list:
  713. top, bottom, left, right = [dict(rect) for x in range(4)]
  714. top.update(
  715. {
  716. "object_type": "rect_edge",
  717. "height": 0,
  718. "y0": rect["y1"],
  719. "bottom": rect["top"],
  720. "orientation": "h",
  721. }
  722. )
  723. bottom.update(
  724. {
  725. "object_type": "rect_edge",
  726. "height": 0,
  727. "y1": rect["y0"],
  728. "top": rect["top"] + rect["height"],
  729. "doctop": rect["doctop"] + rect["height"],
  730. "orientation": "h",
  731. }
  732. )
  733. left.update(
  734. {
  735. "object_type": "rect_edge",
  736. "width": 0,
  737. "x1": rect["x0"],
  738. "orientation": "v",
  739. }
  740. )
  741. right.update(
  742. {
  743. "object_type": "rect_edge",
  744. "width": 0,
  745. "x0": rect["x1"],
  746. "orientation": "v",
  747. }
  748. )
  749. return [top, bottom, left, right]
  750. def curve_to_edges(curve) -> list:
  751. point_pairs = zip(curve["pts"], curve["pts"][1:])
  752. return [
  753. {
  754. "object_type": "curve_edge",
  755. "x0": min(p0[0], p1[0]),
  756. "x1": max(p0[0], p1[0]),
  757. "top": min(p0[1], p1[1]),
  758. "doctop": min(p0[1], p1[1]) + (curve["doctop"] - curve["top"]),
  759. "bottom": max(p0[1], p1[1]),
  760. "width": abs(p0[0] - p1[0]),
  761. "height": abs(p0[1] - p1[1]),
  762. "orientation": "v" if p0[0] == p1[0] else ("h" if p0[1] == p1[1] else None),
  763. }
  764. for p0, p1 in point_pairs
  765. ]
  766. def obj_to_edges(obj) -> list:
  767. t = obj["object_type"]
  768. if "_edge" in t:
  769. return [obj]
  770. elif t == "line":
  771. return [line_to_edge(obj)]
  772. else:
  773. return {"rect": rect_to_edges, "curve": curve_to_edges}[t](obj)
  774. def filter_edges(
  775. edges,
  776. orientation=None,
  777. edge_type=None,
  778. min_length=1,
  779. ) -> list:
  780. if orientation not in ("v", "h", None):
  781. raise ValueError("Orientation must be 'v' or 'h'")
  782. def test(e) -> bool:
  783. dim = "height" if e["orientation"] == "v" else "width"
  784. et_correct = e["object_type"] == edge_type if edge_type is not None else True
  785. orient_correct = orientation is None or e["orientation"] == orientation
  786. return bool(et_correct and orient_correct and (e[dim] >= min_length))
  787. return list(filter(test, edges))
  788. def cluster_list(xs, tolerance=0) -> list:
  789. if tolerance == 0:
  790. return [[x] for x in sorted(xs)]
  791. if len(xs) < 2:
  792. return [[x] for x in sorted(xs)]
  793. groups = []
  794. xs = list(sorted(xs))
  795. current_group = [xs[0]]
  796. last = xs[0]
  797. for x in xs[1:]:
  798. if x <= (last + tolerance):
  799. current_group.append(x)
  800. else:
  801. groups.append(current_group)
  802. current_group = [x]
  803. last = x
  804. groups.append(current_group)
  805. return groups
  806. def make_cluster_dict(values, tolerance) -> dict:
  807. clusters = cluster_list(list(set(values)), tolerance)
  808. nested_tuples = [
  809. [(val, i) for val in value_cluster] for i, value_cluster in enumerate(clusters)
  810. ]
  811. return dict(itertools.chain(*nested_tuples))
  812. def cluster_objects(xs, key_fn, tolerance) -> list:
  813. if not callable(key_fn):
  814. key_fn = itemgetter(key_fn)
  815. values = map(key_fn, xs)
  816. cluster_dict = make_cluster_dict(values, tolerance)
  817. get_0, get_1 = itemgetter(0), itemgetter(1)
  818. cluster_tuples = sorted(((x, cluster_dict.get(key_fn(x))) for x in xs), key=get_1)
  819. grouped = itertools.groupby(cluster_tuples, key=get_1)
  820. return [list(map(get_0, v)) for k, v in grouped]
  821. def move_object(obj, axis: str, value):
  822. assert axis in ("h", "v")
  823. if axis == "h":
  824. new_items = [
  825. ("x0", obj["x0"] + value),
  826. ("x1", obj["x1"] + value),
  827. ]
  828. if axis == "v":
  829. new_items = [
  830. ("top", obj["top"] + value),
  831. ("bottom", obj["bottom"] + value),
  832. ]
  833. if "doctop" in obj:
  834. new_items += [("doctop", obj["doctop"] + value)]
  835. if "y0" in obj:
  836. new_items += [
  837. ("y0", obj["y0"] - value),
  838. ("y1", obj["y1"] - value),
  839. ]
  840. return obj.__class__(tuple(obj.items()) + tuple(new_items))
  841. def snap_objects(objs, attr: str, tolerance) -> list:
  842. axis = {"x0": "h", "x1": "h", "top": "v", "bottom": "v"}[attr]
  843. list_objs = list(objs)
  844. clusters = cluster_objects(list_objs, itemgetter(attr), tolerance)
  845. avgs = [sum(map(itemgetter(attr), cluster)) / len(cluster) for cluster in clusters]
  846. snapped_clusters = [
  847. [move_object(obj, axis, avg - obj[attr]) for obj in cluster]
  848. for cluster, avg in zip(clusters, avgs)
  849. ]
  850. return list(itertools.chain(*snapped_clusters))
  851. def snap_edges(
  852. edges,
  853. x_tolerance=DEFAULT_SNAP_TOLERANCE,
  854. y_tolerance=DEFAULT_SNAP_TOLERANCE,
  855. ):
  856. """
  857. Given a list of edges, snap any within `tolerance` pixels of one another
  858. to their positional average.
  859. """
  860. by_orientation = {"v": [], "h": []}
  861. for e in edges:
  862. by_orientation[e["orientation"]].append(e)
  863. snapped_v = snap_objects(by_orientation["v"], "x0", x_tolerance)
  864. snapped_h = snap_objects(by_orientation["h"], "top", y_tolerance)
  865. return snapped_v + snapped_h
  866. def resize_object(obj, key: str, value):
  867. assert key in ("x0", "x1", "top", "bottom")
  868. old_value = obj[key]
  869. diff = value - old_value
  870. new_items = [
  871. (key, value),
  872. ]
  873. if key == "x0":
  874. assert value <= obj["x1"]
  875. new_items.append(("width", obj["x1"] - value))
  876. elif key == "x1":
  877. assert value >= obj["x0"]
  878. new_items.append(("width", value - obj["x0"]))
  879. elif key == "top":
  880. assert value <= obj["bottom"]
  881. new_items.append(("doctop", obj["doctop"] + diff))
  882. new_items.append(("height", obj["height"] - diff))
  883. if "y1" in obj:
  884. new_items.append(("y1", obj["y1"] - diff))
  885. elif key == "bottom":
  886. assert value >= obj["top"]
  887. new_items.append(("height", obj["height"] + diff))
  888. if "y0" in obj:
  889. new_items.append(("y0", obj["y0"] - diff))
  890. return obj.__class__(tuple(obj.items()) + tuple(new_items))
  891. def join_edge_group(edges, orientation: str, tolerance=DEFAULT_JOIN_TOLERANCE):
  892. """
  893. Given a list of edges along the same infinite line, join those that
  894. are within `tolerance` pixels of one another.
  895. """
  896. if orientation == "h":
  897. min_prop, max_prop = "x0", "x1"
  898. elif orientation == "v":
  899. min_prop, max_prop = "top", "bottom"
  900. else:
  901. raise ValueError("Orientation must be 'v' or 'h'")
  902. sorted_edges = list(sorted(edges, key=itemgetter(min_prop)))
  903. joined = [sorted_edges[0]]
  904. for e in sorted_edges[1:]:
  905. last = joined[-1]
  906. if e[min_prop] <= (last[max_prop] + tolerance):
  907. if e[max_prop] > last[max_prop]:
  908. # Extend current edge to new extremity
  909. joined[-1] = resize_object(last, max_prop, e[max_prop])
  910. else:
  911. # Edge is separate from previous edges
  912. joined.append(e)
  913. return joined
  914. def merge_edges(
  915. edges,
  916. snap_x_tolerance,
  917. snap_y_tolerance,
  918. join_x_tolerance,
  919. join_y_tolerance,
  920. ):
  921. """
  922. Using the `snap_edges` and `join_edge_group` methods above,
  923. merge a list of edges into a more "seamless" list.
  924. """
  925. def get_group(edge):
  926. if edge["orientation"] == "h":
  927. return ("h", edge["top"])
  928. else:
  929. return ("v", edge["x0"])
  930. if snap_x_tolerance > 0 or snap_y_tolerance > 0:
  931. edges = snap_edges(edges, snap_x_tolerance, snap_y_tolerance)
  932. _sorted = sorted(edges, key=get_group)
  933. edge_groups = itertools.groupby(_sorted, key=get_group)
  934. edge_gen = (
  935. join_edge_group(
  936. items, k[0], (join_x_tolerance if k[0] == "h" else join_y_tolerance)
  937. )
  938. for k, items in edge_groups
  939. )
  940. edges = list(itertools.chain(*edge_gen))
  941. return edges
  942. def bbox_to_rect(bbox) -> dict:
  943. """
  944. Return the rectangle (i.e a dict with keys "x0", "top", "x1",
  945. "bottom") for an object.
  946. """
  947. return {"x0": bbox[0], "top": bbox[1], "x1": bbox[2], "bottom": bbox[3]}
  948. def objects_to_rect(objects) -> dict:
  949. """
  950. Given an iterable of objects, return the smallest rectangle (i.e. a
  951. dict with "x0", "top", "x1", and "bottom" keys) that contains them
  952. all.
  953. """
  954. return bbox_to_rect(objects_to_bbox(objects))
  955. def merge_bboxes(bboxes):
  956. """
  957. Given an iterable of bounding boxes, return the smallest bounding box
  958. that contains them all.
  959. """
  960. x0, top, x1, bottom = zip(*bboxes)
  961. return (min(x0), min(top), max(x1), max(bottom))
  962. def objects_to_bbox(objects):
  963. """
  964. Given an iterable of objects, return the smallest bounding box that
  965. contains them all.
  966. """
  967. return merge_bboxes(map(bbox_getter, objects))
  968. def words_to_edges_h(words, word_threshold: int = DEFAULT_MIN_WORDS_HORIZONTAL):
  969. """
  970. Find (imaginary) horizontal lines that connect the tops
  971. of at least `word_threshold` words.
  972. """
  973. by_top = cluster_objects(words, itemgetter("top"), 1)
  974. large_clusters = filter(lambda x: len(x) >= word_threshold, by_top)
  975. rects = list(map(objects_to_rect, large_clusters))
  976. if len(rects) == 0:
  977. return []
  978. min_x0 = min(map(itemgetter("x0"), rects))
  979. max_x1 = max(map(itemgetter("x1"), rects))
  980. edges = []
  981. for r in rects:
  982. edges += [
  983. # Top of text
  984. {
  985. "x0": min_x0,
  986. "x1": max_x1,
  987. "top": r["top"],
  988. "bottom": r["top"],
  989. "width": max_x1 - min_x0,
  990. "orientation": "h",
  991. },
  992. # For each detected row, we also add the 'bottom' line. This will
  993. # generate extra edges, (some will be redundant with the next row
  994. # 'top' line), but this catches the last row of every table.
  995. {
  996. "x0": min_x0,
  997. "x1": max_x1,
  998. "top": r["bottom"],
  999. "bottom": r["bottom"],
  1000. "width": max_x1 - min_x0,
  1001. "orientation": "h",
  1002. },
  1003. ]
  1004. return edges
  1005. def get_bbox_overlap(a, b):
  1006. a_left, a_top, a_right, a_bottom = a
  1007. b_left, b_top, b_right, b_bottom = b
  1008. o_left = max(a_left, b_left)
  1009. o_right = min(a_right, b_right)
  1010. o_bottom = min(a_bottom, b_bottom)
  1011. o_top = max(a_top, b_top)
  1012. o_width = o_right - o_left
  1013. o_height = o_bottom - o_top
  1014. if o_height >= 0 and o_width >= 0 and o_height + o_width > 0:
  1015. return (o_left, o_top, o_right, o_bottom)
  1016. else:
  1017. return None
  1018. def words_to_edges_v(words, word_threshold: int = DEFAULT_MIN_WORDS_VERTICAL):
  1019. """
  1020. Find (imaginary) vertical lines that connect the left, right, or
  1021. center of at least `word_threshold` words.
  1022. """
  1023. # Find words that share the same left, right, or centerpoints
  1024. by_x0 = cluster_objects(words, itemgetter("x0"), 1)
  1025. by_x1 = cluster_objects(words, itemgetter("x1"), 1)
  1026. def get_center(word):
  1027. return float(word["x0"] + word["x1"]) / 2
  1028. by_center = cluster_objects(words, get_center, 1)
  1029. clusters = by_x0 + by_x1 + by_center
  1030. # Find the points that align with the most words
  1031. sorted_clusters = sorted(clusters, key=lambda x: -len(x))
  1032. large_clusters = filter(lambda x: len(x) >= word_threshold, sorted_clusters)
  1033. # For each of those points, find the bboxes fitting all matching words
  1034. bboxes = list(map(objects_to_bbox, large_clusters))
  1035. # Iterate through those bboxes, condensing overlapping bboxes
  1036. condensed_bboxes = []
  1037. for bbox in bboxes:
  1038. overlap = any(get_bbox_overlap(bbox, c) for c in condensed_bboxes)
  1039. if not overlap:
  1040. condensed_bboxes.append(bbox)
  1041. if not condensed_bboxes:
  1042. return []
  1043. condensed_rects = map(bbox_to_rect, condensed_bboxes)
  1044. sorted_rects = list(sorted(condensed_rects, key=itemgetter("x0")))
  1045. max_x1 = max(map(itemgetter("x1"), sorted_rects))
  1046. min_top = min(map(itemgetter("top"), sorted_rects))
  1047. max_bottom = max(map(itemgetter("bottom"), sorted_rects))
  1048. return [
  1049. {
  1050. "x0": b["x0"],
  1051. "x1": b["x0"],
  1052. "top": min_top,
  1053. "bottom": max_bottom,
  1054. "height": max_bottom - min_top,
  1055. "orientation": "v",
  1056. }
  1057. for b in sorted_rects
  1058. ] + [
  1059. {
  1060. "x0": max_x1,
  1061. "x1": max_x1,
  1062. "top": min_top,
  1063. "bottom": max_bottom,
  1064. "height": max_bottom - min_top,
  1065. "orientation": "v",
  1066. }
  1067. ]
  1068. def edges_to_intersections(edges, x_tolerance=1, y_tolerance=1) -> dict:
  1069. """
  1070. Given a list of edges, return the points at which they intersect
  1071. within `tolerance` pixels.
  1072. """
  1073. intersections = {}
  1074. v_edges, h_edges = [
  1075. list(filter(lambda x: x["orientation"] == o, edges)) for o in ("v", "h")
  1076. ]
  1077. for v in sorted(v_edges, key=itemgetter("x0", "top")):
  1078. for h in sorted(h_edges, key=itemgetter("top", "x0")):
  1079. if (
  1080. (v["top"] <= (h["top"] + y_tolerance))
  1081. and (v["bottom"] >= (h["top"] - y_tolerance))
  1082. and (v["x0"] >= (h["x0"] - x_tolerance))
  1083. and (v["x0"] <= (h["x1"] + x_tolerance))
  1084. ):
  1085. vertex = (v["x0"], h["top"])
  1086. if vertex not in intersections:
  1087. intersections[vertex] = {"v": [], "h": []}
  1088. intersections[vertex]["v"].append(v)
  1089. intersections[vertex]["h"].append(h)
  1090. return intersections
  1091. def obj_to_bbox(obj):
  1092. """
  1093. Return the bounding box for an object.
  1094. """
  1095. return bbox_getter(obj)
  1096. def intersections_to_cells(intersections):
  1097. """
  1098. Given a list of points (`intersections`), return all rectangular "cells"
  1099. that those points describe.
  1100. `intersections` should be a dictionary with (x0, top) tuples as keys,
  1101. and a list of edge objects as values. The edge objects should correspond
  1102. to the edges that touch the intersection.
  1103. """
  1104. def edge_connects(p1, p2) -> bool:
  1105. def edges_to_set(edges):
  1106. return set(map(obj_to_bbox, edges))
  1107. if p1[0] == p2[0]:
  1108. common = edges_to_set(intersections[p1]["v"]).intersection(
  1109. edges_to_set(intersections[p2]["v"])
  1110. )
  1111. if len(common):
  1112. return True
  1113. if p1[1] == p2[1]:
  1114. common = edges_to_set(intersections[p1]["h"]).intersection(
  1115. edges_to_set(intersections[p2]["h"])
  1116. )
  1117. if len(common):
  1118. return True
  1119. return False
  1120. points = list(sorted(intersections.keys()))
  1121. n_points = len(points)
  1122. def find_smallest_cell(points, i: int):
  1123. if i == n_points - 1:
  1124. return None
  1125. pt = points[i]
  1126. rest = points[i + 1 :]
  1127. # Get all the points directly below and directly right
  1128. below = [x for x in rest if x[0] == pt[0]]
  1129. right = [x for x in rest if x[1] == pt[1]]
  1130. for below_pt in below:
  1131. if not edge_connects(pt, below_pt):
  1132. continue
  1133. for right_pt in right:
  1134. if not edge_connects(pt, right_pt):
  1135. continue
  1136. bottom_right = (right_pt[0], below_pt[1])
  1137. if (
  1138. (bottom_right in intersections)
  1139. and edge_connects(bottom_right, right_pt)
  1140. and edge_connects(bottom_right, below_pt)
  1141. ):
  1142. return (pt[0], pt[1], bottom_right[0], bottom_right[1])
  1143. return None
  1144. cell_gen = (find_smallest_cell(points, i) for i in range(len(points)))
  1145. return list(filter(None, cell_gen))
  1146. def cells_to_tables(page, cells) -> list:
  1147. """
  1148. Given a list of bounding boxes (`cells`), return a list of tables that
  1149. hold those cells most simply (and contiguously).
  1150. """
  1151. def bbox_to_corners(bbox) -> tuple:
  1152. x0, top, x1, bottom = bbox
  1153. return ((x0, top), (x0, bottom), (x1, top), (x1, bottom))
  1154. remaining_cells = list(cells)
  1155. # Iterate through the cells found above, and assign them
  1156. # to contiguous tables
  1157. current_corners = set()
  1158. current_cells = []
  1159. tables = []
  1160. while len(remaining_cells):
  1161. initial_cell_count = len(current_cells)
  1162. for cell in list(remaining_cells):
  1163. cell_corners = bbox_to_corners(cell)
  1164. # If we're just starting a table ...
  1165. if len(current_cells) == 0:
  1166. # ... immediately assign it to the empty group
  1167. current_corners |= set(cell_corners)
  1168. current_cells.append(cell)
  1169. remaining_cells.remove(cell)
  1170. else:
  1171. # How many corners does this table share with the current group?
  1172. corner_count = sum(c in current_corners for c in cell_corners)
  1173. # If touching on at least one corner...
  1174. if corner_count > 0:
  1175. # ... assign it to the current group
  1176. current_corners |= set(cell_corners)
  1177. current_cells.append(cell)
  1178. remaining_cells.remove(cell)
  1179. # If this iteration did not find any more cells to append...
  1180. if len(current_cells) == initial_cell_count:
  1181. # ... start a new cell group
  1182. tables.append(list(current_cells))
  1183. current_corners.clear()
  1184. current_cells.clear()
  1185. # Once we have exhausting the list of cells ...
  1186. # ... and we have a cell group that has not been stored
  1187. if len(current_cells):
  1188. # ... store it.
  1189. tables.append(list(current_cells))
  1190. # PyMuPDF modification:
  1191. # Remove tables without text or having only 1 column
  1192. for i in range(len(tables) - 1, -1, -1):
  1193. r = pymupdf.EMPTY_RECT()
  1194. x1_vals = set()
  1195. x0_vals = set()
  1196. for c in tables[i]:
  1197. r |= c
  1198. x1_vals.add(c[2])
  1199. x0_vals.add(c[0])
  1200. if (
  1201. len(x1_vals) < 2
  1202. or len(x0_vals) < 2
  1203. or white_spaces.issuperset(
  1204. page.get_textbox(
  1205. r,
  1206. textpage=TEXTPAGE,
  1207. )
  1208. )
  1209. ):
  1210. del tables[i]
  1211. # Sort the tables top-to-bottom-left-to-right based on the value of the
  1212. # topmost-and-then-leftmost coordinate of a table.
  1213. _sorted = sorted(tables, key=lambda t: min((c[1], c[0]) for c in t))
  1214. return _sorted
  1215. class CellGroup:
  1216. def __init__(self, cells):
  1217. self.cells = cells
  1218. self.bbox = (
  1219. min(map(itemgetter(0), filter(None, cells))),
  1220. min(map(itemgetter(1), filter(None, cells))),
  1221. max(map(itemgetter(2), filter(None, cells))),
  1222. max(map(itemgetter(3), filter(None, cells))),
  1223. )
  1224. class TableRow(CellGroup):
  1225. pass
  1226. class TableHeader:
  1227. """PyMuPDF extension containing the identified table header."""
  1228. def __init__(self, bbox, cells, names, above):
  1229. self.bbox = bbox
  1230. self.cells = cells
  1231. self.names = names
  1232. self.external = above
  1233. class Table:
  1234. def __init__(self, page, cells):
  1235. self.page = page
  1236. self.cells = cells
  1237. self.header = self._get_header() # PyMuPDF extension
  1238. @property
  1239. def bbox(self):
  1240. c = self.cells
  1241. return (
  1242. min(map(itemgetter(0), c)),
  1243. min(map(itemgetter(1), c)),
  1244. max(map(itemgetter(2), c)),
  1245. max(map(itemgetter(3), c)),
  1246. )
  1247. @property
  1248. def rows(self) -> list:
  1249. _sorted = sorted(self.cells, key=itemgetter(1, 0))
  1250. xs = list(sorted(set(map(itemgetter(0), self.cells))))
  1251. rows = []
  1252. for y, row_cells in itertools.groupby(_sorted, itemgetter(1)):
  1253. xdict = {cell[0]: cell for cell in row_cells}
  1254. row = TableRow([xdict.get(x) for x in xs])
  1255. rows.append(row)
  1256. return rows
  1257. @property
  1258. def row_count(self) -> int: # PyMuPDF extension
  1259. return len(self.rows)
  1260. @property
  1261. def col_count(self) -> int: # PyMuPDF extension
  1262. return max([len(r.cells) for r in self.rows])
  1263. def extract(self, **kwargs) -> list:
  1264. chars = CHARS
  1265. table_arr = []
  1266. def char_in_bbox(char, bbox) -> bool:
  1267. v_mid = (char["top"] + char["bottom"]) / 2
  1268. h_mid = (char["x0"] + char["x1"]) / 2
  1269. x0, top, x1, bottom = bbox
  1270. return bool(
  1271. (h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom)
  1272. )
  1273. for row in self.rows:
  1274. arr = []
  1275. row_chars = [char for char in chars if char_in_bbox(char, row.bbox)]
  1276. for cell in row.cells:
  1277. if cell is None:
  1278. cell_text = None
  1279. else:
  1280. cell_chars = [
  1281. char for char in row_chars if char_in_bbox(char, cell)
  1282. ]
  1283. if len(cell_chars):
  1284. kwargs["x_shift"] = cell[0]
  1285. kwargs["y_shift"] = cell[1]
  1286. if "layout" in kwargs:
  1287. kwargs["layout_width"] = cell[2] - cell[0]
  1288. kwargs["layout_height"] = cell[3] - cell[1]
  1289. cell_text = extract_text(cell_chars, **kwargs)
  1290. else:
  1291. cell_text = ""
  1292. arr.append(cell_text)
  1293. table_arr.append(arr)
  1294. return table_arr
  1295. def to_markdown(self, clean=False, fill_empty=True):
  1296. """Output table content as a string in Github-markdown format.
  1297. If "clean" then markdown syntax is removed from cell content.
  1298. If "fill_empty" then cell content None is replaced by the values
  1299. above (columns) or left (rows) in an effort to approximate row and
  1300. columns spans.
  1301. """
  1302. output = "|"
  1303. rows = self.row_count
  1304. cols = self.col_count
  1305. # cell coordinates
  1306. cell_boxes = [[c for c in r.cells] for r in self.rows]
  1307. # cell text strings
  1308. cells = [[None for i in range(cols)] for j in range(rows)]
  1309. for i, row in enumerate(cell_boxes):
  1310. for j, cell in enumerate(row):
  1311. if cell is not None:
  1312. cells[i][j] = extract_cells(
  1313. TEXTPAGE, cell_boxes[i][j], markdown=True
  1314. )
  1315. if fill_empty: # fill "None" cells where possible
  1316. # for rows, copy content from left to right
  1317. for j in range(rows):
  1318. for i in range(cols - 1):
  1319. if cells[j][i + 1] is None:
  1320. cells[j][i + 1] = cells[j][i]
  1321. # for columns, copy top to bottom
  1322. for i in range(cols):
  1323. for j in range(rows - 1):
  1324. if cells[j + 1][i] is None:
  1325. cells[j + 1][i] = cells[j][i]
  1326. # generate header string and MD separator
  1327. for i, name in enumerate(self.header.names):
  1328. if not name: # generate a name if empty
  1329. name = f"Col{i+1}"
  1330. name = name.replace("\n", "<br>") # use HTML line breaks
  1331. if clean: # remove sensitive syntax
  1332. name = html.escape(name.replace("-", "&#45;"))
  1333. output += name + "|"
  1334. output += "\n"
  1335. # insert GitHub header line separator
  1336. output += "|" + "|".join("---" for i in range(self.col_count)) + "|\n"
  1337. # skip first row in details if header is part of the table
  1338. j = 0 if self.header.external else 1
  1339. # iterate over detail rows
  1340. for row in cells[j:]:
  1341. line = "|"
  1342. for i, cell in enumerate(row):
  1343. # replace None cells with empty string
  1344. # use HTML line break tag
  1345. if cell is None:
  1346. cell = ""
  1347. if clean: # remove sensitive syntax
  1348. cell = html.escape(cell.replace("-", "&#45;"))
  1349. line += cell + "|"
  1350. line += "\n"
  1351. output += line
  1352. return output + "\n"
  1353. def to_pandas(self, **kwargs):
  1354. """Return a pandas DataFrame version of the table."""
  1355. try:
  1356. import pandas as pd
  1357. except ModuleNotFoundError:
  1358. pymupdf.message("Package 'pandas' is not installed")
  1359. raise
  1360. pd_dict = {}
  1361. extract = self.extract()
  1362. hdr = self.header
  1363. names = self.header.names
  1364. hdr_len = len(names)
  1365. # ensure uniqueness of column names
  1366. for i in range(hdr_len):
  1367. name = names[i]
  1368. if not name:
  1369. names[i] = f"Col{i}"
  1370. if hdr_len != len(set(names)):
  1371. for i in range(hdr_len):
  1372. name = names[i]
  1373. if name != f"Col{i}":
  1374. names[i] = f"{i}-{name}"
  1375. if not hdr.external: # header is part of 'extract'
  1376. extract = extract[1:]
  1377. for i in range(hdr_len):
  1378. key = names[i]
  1379. value = []
  1380. for j in range(len(extract)):
  1381. value.append(extract[j][i])
  1382. pd_dict[key] = value
  1383. return pd.DataFrame(pd_dict)
  1384. def _get_header(self, y_tolerance=3):
  1385. """Identify the table header.
  1386. *** PyMuPDF extension. ***
  1387. Starting from the first line above the table upwards, check if it
  1388. qualifies to be part of the table header.
  1389. Criteria include:
  1390. * A one-line table never has an extra header.
  1391. * Column borders must not intersect any word. If this happens, all
  1392. text of this line and above of it is ignored.
  1393. * No excess inter-line distance: If a line further up has a distance
  1394. of more than 1.5 times of its font size, it will be ignored and
  1395. all lines above of it.
  1396. * Must have same text properties.
  1397. * Starting with the top table line, a bold text property cannot change
  1398. back to non-bold.
  1399. If not all criteria are met (or there is no text above the table),
  1400. the first table row is assumed to be the header.
  1401. """
  1402. page = self.page
  1403. y_delta = y_tolerance
  1404. def top_row_bg_color(self):
  1405. """
  1406. Compare top row background color with color of same-sized bbox
  1407. above. If different, return True indicating that the original
  1408. table top row is already the header.
  1409. """
  1410. bbox0 = pymupdf.Rect(self.rows[0].bbox)
  1411. bboxt = bbox0 + (0, -bbox0.height, 0, -bbox0.height) # area above
  1412. top_color0 = page.get_pixmap(clip=bbox0).color_topusage()[1]
  1413. top_colort = page.get_pixmap(clip=bboxt).color_topusage()[1]
  1414. if top_color0 != top_colort:
  1415. return True # top row is header
  1416. return False
  1417. def row_has_bold(bbox):
  1418. """Check if a row contains some bold text.
  1419. If e.g. true for the top row, then it will be used as (internal)
  1420. column header row if any of the following is true:
  1421. * the previous (above) text line has no bold span
  1422. * the second table row text has no bold span
  1423. Returns True if any spans are bold else False.
  1424. """
  1425. blocks = page.get_text("dict", flags=pymupdf.TEXTFLAGS_TEXT, clip=bbox)[
  1426. "blocks"
  1427. ]
  1428. spans = [s for b in blocks for l in b["lines"] for s in l["spans"]]
  1429. return any(s["flags"] & pymupdf.TEXT_FONT_BOLD for s in spans)
  1430. try:
  1431. row = self.rows[0]
  1432. cells = row.cells
  1433. bbox = pymupdf.Rect(row.bbox)
  1434. except IndexError: # this table has no rows
  1435. return None
  1436. # return this if we determine that the top row is the header
  1437. header_top_row = TableHeader(bbox, cells, self.extract()[0], False)
  1438. # 1-line tables have no extra header
  1439. if len(self.rows) < 2:
  1440. return header_top_row
  1441. # 1-column tables have no extra header
  1442. if len(cells) < 2:
  1443. return header_top_row
  1444. # assume top row is the header if second row is empty
  1445. row2 = self.rows[1] # second row
  1446. if all(c is None for c in row2.cells): # no valid cell bboxes in row2
  1447. return header_top_row
  1448. # Special check: is top row bold?
  1449. top_row_bold = row_has_bold(bbox)
  1450. # assume top row is header if it is bold and any cell
  1451. # of 2nd row is non-bold
  1452. if top_row_bold and not row_has_bold(row2.bbox):
  1453. return header_top_row
  1454. if top_row_bg_color(self):
  1455. # if area above top row has a different background color,
  1456. # then top row is already the header
  1457. return header_top_row
  1458. # column coordinates (x1 values) in top row
  1459. col_x = [c[2] if c is not None else None for c in cells[:-1]]
  1460. # clip = page area above the table
  1461. # We will inspect this area for text qualifying as column header.
  1462. clip = +bbox # take row 0 bbox
  1463. clip.y0 = 0 # start at top of page
  1464. clip.y1 = bbox.y0 # end at top of table
  1465. blocks = page.get_text("dict", clip=clip, flags=pymupdf.TEXTFLAGS_TEXT)[
  1466. "blocks"
  1467. ]
  1468. # non-empty, non-superscript spans above table, sorted descending by y1
  1469. spans = sorted(
  1470. [
  1471. s
  1472. for b in blocks
  1473. for l in b["lines"]
  1474. for s in l["spans"]
  1475. if not (
  1476. white_spaces.issuperset(s["text"])
  1477. or s["flags"] & pymupdf.TEXT_FONT_SUPERSCRIPT
  1478. )
  1479. ],
  1480. key=lambda s: s["bbox"][3],
  1481. reverse=True,
  1482. )
  1483. select = [] # y1 coordinates above, sorted descending
  1484. line_heights = [] # line heights above, sorted descending
  1485. line_bolds = [] # bold indicator per line above, same sorting
  1486. # walk through the spans and fill above 3 lists
  1487. for i in range(len(spans)):
  1488. s = spans[i]
  1489. y1 = s["bbox"][3] # span bottom
  1490. h = y1 - s["bbox"][1] # span bbox height
  1491. bold = s["flags"] & pymupdf.TEXT_FONT_BOLD
  1492. # use first item to start the lists
  1493. if i == 0:
  1494. select.append(y1)
  1495. line_heights.append(h)
  1496. line_bolds.append(bold)
  1497. continue
  1498. # get previous items from the 3 lists
  1499. y0 = select[-1]
  1500. h0 = line_heights[-1]
  1501. bold0 = line_bolds[-1]
  1502. if bold0 and not bold:
  1503. break # stop if switching from bold to non-bold
  1504. # if fitting in height of previous span, modify bbox
  1505. if y0 - y1 <= y_delta or abs((y0 - h0) - s["bbox"][1]) <= y_delta:
  1506. s["bbox"] = (s["bbox"][0], y0 - h0, s["bbox"][2], y0)
  1507. spans[i] = s
  1508. if bold:
  1509. line_bolds[-1] = bold
  1510. continue
  1511. elif y0 - y1 > 1.5 * h0:
  1512. break # stop if distance to previous line too large
  1513. select.append(y1)
  1514. line_heights.append(h)
  1515. line_bolds.append(bold)
  1516. if select == []: # nothing above the table?
  1517. return header_top_row
  1518. select = select[:5] # accept up to 5 lines for an external header
  1519. # assume top row as header if text above is too far away
  1520. if bbox.y0 - select[0] >= line_heights[0]:
  1521. return header_top_row
  1522. # accept top row as header if bold, but line above is not
  1523. if top_row_bold and not line_bolds[0]:
  1524. return header_top_row
  1525. if spans == []: # nothing left above the table, return top row
  1526. return header_top_row
  1527. # re-compute clip above table
  1528. nclip = pymupdf.EMPTY_RECT()
  1529. for s in [s for s in spans if s["bbox"][3] >= select[-1]]:
  1530. nclip |= s["bbox"]
  1531. if not nclip.is_empty:
  1532. clip = nclip
  1533. clip.y1 = bbox.y0 # make sure we still include every word above
  1534. # Confirm that no word in clip is intersecting a column separator
  1535. word_rects = [pymupdf.Rect(w[:4]) for w in page.get_text("words", clip=clip)]
  1536. word_tops = sorted(list(set([r[1] for r in word_rects])), reverse=True)
  1537. select = []
  1538. # exclude lines with words that intersect a column border
  1539. for top in word_tops:
  1540. intersecting = [
  1541. (x, r)
  1542. for x in col_x
  1543. if x is not None
  1544. for r in word_rects
  1545. if r[1] == top and r[0] < x and r[2] > x
  1546. ]
  1547. if intersecting == []:
  1548. select.append(top)
  1549. else: # detected a word crossing a column border
  1550. break
  1551. if select == []: # nothing left over: return first row
  1552. return header_top_row
  1553. hdr_bbox = +clip # compute the header cells
  1554. hdr_bbox.y0 = select[-1] # hdr_bbox top is smallest top coord of words
  1555. hdr_cells = [
  1556. (c[0], hdr_bbox.y0, c[2], hdr_bbox.y1) if c is not None else None
  1557. for c in cells
  1558. ]
  1559. # adjust left/right of header bbox
  1560. hdr_bbox.x0 = self.bbox[0]
  1561. hdr_bbox.x1 = self.bbox[2]
  1562. # column names: no line breaks, no excess spaces
  1563. hdr_names = [
  1564. (
  1565. page.get_textbox(c).replace("\n", " ").replace(" ", " ").strip()
  1566. if c is not None
  1567. else ""
  1568. )
  1569. for c in hdr_cells
  1570. ]
  1571. return TableHeader(tuple(hdr_bbox), hdr_cells, hdr_names, True)
  1572. @dataclass
  1573. class TableSettings:
  1574. vertical_strategy: str = "lines"
  1575. horizontal_strategy: str = "lines"
  1576. explicit_vertical_lines: list = None
  1577. explicit_horizontal_lines: list = None
  1578. snap_tolerance: float = DEFAULT_SNAP_TOLERANCE
  1579. snap_x_tolerance: float = UNSET
  1580. snap_y_tolerance: float = UNSET
  1581. join_tolerance: float = DEFAULT_JOIN_TOLERANCE
  1582. join_x_tolerance: float = UNSET
  1583. join_y_tolerance: float = UNSET
  1584. edge_min_length: float = 3
  1585. min_words_vertical: float = DEFAULT_MIN_WORDS_VERTICAL
  1586. min_words_horizontal: float = DEFAULT_MIN_WORDS_HORIZONTAL
  1587. intersection_tolerance: float = 3
  1588. intersection_x_tolerance: float = UNSET
  1589. intersection_y_tolerance: float = UNSET
  1590. text_settings: dict = None
  1591. def __post_init__(self) -> "TableSettings":
  1592. """Clean up user-provided table settings.
  1593. Validates that the table settings provided consists of acceptable values and
  1594. returns a cleaned up version. The cleaned up version fills out the missing
  1595. values with the default values in the provided settings.
  1596. TODO: Can be further used to validate that the values are of the correct
  1597. type. For example, raising a value error when a non-boolean input is
  1598. provided for the key ``keep_blank_chars``.
  1599. :param table_settings: User-provided table settings.
  1600. :returns: A cleaned up version of the user-provided table settings.
  1601. :raises ValueError: When an unrecognised key is provided.
  1602. """
  1603. for setting in NON_NEGATIVE_SETTINGS:
  1604. if (getattr(self, setting) or 0) < 0:
  1605. raise ValueError(f"Table setting '{setting}' cannot be negative")
  1606. for orientation in ["horizontal", "vertical"]:
  1607. strategy = getattr(self, orientation + "_strategy")
  1608. if strategy not in TABLE_STRATEGIES:
  1609. raise ValueError(
  1610. f"{orientation}_strategy must be one of"
  1611. f'{{{",".join(TABLE_STRATEGIES)}}}'
  1612. )
  1613. if self.text_settings is None:
  1614. self.text_settings = {}
  1615. # This next section is for backwards compatibility
  1616. for attr in ["x_tolerance", "y_tolerance"]:
  1617. if attr not in self.text_settings:
  1618. self.text_settings[attr] = self.text_settings.get("tolerance", 3)
  1619. if "tolerance" in self.text_settings:
  1620. del self.text_settings["tolerance"]
  1621. # End of that section
  1622. for attr, fallback in [
  1623. ("snap_x_tolerance", "snap_tolerance"),
  1624. ("snap_y_tolerance", "snap_tolerance"),
  1625. ("join_x_tolerance", "join_tolerance"),
  1626. ("join_y_tolerance", "join_tolerance"),
  1627. ("intersection_x_tolerance", "intersection_tolerance"),
  1628. ("intersection_y_tolerance", "intersection_tolerance"),
  1629. ]:
  1630. if getattr(self, attr) is UNSET:
  1631. setattr(self, attr, getattr(self, fallback))
  1632. return self
  1633. @classmethod
  1634. def resolve(cls, settings=None):
  1635. if settings is None:
  1636. return cls()
  1637. elif isinstance(settings, cls):
  1638. return settings
  1639. elif isinstance(settings, dict):
  1640. core_settings = {}
  1641. text_settings = {}
  1642. for k, v in settings.items():
  1643. if k[:5] == "text_":
  1644. text_settings[k[5:]] = v
  1645. else:
  1646. core_settings[k] = v
  1647. core_settings["text_settings"] = text_settings
  1648. return cls(**core_settings)
  1649. else:
  1650. raise ValueError(f"Cannot resolve settings: {settings}")
  1651. class TableFinder:
  1652. """
  1653. Given a PDF page, find plausible table structures.
  1654. Largely borrowed from Anssi Nurminen's master's thesis:
  1655. http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
  1656. ... and inspired by Tabula:
  1657. https://github.com/tabulapdf/tabula-extractor/issues/16
  1658. """
  1659. def __init__(self, page, settings=None):
  1660. self.page = weakref.proxy(page)
  1661. self.settings = TableSettings.resolve(settings)
  1662. self.edges = self.get_edges()
  1663. self.intersections = edges_to_intersections(
  1664. self.edges,
  1665. self.settings.intersection_x_tolerance,
  1666. self.settings.intersection_y_tolerance,
  1667. )
  1668. self.cells = intersections_to_cells(self.intersections)
  1669. self.tables = [
  1670. Table(self.page, cell_group)
  1671. for cell_group in cells_to_tables(self.page, self.cells)
  1672. ]
  1673. def get_edges(self) -> list:
  1674. settings = self.settings
  1675. for orientation in ["vertical", "horizontal"]:
  1676. strategy = getattr(settings, orientation + "_strategy")
  1677. if strategy == "explicit":
  1678. lines = getattr(settings, "explicit_" + orientation + "_lines")
  1679. if len(lines) < 2:
  1680. raise ValueError(
  1681. f"If {orientation}_strategy == 'explicit', "
  1682. f"explicit_{orientation}_lines "
  1683. f"must be specified as a list/tuple of two or more "
  1684. f"floats/ints."
  1685. )
  1686. v_strat = settings.vertical_strategy
  1687. h_strat = settings.horizontal_strategy
  1688. if v_strat == "text" or h_strat == "text":
  1689. words = extract_words(CHARS, **(settings.text_settings or {}))
  1690. else:
  1691. words = []
  1692. v_explicit = []
  1693. for desc in settings.explicit_vertical_lines or []:
  1694. if isinstance(desc, dict):
  1695. for e in obj_to_edges(desc):
  1696. if e["orientation"] == "v":
  1697. v_explicit.append(e)
  1698. else:
  1699. v_explicit.append(
  1700. {
  1701. "x0": desc,
  1702. "x1": desc,
  1703. "top": self.page.rect[1],
  1704. "bottom": self.page.rect[3],
  1705. "height": self.page.rect[3] - self.page.rect[1],
  1706. "orientation": "v",
  1707. }
  1708. )
  1709. if v_strat == "lines":
  1710. v_base = filter_edges(EDGES, "v")
  1711. elif v_strat == "lines_strict":
  1712. v_base = filter_edges(EDGES, "v", edge_type="line")
  1713. elif v_strat == "text":
  1714. v_base = words_to_edges_v(words, word_threshold=settings.min_words_vertical)
  1715. elif v_strat == "explicit":
  1716. v_base = []
  1717. else:
  1718. v_base = []
  1719. v = v_base + v_explicit
  1720. h_explicit = []
  1721. for desc in settings.explicit_horizontal_lines or []:
  1722. if isinstance(desc, dict):
  1723. for e in obj_to_edges(desc):
  1724. if e["orientation"] == "h":
  1725. h_explicit.append(e)
  1726. else:
  1727. h_explicit.append(
  1728. {
  1729. "x0": self.page.rect[0],
  1730. "x1": self.page.rect[2],
  1731. "width": self.page.rect[2] - self.page.rect[0],
  1732. "top": desc,
  1733. "bottom": desc,
  1734. "orientation": "h",
  1735. }
  1736. )
  1737. if h_strat == "lines":
  1738. h_base = filter_edges(EDGES, "h")
  1739. elif h_strat == "lines_strict":
  1740. h_base = filter_edges(EDGES, "h", edge_type="line")
  1741. elif h_strat == "text":
  1742. h_base = words_to_edges_h(
  1743. words, word_threshold=settings.min_words_horizontal
  1744. )
  1745. elif h_strat == "explicit":
  1746. h_base = []
  1747. else:
  1748. h_base = []
  1749. h = h_base + h_explicit
  1750. edges = list(v) + list(h)
  1751. edges = merge_edges(
  1752. edges,
  1753. snap_x_tolerance=settings.snap_x_tolerance,
  1754. snap_y_tolerance=settings.snap_y_tolerance,
  1755. join_x_tolerance=settings.join_x_tolerance,
  1756. join_y_tolerance=settings.join_y_tolerance,
  1757. )
  1758. return filter_edges(edges, min_length=settings.edge_min_length)
  1759. def __getitem__(self, i):
  1760. tcount = len(self.tables)
  1761. if i >= tcount:
  1762. raise IndexError("table not on page")
  1763. while i < 0:
  1764. i += tcount
  1765. return self.tables[i]
"""
Start of PyMuPDF interface code.

The following functions are executed when "page.find_tables()" is called.

* make_chars: Fills the CHARS list with text character information extracted
              via "rawdict" text extraction. Items in CHARS are formatted
              as expected by the table code.
* make_edges: Fills the EDGES list with vector graphic information extracted
              via "get_drawings". Items in EDGES are formatted as expected
              by the table code.

The lists CHARS and EDGES replace the document access that is otherwise
performed by pdfplumber or pdfminer, respectively.
The table code has been modified to use these lists instead of accessing
the page information itself.
"""
  1780. # -----------------------------------------------------------------------------
  1781. # Extract all page characters to fill the CHARS list
  1782. # -----------------------------------------------------------------------------
  1783. def make_chars(page, clip=None):
  1784. """Extract text as "rawdict" to fill CHARS."""
  1785. global TEXTPAGE
  1786. page_number = page.number + 1
  1787. page_height = page.rect.height
  1788. ctm = page.transformation_matrix
  1789. TEXTPAGE = page.get_textpage(clip=clip, flags=FLAGS)
  1790. blocks = page.get_text("rawdict", textpage=TEXTPAGE)["blocks"]
  1791. doctop_base = page_height * page.number
  1792. for block in blocks:
  1793. for line in block["lines"]:
  1794. ldir = line["dir"] # = (cosine, sine) of angle
  1795. ldir = (round(ldir[0], 4), round(ldir[1], 4))
  1796. matrix = pymupdf.Matrix(ldir[0], -ldir[1], ldir[1], ldir[0], 0, 0)
  1797. if ldir[1] == 0:
  1798. upright = True
  1799. else:
  1800. upright = False
  1801. for span in sorted(line["spans"], key=lambda s: s["bbox"][0]):
  1802. fontname = span["font"]
  1803. fontsize = span["size"]
  1804. color = pymupdf.sRGB_to_pdf(span["color"])
  1805. for char in sorted(span["chars"], key=lambda c: c["bbox"][0]):
  1806. bbox = pymupdf.Rect(char["bbox"])
  1807. bbox_ctm = bbox * ctm
  1808. origin = pymupdf.Point(char["origin"]) * ctm
  1809. matrix.e = origin.x
  1810. matrix.f = origin.y
  1811. text = char["c"]
  1812. char_dict = {
  1813. "adv": bbox.x1 - bbox.x0 if upright else bbox.y1 - bbox.y0,
  1814. "bottom": bbox.y1,
  1815. "doctop": bbox.y0 + doctop_base,
  1816. "fontname": fontname,
  1817. "height": bbox.y1 - bbox.y0,
  1818. "matrix": tuple(matrix),
  1819. "ncs": "DeviceRGB",
  1820. "non_stroking_color": color,
  1821. "non_stroking_pattern": None,
  1822. "object_type": "char",
  1823. "page_number": page_number,
  1824. "size": fontsize if upright else bbox.y1 - bbox.y0,
  1825. "stroking_color": color,
  1826. "stroking_pattern": None,
  1827. "text": text,
  1828. "top": bbox.y0,
  1829. "upright": upright,
  1830. "width": bbox.x1 - bbox.x0,
  1831. "x0": bbox.x0,
  1832. "x1": bbox.x1,
  1833. "y0": bbox_ctm.y0,
  1834. "y1": bbox_ctm.y1,
  1835. }
  1836. CHARS.append(char_dict)
  1837. # ------------------------------------------------------------------------
  1838. # Extract all page vector graphics to fill the EDGES list.
  1839. # We are ignoring Bézier curves completely and are converting everything
  1840. # else to lines.
  1841. # ------------------------------------------------------------------------
  1842. def make_edges(page, clip=None, tset=None, paths=None, add_lines=None, add_boxes=None):
  1843. snap_x = tset.snap_x_tolerance
  1844. snap_y = tset.snap_y_tolerance
  1845. min_length = tset.edge_min_length
  1846. lines_strict = (
  1847. tset.vertical_strategy == "lines_strict"
  1848. or tset.horizontal_strategy == "lines_strict"
  1849. )
  1850. page_height = page.rect.height
  1851. doctop_basis = page.number * page_height
  1852. page_number = page.number + 1
  1853. prect = page.rect
  1854. if page.rotation in (90, 270):
  1855. w, h = prect.br
  1856. prect = pymupdf.Rect(0, 0, h, w)
  1857. if clip is not None:
  1858. clip = pymupdf.Rect(clip)
  1859. else:
  1860. clip = prect
  1861. def are_neighbors(r1, r2):
  1862. """Detect whether r1, r2 are neighbors.
  1863. Defined as:
  1864. The minimum distance between points of r1 and points of r2 is not
  1865. larger than some delta.
  1866. This check supports empty rect-likes and thus also lines.
  1867. Note:
  1868. This type of check is MUCH faster than native Rect containment checks.
  1869. """
  1870. if ( # check if x-coordinates of r1 are within those of r2
  1871. r2.x0 - snap_x <= r1.x0 <= r2.x1 + snap_x
  1872. or r2.x0 - snap_x <= r1.x1 <= r2.x1 + snap_x
  1873. ) and ( # ... same for y-coordinates
  1874. r2.y0 - snap_y <= r1.y0 <= r2.y1 + snap_y
  1875. or r2.y0 - snap_y <= r1.y1 <= r2.y1 + snap_y
  1876. ):
  1877. return True
  1878. # same check with r1 / r2 exchanging their roles (this is necessary!)
  1879. if (
  1880. r1.x0 - snap_x <= r2.x0 <= r1.x1 + snap_x
  1881. or r1.x0 - snap_x <= r2.x1 <= r1.x1 + snap_x
  1882. ) and (
  1883. r1.y0 - snap_y <= r2.y0 <= r1.y1 + snap_y
  1884. or r1.y0 - snap_y <= r2.y1 <= r1.y1 + snap_y
  1885. ):
  1886. return True
  1887. return False
  1888. def clean_graphics(npaths=None):
  1889. """Detect and join rectangles of "connected" vector graphics."""
  1890. if npaths is None:
  1891. allpaths = page.get_drawings()
  1892. else: # accept passed-in vector graphics
  1893. allpaths = npaths[:] # paths relevant for table detection
  1894. paths = []
  1895. for p in allpaths:
  1896. # If only looking at lines, we ignore fill-only paths,
  1897. # except simulated lines (i.e. small width or height).
  1898. if (
  1899. lines_strict
  1900. and p["type"] == "f"
  1901. and p["rect"].width > snap_x
  1902. and p["rect"].height > snap_y
  1903. ):
  1904. continue
  1905. paths.append(p)
  1906. # start with all vector graphics rectangles
  1907. prects = sorted(set([p["rect"] for p in paths]), key=lambda r: (r.y1, r.x0))
  1908. new_rects = [] # the final list of joined rectangles
  1909. # ----------------------------------------------------------------
  1910. # Strategy: Join rectangles that "almost touch" each other.
  1911. # Extend first rectangle with any other that is a "neighbor".
  1912. # Then move it to the final list and continue with the rest.
  1913. # ----------------------------------------------------------------
  1914. while prects: # the algorithm will empty this list
  1915. prect0 = prects[0] # copy of first rectangle (performance reasons!)
  1916. repeat = True
  1917. while repeat: # this loop extends first rect in list
  1918. repeat = False # set to true again if some other rect touches
  1919. for i in range(len(prects) - 1, 0, -1): # run backwards
  1920. if are_neighbors(prect0, prects[i]): # close enough to rect 0?
  1921. prect0 |= prects[i].tl # extend rect 0
  1922. prect0 |= prects[i].br # extend rect 0
  1923. del prects[i] # delete this rect
  1924. repeat = True # keep checking the rest
  1925. # move rect 0 over to result list if there is some text in it
  1926. if not white_spaces.issuperset(page.get_textbox(prect0, textpage=TEXTPAGE)):
  1927. # contains text, so accept it as a table bbox candidate
  1928. new_rects.append(prect0)
  1929. del prects[0] # remove from rect list
  1930. return new_rects, paths
  1931. bboxes, paths = clean_graphics(npaths=paths)
  1932. def is_parallel(p1, p2):
  1933. """Check if line is roughly axis-parallel."""
  1934. if abs(p1.x - p2.x) <= snap_x or abs(p1.y - p2.y) <= snap_y:
  1935. return True
  1936. return False
  1937. def make_line(p, p1, p2, clip):
  1938. """Given 2 points, make a line dictionary for table detection."""
  1939. if not is_parallel(p1, p2): # only accepting axis-parallel lines
  1940. return {}
  1941. # compute the extremal values
  1942. x0 = min(p1.x, p2.x)
  1943. x1 = max(p1.x, p2.x)
  1944. y0 = min(p1.y, p2.y)
  1945. y1 = max(p1.y, p2.y)
  1946. # check for outside clip
  1947. if x0 > clip.x1 or x1 < clip.x0 or y0 > clip.y1 or y1 < clip.y0:
  1948. return {}
  1949. if x0 < clip.x0:
  1950. x0 = clip.x0 # adjust to clip boundary
  1951. if x1 > clip.x1:
  1952. x1 = clip.x1 # adjust to clip boundary
  1953. if y0 < clip.y0:
  1954. y0 = clip.y0 # adjust to clip boundary
  1955. if y1 > clip.y1:
  1956. y1 = clip.y1 # adjust to clip boundary
  1957. width = x1 - x0 # from adjusted values
  1958. height = y1 - y0 # from adjusted values
  1959. if width == height == 0:
  1960. return {} # nothing left to deal with
  1961. line_dict = {
  1962. "x0": x0,
  1963. "y0": page_height - y0,
  1964. "x1": x1,
  1965. "y1": page_height - y1,
  1966. "width": width,
  1967. "height": height,
  1968. "pts": [(x0, y0), (x1, y1)],
  1969. "linewidth": p["width"],
  1970. "stroke": True,
  1971. "fill": False,
  1972. "evenodd": False,
  1973. "stroking_color": p["color"] if p["color"] else p["fill"],
  1974. "non_stroking_color": None,
  1975. "object_type": "line",
  1976. "page_number": page_number,
  1977. "stroking_pattern": None,
  1978. "non_stroking_pattern": None,
  1979. "top": y0,
  1980. "bottom": y1,
  1981. "doctop": y0 + doctop_basis,
  1982. }
  1983. return line_dict
  1984. for p in paths:
  1985. items = p["items"] # items in this path
  1986. # if 'closePath', add a line from last to first point
  1987. if p["closePath"] and items[0][0] == "l" and items[-1][0] == "l":
  1988. items.append(("l", items[-1][2], items[0][1]))
  1989. for i in items:
  1990. if i[0] not in ("l", "re", "qu"):
  1991. continue # ignore anything else
  1992. if i[0] == "l": # a line
  1993. p1, p2 = i[1:]
  1994. line_dict = make_line(p, p1, p2, clip)
  1995. if line_dict:
  1996. EDGES.append(line_to_edge(line_dict))
  1997. elif i[0] == "re":
  1998. # A rectangle: decompose into 4 lines, but filter out
  1999. # the ones that simulate a line
  2000. rect = i[1].normalize() # normalize the rectangle
  2001. if (
  2002. rect.width <= min_length and rect.width < rect.height
  2003. ): # simulates a vertical line
  2004. x = abs(rect.x1 + rect.x0) / 2 # take middle value for x
  2005. p1 = pymupdf.Point(x, rect.y0)
  2006. p2 = pymupdf.Point(x, rect.y1)
  2007. line_dict = make_line(p, p1, p2, clip)
  2008. if line_dict:
  2009. EDGES.append(line_to_edge(line_dict))
  2010. continue
  2011. if (
  2012. rect.height <= min_length and rect.height < rect.width
  2013. ): # simulates a horizontal line
  2014. y = abs(rect.y1 + rect.y0) / 2 # take middle value for y
  2015. p1 = pymupdf.Point(rect.x0, y)
  2016. p2 = pymupdf.Point(rect.x1, y)
  2017. line_dict = make_line(p, p1, p2, clip)
  2018. if line_dict:
  2019. EDGES.append(line_to_edge(line_dict))
  2020. continue
  2021. line_dict = make_line(p, rect.tl, rect.bl, clip)
  2022. if line_dict:
  2023. EDGES.append(line_to_edge(line_dict))
  2024. line_dict = make_line(p, rect.bl, rect.br, clip)
  2025. if line_dict:
  2026. EDGES.append(line_to_edge(line_dict))
  2027. line_dict = make_line(p, rect.br, rect.tr, clip)
  2028. if line_dict:
  2029. EDGES.append(line_to_edge(line_dict))
  2030. line_dict = make_line(p, rect.tr, rect.tl, clip)
  2031. if line_dict:
  2032. EDGES.append(line_to_edge(line_dict))
  2033. else: # must be a quad
  2034. # we convert it into (up to) 4 lines
  2035. ul, ur, ll, lr = i[1]
  2036. line_dict = make_line(p, ul, ll, clip)
  2037. if line_dict:
  2038. EDGES.append(line_to_edge(line_dict))
  2039. line_dict = make_line(p, ll, lr, clip)
  2040. if line_dict:
  2041. EDGES.append(line_to_edge(line_dict))
  2042. line_dict = make_line(p, lr, ur, clip)
  2043. if line_dict:
  2044. EDGES.append(line_to_edge(line_dict))
  2045. line_dict = make_line(p, ur, ul, clip)
  2046. if line_dict:
  2047. EDGES.append(line_to_edge(line_dict))
  2048. path = {"color": (0, 0, 0), "fill": None, "width": 1}
  2049. for bbox in bboxes: # add the border lines for all enveloping bboxes
  2050. line_dict = make_line(path, bbox.tl, bbox.tr, clip)
  2051. if line_dict:
  2052. EDGES.append(line_to_edge(line_dict))
  2053. line_dict = make_line(path, bbox.bl, bbox.br, clip)
  2054. if line_dict:
  2055. EDGES.append(line_to_edge(line_dict))
  2056. line_dict = make_line(path, bbox.tl, bbox.bl, clip)
  2057. if line_dict:
  2058. EDGES.append(line_to_edge(line_dict))
  2059. line_dict = make_line(path, bbox.tr, bbox.br, clip)
  2060. if line_dict:
  2061. EDGES.append(line_to_edge(line_dict))
  2062. if add_lines is not None: # add user-specified lines
  2063. assert isinstance(add_lines, (tuple, list))
  2064. else:
  2065. add_lines = []
  2066. for p1, p2 in add_lines:
  2067. p1 = pymupdf.Point(p1)
  2068. p2 = pymupdf.Point(p2)
  2069. line_dict = make_line(path, p1, p2, clip)
  2070. if line_dict:
  2071. EDGES.append(line_to_edge(line_dict))
  2072. if add_boxes is not None: # add user-specified rectangles
  2073. assert isinstance(add_boxes, (tuple, list))
  2074. else:
  2075. add_boxes = []
  2076. for box in add_boxes:
  2077. r = pymupdf.Rect(box)
  2078. line_dict = make_line(path, r.tl, r.bl, clip)
  2079. if line_dict:
  2080. EDGES.append(line_to_edge(line_dict))
  2081. line_dict = make_line(path, r.bl, r.br, clip)
  2082. if line_dict:
  2083. EDGES.append(line_to_edge(line_dict))
  2084. line_dict = make_line(path, r.br, r.tr, clip)
  2085. if line_dict:
  2086. EDGES.append(line_to_edge(line_dict))
  2087. line_dict = make_line(path, r.tr, r.tl, clip)
  2088. if line_dict:
  2089. EDGES.append(line_to_edge(line_dict))
  2090. def page_rotation_set0(page):
  2091. """Nullify page rotation.
  2092. To correctly detect tables, page rotation must be zero.
  2093. This function performs the necessary adjustments and returns information
  2094. for reverting this changes.
  2095. """
  2096. mediabox = page.mediabox
  2097. rot = page.rotation # contains normalized rotation value
  2098. # need to derotate the page's content
  2099. mb = page.mediabox # current mediabox
  2100. if rot == 90:
  2101. # before derotation, shift content horizontally
  2102. mat0 = pymupdf.Matrix(1, 0, 0, 1, mb.y1 - mb.x1 - mb.x0 - mb.y0, 0)
  2103. elif rot == 270:
  2104. # before derotation, shift content vertically
  2105. mat0 = pymupdf.Matrix(1, 0, 0, 1, 0, mb.x1 - mb.y1 - mb.y0 - mb.x0)
  2106. else:
  2107. mat0 = pymupdf.Matrix(1, 0, 0, 1, -2 * mb.x0, -2 * mb.y0)
  2108. # prefix with derotation matrix
  2109. mat = mat0 * page.derotation_matrix
  2110. cmd = b"%g %g %g %g %g %g cm " % tuple(mat)
  2111. xref = pymupdf.TOOLS._insert_contents(page, cmd, 0)
  2112. # swap x- and y-coordinates
  2113. if rot in (90, 270):
  2114. x0, y0, x1, y1 = mb
  2115. mb.x0 = y0
  2116. mb.y0 = x0
  2117. mb.x1 = y1
  2118. mb.y1 = x1
  2119. page.set_mediabox(mb)
  2120. page.set_rotation(0)
  2121. # refresh the page to apply these changes
  2122. doc = page.parent
  2123. pno = page.number
  2124. page = doc[pno]
  2125. return page, xref, rot, mediabox
  2126. def page_rotation_reset(page, xref, rot, mediabox):
  2127. """Reset page rotation to original values.
  2128. To be used before we return tables."""
  2129. doc = page.parent # document of the page
  2130. doc.update_stream(xref, b" ") # remove de-rotation matrix
  2131. page.set_mediabox(mediabox) # set mediabox to old value
  2132. page.set_rotation(rot) # set rotation to old value
  2133. pno = page.number
  2134. page = doc[pno] # update page info
  2135. return page
  2136. def find_tables(
  2137. page,
  2138. clip=None,
  2139. vertical_strategy: str = "lines",
  2140. horizontal_strategy: str = "lines",
  2141. vertical_lines: list = None,
  2142. horizontal_lines: list = None,
  2143. snap_tolerance: float = DEFAULT_SNAP_TOLERANCE,
  2144. snap_x_tolerance: float = None,
  2145. snap_y_tolerance: float = None,
  2146. join_tolerance: float = DEFAULT_JOIN_TOLERANCE,
  2147. join_x_tolerance: float = None,
  2148. join_y_tolerance: float = None,
  2149. edge_min_length: float = 3,
  2150. min_words_vertical: float = DEFAULT_MIN_WORDS_VERTICAL,
  2151. min_words_horizontal: float = DEFAULT_MIN_WORDS_HORIZONTAL,
  2152. intersection_tolerance: float = 3,
  2153. intersection_x_tolerance: float = None,
  2154. intersection_y_tolerance: float = None,
  2155. text_tolerance=3,
  2156. text_x_tolerance=3,
  2157. text_y_tolerance=3,
  2158. strategy=None, # offer abbreviation
  2159. add_lines=None, # user-specified lines
  2160. add_boxes=None, # user-specified rectangles
  2161. paths=None, # accept vector graphics as parameter
  2162. ):
  2163. pymupdf._warn_layout_once()
  2164. global CHARS, EDGES
  2165. CHARS = []
  2166. EDGES = []
  2167. old_small = bool(pymupdf.TOOLS.set_small_glyph_heights()) # save old value
  2168. pymupdf.TOOLS.set_small_glyph_heights(True) # we need minimum bboxes
  2169. if page.rotation != 0:
  2170. page, old_xref, old_rot, old_mediabox = page_rotation_set0(page)
  2171. else:
  2172. old_xref, old_rot, old_mediabox = None, None, None
  2173. if snap_x_tolerance is None:
  2174. snap_x_tolerance = UNSET
  2175. if snap_y_tolerance is None:
  2176. snap_y_tolerance = UNSET
  2177. if join_x_tolerance is None:
  2178. join_x_tolerance = UNSET
  2179. if join_y_tolerance is None:
  2180. join_y_tolerance = UNSET
  2181. if intersection_x_tolerance is None:
  2182. intersection_x_tolerance = UNSET
  2183. if intersection_y_tolerance is None:
  2184. intersection_y_tolerance = UNSET
  2185. if strategy is not None:
  2186. vertical_strategy = strategy
  2187. horizontal_strategy = strategy
  2188. settings = {
  2189. "vertical_strategy": vertical_strategy,
  2190. "horizontal_strategy": horizontal_strategy,
  2191. "explicit_vertical_lines": vertical_lines,
  2192. "explicit_horizontal_lines": horizontal_lines,
  2193. "snap_tolerance": snap_tolerance,
  2194. "snap_x_tolerance": snap_x_tolerance,
  2195. "snap_y_tolerance": snap_y_tolerance,
  2196. "join_tolerance": join_tolerance,
  2197. "join_x_tolerance": join_x_tolerance,
  2198. "join_y_tolerance": join_y_tolerance,
  2199. "edge_min_length": edge_min_length,
  2200. "min_words_vertical": min_words_vertical,
  2201. "min_words_horizontal": min_words_horizontal,
  2202. "intersection_tolerance": intersection_tolerance,
  2203. "intersection_x_tolerance": intersection_x_tolerance,
  2204. "intersection_y_tolerance": intersection_y_tolerance,
  2205. "text_tolerance": text_tolerance,
  2206. "text_x_tolerance": text_x_tolerance,
  2207. "text_y_tolerance": text_y_tolerance,
  2208. }
  2209. old_quad_corrections = pymupdf.TOOLS.unset_quad_corrections()
  2210. try:
  2211. page.get_layout()
  2212. if page.layout_information:
  2213. pymupdf.TOOLS.unset_quad_corrections(True)
  2214. boxes = [
  2215. pymupdf.Rect(b[:4]) for b in page.layout_information if b[-1] == "table"
  2216. ]
  2217. else:
  2218. boxes = []
  2219. if boxes: # layout did find some tables
  2220. pass
  2221. elif page.layout_information is not None:
  2222. # layout was executed but found no tables
  2223. # make sure we exit quickly with an empty TableFinder
  2224. tbf = TableFinder(page)
  2225. return tbf
  2226. tset = TableSettings.resolve(settings=settings)
  2227. page.table_settings = tset
  2228. make_chars(page, clip=clip) # create character list of page
  2229. make_edges(
  2230. page,
  2231. clip=clip,
  2232. tset=tset,
  2233. paths=paths,
  2234. add_lines=add_lines,
  2235. add_boxes=add_boxes,
  2236. ) # create lines and curves
  2237. tbf = TableFinder(page, settings=tset)
  2238. if boxes:
  2239. # only keep Finder tables that match a layout box
  2240. tbf.tables = [
  2241. tab
  2242. for tab in tbf.tables
  2243. if any(_iou(tab.bbox, r) >= 0.6 for r in boxes)
  2244. ]
  2245. # build the complementary list of layout table boxes
  2246. my_boxes = [
  2247. r for r in boxes if all(_iou(r, tab.bbox) < 0.6 for tab in tbf.tables)
  2248. ]
  2249. if my_boxes:
  2250. word_rects = [pymupdf.Rect(w[:4]) for w in TEXTPAGE.extractWORDS()]
  2251. tp2 = page.get_textpage(flags=TABLE_DETECTOR_FLAGS)
  2252. for rect in my_boxes:
  2253. cells = make_table_from_bbox(tp2, word_rects, rect) # pylint: disable=E0606
  2254. tbf.tables.append(Table(page, cells))
  2255. except Exception as e:
  2256. pymupdf.message("find_tables: exception occurred: %s" % str(e))
  2257. return None
  2258. finally:
  2259. pymupdf.TOOLS.set_small_glyph_heights(old_small)
  2260. if old_xref is not None:
  2261. page = page_rotation_reset(page, old_xref, old_rot, old_mediabox)
  2262. pymupdf.TOOLS.unset_quad_corrections(old_quad_corrections)
  2263. return tbf